# hevProject.glue

# Delete the 'hev' project and then create it again, so that everything below
# builds the project from an empty state.
delete project hev

# Create the project with its description; --minVersion declares 0.1.151 as the
# minimum GLUE version for this project.
create project hev "A GLUE project for Hepatitis E virus (HEV)" --minVersion 0.1.151

# Run the schema extensions script. This happens before entering project mode.
# NOTE(review): presumably this declares the custom sequence fields used later
# in this script (e.g. full_genome, epa_genotype) -- confirm in the script itself.
run file glue/hevSchemaExtensions.glue
 
# Enter the 'hev' project; the indented commands that follow are issued inside it.
project hev

  # Run the project settings script.
  run file glue/hevProjectSettings.glue
  
  # Run the modules script.
  # NOTE(review): presumably this creates the modules invoked further down
  # (e.g. unconstrainedAlignmentImporter, hevGenbankXmlPopulator) -- confirm.
  run file glue/hevModules.glue
  
  # Import sequence sources from sources/ncbi-hev and sources/ncbi-outgroup.
  import source sources/ncbi-hev 
  import source sources/ncbi-outgroup 

  # Import the unconstrained alignment AL_UNCONSTRAINED from a FASTA file.
  # The alignment is based on Smith et al. 2016, but has also been modified to:
  # 1. Replace internal NN's with --'s for KJ873911 in order to make the blast fasta importer work properly.
  # 2. Add a row for sequence RR_L08816 -- this sequence is present in order to provide feature locations.
  # 3. Add a row for sequence OO_AM943647 -- this sequence is present in order to provide an outgroup.
  # 4. Add the missing subtype for JQ013791 (3ra), KJ013415 (3ra) and EU723513 (3f).
  

  module unconstrainedAlignmentImporter import AL_UNCONSTRAINED -f alignments/SmithAlignment_23_1_2016_modified.fna

  # Run the features script, then the references script.
  run file glue/hevFeatures.glue
  run file glue/hevReferences.glue

  # Invoke the 'populate' command of three modules, in this order.
  # NOTE(review): the gb_* fields queried later in this script (gb_length,
  # gb_lab_construct) are presumably filled in by hevGenbankXmlPopulator -- confirm
  # against the module configuration.
  module hevGenbankXmlPopulator populate
  # Missing data for some full genome NCBI sequences, read from a tabular file.
  module hevDataMissingImporter populate --fileName tabular/AhmedAlnamrotyFullGenomeMissing.txt
  module hevCollectionBoundsPopulator populate
  
  # Identify full genome sequences: set full_genome = true on every sequence whose
  # gb_length is at least 6500, then set full_genome = false on every sequence
  # where it is still null.
  multi-set field sequence -w "gb_length >= 6500" full_genome true
  multi-set field sequence -w "full_genome = null" full_genome false

  # If there's no evidence it's a lab construct, it isn't: for sequences in source
  # 'ncbi-hev' only, a null gb_lab_construct is set to false.
  multi-set field sequence -w "source.name = 'ncbi-hev' and gb_lab_construct = null" gb_lab_construct false

  # Update gb_country_short based on gb_country_iso.
  run file glue/hevCountryUpdate.glue

  # Run two genotyping populators, each restricted to sequences in source
  # 'ncbi-hev' and each reading its own tabular input file.
  module hevSmithGenotypingPopulator populate --whereClause "source.name = 'ncbi-hev'" -f tabular/Smith_full_length_genotypes.txt
  module hevEpaGenotypingResultsPopulator populate --whereClause "source.name = 'ncbi-hev'" -f tabular/ncbiHevGenotypes.txt
  # For 'ncbi-hev' sequences, copy epa_genotype into genotype and epa_subtype into subtype.
  # NOTE(review): if hevSmithGenotypingPopulator also writes genotype/subtype, these
  # copies would overwrite its values -- confirm which fields that module populates.
  multi-copy field sequence -w "source.name = 'ncbi-hev'" epa_genotype genotype
  multi-copy field sequence -w "source.name = 'ncbi-hev'" epa_subtype subtype

  # Run the alignment tree script.
  # NOTE(review): presumably this creates AL_MASTER and its child alignments,
  # which the feature-presence step below refers to -- confirm.
  run file glue/hevAlignmentTree.glue

  ## record feature presence
  # Enter the hevFeaturePresenceRecorder module, record feature presence for
  # AL_MASTER (with --recursive), limited to members whose sequence is in source
  # 'ncbi-hev', for feature whole_genome (with --descendentFeatures), then exit
  # back to project mode.
  module hevFeaturePresenceRecorder 
    record feature-presence AL_MASTER --recursive --whereClause "sequence.source.name = 'ncbi-hev'" --featureName whole_genome --descendentFeatures
    exit
    
  # Run the phylogeny import script.
  run file glue/importPhylogeny.glue


  # Final step: run the project-mode 'validate' command once everything above has been loaded.
  validate