Merge pull request #94 from sanger-tol/local_testing

Local testing
sanger-tol · May 18, 2023 · 204d303 · 204d303
2 parents e047c5e + 3071064
commit 204d303
Show file tree

Hide file tree

Showing 32 changed files with 461 additions and 147 deletions.
diff --git a/assets/local_testing/nxOsc-2023-05-02.dp.TEST.md b/assets/local_testing/nxOsc-2023-05-02.dp.TEST.md
@@ -0,0 +1,31 @@
+e769c449778489095a023896d05b87fa cds/CaenorhabditisElegans.WBcel235_cds.bigBed
+29f4bb4aa841e754e6ad90a95c51a8ac cds/Gae_host.Gae_cds.bigBed
+55e02bdabcbd4c03413d42026ac9e34a custom/software_versions.yml
+d41d8cd98f00b204e9800998ecf8427e gap/Oscheius_DF5033_gaplen.bed
+efa3906048c52a26a3b762142b138df2 gen/CaenorhabditisElegans.WBcel235_cdna.bigBed
+6a1f75afdc99390c150a9abe204e856b generate/my.genome
+ab841e49f59ff1dd51ed87191c2d7562 gen/Gae_host.Gae_cdna.bigBed
+8b277d209db8bf97c46f118562c4b9b5 gen/OscheiusTipulae.ASM1342590v1_cdna.bigBed
+1d1846bbab542500504b19bfc56cb9b2 insilico/BSPQI.bigBed
+008e29071b2574e2ed50a2887f4a7fc5 insilico/BSSSI.bigBed
+5f58843218b373c5addd38bc91e0d74d insilico/DLE1.bigBed
+08d932ddcb01866d9cfa76dbcaf8c5f5 longread/Oscheius_DF5033.bigWig
+36e4493afcd46a6c89d762fee08b2aa8 longread/Oscheius_DF5033_halfdepth.bed
+7bd5f463e6cd75e876f648dce93411fc longread/Oscheius_DF5033_maxdepth.bed
+82d251d88ee7d9bdbb29b68d3136b7ea longread/Oscheius_DF5033_zerodepth.bed
+cf6a4dc883979ac9cafd75382aa16bdc pep/CaenorhabditisElegans.WBcel235_pep.gff.gz
+84c1ad1989c7e9bcf13258b2774f4a25 pep/CaenorhabditisElegans.WBcel235_pep.gff.gz.tbi
+c2cccc5ab38b0e6b4e12fea2c1151569 pep/Gae_host.Gae_pep.gff.gz
+6a6522a6176761172a6313df9fc5b210 pep/Gae_host.Gae_pep.gff.gz.tbi
+e012da1d0c2ea40171785ead8a294289 punchlist/CaenorhabditisElegans.WBcel235_cdna_punchlist.bed
+d9da11fc3f6170a1c37c38765718ab47 punchlist/CaenorhabditisElegans.WBcel235_cds_punchlist.bed
+31d4e0cec6ef4ec92d51336393a923be punchlist/CaenorhabditisElegans.WBcel235_rna_punchlist.bed
+1ae4cbf700ff5b6d02c96631351f7eb8 punchlist/Gae_host.Gae_cdna_punchlist.bed
+50f76662114c8a77e8604a5a539e1e9c punchlist/Gae_host.Gae_cds_punchlist.bed
+c269f93c3a43697116b5aa75314e5e07 punchlist/Gae_host.Gae_rna_punchlist.bed
+e5fed140728b0f0d088d983a34868d8d punchlist/OscheiusTipulae.ASM1342590v1_cdna_punchlist.bed
+779ad07ceefaca4657090c9f0322ddfd repeat/Oscheius_DF5033.bigWig
+9d2cca3997c9a60f66516af739eb3719 repeat/Oscheius_DF5033_renamed.bed
+bb92039394cc0f2e9e6809e78be4bc9e rna/CaenorhabditisElegans.WBcel235_rna.bigBed
+4254dcb32d0aed160e03d3f6c02cf636 rna/Gae_host.Gae_rna.bigBed
+b2d9bea322639d2b0954a0ccc7eed800 selfcomp/Oscheius_DF5033_selfcomp.bigBed
diff --git a/assets/local_testing/nxOscDF5033.yaml b/assets/local_testing/nxOscDF5033.yaml
@@ -0,0 +1,27 @@
+assembly:
+  sizeClass: S # S if {genome <= 4Gb} else L
+  level: scaffold
+  sample_id: Oscheius_DF5033
+  latin_name: to_provide_taxonomic_rank
+  classT: nematode
+  asmVersion: Oscheius_DF5033_1
+  dbVersion: "1"
+  gevalType: DTOL
+reference_file: /lustre/scratch123/tol/resources/treeval/nextflow_test_data/Oscheius_DF5033/assembly/draft/DF5033.hifiasm.noTelos.20211120/DF5033.noTelos.hifiasm.purged.noCont.noMito.fasta
+assem_reads:
+  pacbio: /lustre/scratch123/tol/resources/treeval/nextflow_test_data/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/fasta/
+  hic: path
+  supplementary: path
+alignment:
+  data_dir: /lustre/scratch123/tol/resources/treeval/gene_alignment_data/
+  common_name: "" # For future implementation (adding bee, wasp, ant etc)
+  geneset: "OscheiusTipulae.ASM1342590v1,CaenorhabditisElegans.WBcel235,Gae_host.Gae"
+  #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv"
+self_comp:
+  motif_len: 0
+  mummer_chunk: 10
+synteny:
+  synteny_genome_path: /lustre/scratch123/tol/resources/treeval/synteny/
+outdir: "NEEDS TESTING"
+intron:
+  size: "50k"
diff --git a/assets/local_testing/nxOscSUBSET.yaml b/assets/local_testing/nxOscSUBSET.yaml
@@ -0,0 +1,27 @@
+assembly:
+  sizeClass: S # S if {genome <= 4Gb} else L
+  level: scaffold
+  sample_id: OscheiusSUBSET
+  latin_name: to_provide_taxonomic_rank
+  classT: nematode
+  asmVersion: OscheiusSUBSET_1
+  dbVersion: "1"
+  gevalType: DTOL
+reference_file: /lustre/scratch123/tol/resources/treeval/nextflow_test_data/Oscheius_SUBSET/assembly/draft/SUBSET_genome/Oscheius_SUBSET.fasta
+assem_reads:
+  pacbio: /lustre/scratch123/tol/resources/treeval/nextflow_test_data/Oscheius_SUBSET/genomic_data/pacbio/
+  hic: path
+  supplementary: path
+alignment:
+  data_dir: /lustre/scratch123/tol/resources/treeval/nextflow_test_data/Oscheius_SUBSET/gene_set/
+  common_name: "" # For future implementation (adding bee, wasp, ant etc)
+  geneset: "Gae_host.Gae"
+  #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv"
+self_comp:
+  motif_len: 0
+  mummer_chunk: 4
+synteny:
+  synteny_genome_path: /lustre/scratch123/tol/resources/treeval/synteny/
+outdir: "NEEDS TESTING"
+intron:
+  size: "50k"
diff --git a/bin/cut_size.sh b/bin/cut_size.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+head -n 1 $1 | cut -f2
diff --git a/bin/gff_to_bed.sh b/bin/gff_to_bed.sh
@@ -14,5 +14,5 @@ if [ $1 == '-v'];
 then
     echo "$version"
 else
-    grep '##PAF' $1 | sed 's/##PAF\t//g'|awk 'BEGIN{FS="\t";}{a[$1]++;if(a[$1]==2)print v[$1] ORS $0;if(a[$1]>2)print;v[$1]=$0;}' | awk '$(NF+1) = ($10/$11)*100'|awk '$(NF+1) = ($10/($2*3))*100'|awk -vOFS='\t' '{print $6,$8,$9,$1,$2,$10,$(NF-1),$NF}' > $2.bed
+    grep '##PAF' $1 | sed 's/##PAF\t//g'|awk 'BEGIN{FS="\t";}{a[$1]++;if(a[$1]==2)print v[$1] ORS $0;if(a[$1]>2)print;v[$1]=$0;}' | awk '$(NF+1) = ($10/$11)*100'|awk '$(NF+1) = ($10/($2*3))*100'|awk -vOFS='\t' '{print $6,$8,$9,$1,$2,$10,$(NF-1),$NF}' > $2
 fi
diff --git a/bin/paf_to_bed12.sh b/bin/paf_to_bed12.sh
@@ -14,5 +14,5 @@ if [ $1 == '-v'];
 then
     echo "$version"
 else
-    cat $1 | awk 'BEGIN{FS="\t";}{a[$1]++;if(a[$1]==2)print v[$1] ORS $0;if(a[$1]>2)print;v[$1]=$0;}' | awk '$(NF+1) = ($10/$11)*100' | awk '$(NF+1) = ($10/$2)*100' | awk -vOFS='\t' '{print $6,$8,$9,$1,$2,$10,$(NF-1),$NF}' > $2_punchlist.bed
+    cat $1 | awk 'BEGIN{FS="\t";}{a[$1]++;if(a[$1]==2)print v[$1] ORS $0;if(a[$1]>2)print;v[$1]=$0;}' | awk '$(NF+1) = ($10/$11)*100' | awk '$(NF+1) = ($10/$2)*100' | awk -vOFS='\t' '{print $6,$8,$9,$1,$2,$10,$(NF-1),$NF}' > $2
 fi
diff --git a/conf/base.config b/conf/base.config
@@ -18,14 +18,14 @@ process {
     maxErrors     = '-1'
 
     withName:SAMTOOLS_MERGE {
-        memory = { check_max( 50.GB     * task.attempt, 'memory') }
+        memory = { check_max( 50.GB     * task.attempt, 'memory'    ) }
     }
 
     // RESOURCES: MEMORY INTENSIVE STEPS, SOFTWARE TO BE UPDATED TO COMBAT THIS
     withName: '.*:.*:SELFCOMP:(SELFCOMP_ALIGNMENTBLOCKS|SELFCOMP_MAPIDS|SELFCOMP_MUMMER2BED|SELFCOMP_SPLITFASTA|BEDTOOLS_MERGE)' {
-        cpus    = { check_max( 10       * task.attempt, 'cpus'  ) }
-        memory  = { check_max( 100.GB   * task.attempt, 'memory') }
-        time    = { check_max( 8.h      * task.attempt, 'time'  ) }
+        cpus    = { check_max( 10       * task.attempt, 'cpus'      ) }
+        memory  = { check_max( 120.GB   * task.attempt, 'memory'    ) }
+        time    = { check_max( 12.h      * task.attempt, 'time'     ) }
     }
 
     // RESOURCES: CHANGES TO FREQUENT FAILURES BELOW THIS MEM POINT

diff --git a/conf/modules.config b/conf/modules.config
@@ -78,11 +78,11 @@ process {
     }
 
     withName: '.*:.*:.*:(GEN_ALIGNMENTS|RNA_ALIGNMENTS|CDS_ALIGNMENTS):UCSC_BEDTOBIGBED' {
-        ext.prefix = { "${meta.id}_${meta.type}" }
+        ext.prefix      = { "${meta.id}_${meta.type}" }
     }
 
     withName: '.*:.*:.*:PEP_ALIGNMENTS:BEDTOOLS_SORT' {
-        ext.prefix = { "${meta.id}_prot" }
+        ext.prefix      = { "${meta.id}_prot" }
     }
 
     withName: '.*:.*:INSILICO_DIGEST:UCSC_BEDTOBIGBED' {
@@ -95,6 +95,14 @@ process {
         ext.prefix  = { "${meta.id}_selfcomp" }
     }
 
+    withName: '.*:.*:REPEAT_DENSITY:UCSC_BEDGRAPHTOBIGWIG' {
+        ext.prefix  = { "${meta.id}_repeat_density" }
+    }
+
+    withName: '.*:.*:GAP_FINDER:TABIX_BGZIPTABIX' {
+        ext.prefix    = { "gap_${meta.id}" }
+    }
+
     withName: '.*:.*:SYNTENY:MINIMAP2_ALIGN' {
         ext.args    = '-t 8 -x asm10'
         ext.prefix  = { "${meta.id}_synteny_${reference.getName().tokenize('.')[0]}" }
@@ -130,16 +138,20 @@ process {
 
     withName: '.*:.*:LONGREAD_COVERAGE:BEDTOOLS_MERGE_MAX' {
         ext.args    = "-d 50"
-        ext.prefix  = { "${meta.id}_maxdepth" }
+        ext.prefix  = { "maxdepth" }
     }
 
     withName: '.*:.*:LONGREAD_COVERAGE:BEDTOOLS_MERGE_MIN' {
         ext.args    = "-d 50"
-        ext.prefix  = { "${meta.id}_zerodepth" }
+        ext.prefix  = { "zerodepth" }
     }
 
     withName: '.*:.*:LONGREAD_COVERAGE:GNU_SORT' {
-        ext.args  = "-k1,1 -k2,2n"
+        ext.args    = "-k1,1 -k2,2n"
         ext.prefix  = { "${meta.id}_sorted" }
     }
+
+    withName: '.*:.*:LONGREAD_COVERAGE:UCSC_BEDGRAPHTOBIGWIG' {
+        ext.prefix  = 'coverage'
+    }
 }
diff --git a/conf/test.config b/conf/test.config
@@ -1,24 +1,25 @@
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    Nextflow config file for running full-size tests
+    Nextflow config file for running representative-size tests
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    Defines input files and everything required to run a full size pipeline test.
+    Defines input files and everything required to run a representative size pipeline test.
 
     Use as follows:
-        nextflow run nf-core/treeval -profile test_full,<docker/singularity> --outdir <OUTDIR>
+        nextflow run sanger-tol/treeval -profile test,singularity -entry FULL
+
+    On LSF / tol farm:
+        bsub -Is -tty -e error -o out -n 2 -q oversubscribed -M4000 -R'select[mem>4000] rusage[mem=4000] span[hosts=1]' 'nextflow run main.nf -profile test,singularity,sanger -entry FULL'
 
 ----------------------------------------------------------------------------------------
 */
 
 params {
-    config_profile_name        = 'Full test profile'
-    config_profile_description = 'Full test dataset to check pipeline function'
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal local test dataset to check pipeline function'
 
-    // Input data for full size test
-    // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA)
-    // TODO nf-core: Give any required params for the test so that command line flags are not needed
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv'
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
 
-    // Genome references
-    genome = 'R64-1-1'
+    input = 'assets/local_testing/nxOscSUBSET.yaml'
 }
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -5,20 +5,17 @@
     Defines input files and everything required to run a full size pipeline test.
 
     Use as follows:
-        nextflow run nf-core/treeval -profile test_full,<docker/singularity> --outdir <OUTDIR>
+        nextflow run sanger-tol/treeval -profile test_full,singularity,sanger -entry FULL
+
+    On LSF / tol farm:
+        bsub -Is -tty -e error -o out -n 2 -q oversubscribed -M4000 -R'select[mem>4000] rusage[mem=4000] span[hosts=1]' 'nextflow run main.nf -profile test_full,singularity,sanger -entry FULL'
 
 ----------------------------------------------------------------------------------------
 */
 
 params {
-    config_profile_name        = 'Full test profile'
-    config_profile_description = 'Full test dataset to check pipeline function'
-
-    // Input data for full size test
-    // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA)
-    // TODO nf-core: Give any required params for the test so that command line flags are not needed
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv'
+    config_profile_name        = 'Full local test profile'
+    config_profile_description = 'Full test dataset to check pipeline function, using a current full local dataset'
 
-    // Genome references
-    genome = 'R64-1-1'
+    input = 'assets/local_testing/nxOscDF5033.yaml'
 }
diff --git a/conf/test_genealignment.config b/conf/test_genealignment.config
diff --git a/conf/test_selfcomp.config b/conf/test_selfcomp.config
diff --git a/conf/test_synteny.config b/conf/test_synteny.config
diff --git a/docs/output.md b/docs/output.md
@@ -58,7 +58,7 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
 <details markdown="1">
 <summary>Output files</summary>
 
-- `pipeline_info/`
+- `treeval_info/`
   - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
   - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
   - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.

diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy
@@ -135,7 +135,7 @@ class NfcoreTemplate {
         }
 
         // Write summary e-mail HTML to a file
-        def output_d = new File("${params.outdir}/pipeline_info/")
+        def output_d = new File("${params.outdir}/treeval_info/")
         if (!output_d.exists()) {
             output_d.mkdirs()
         }

diff --git a/main.nf b/main.nf
@@ -32,13 +32,18 @@ WorkflowMain.initialise(workflow, params, log)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-include { TREEVAL } from './workflows/treeval'
+include { TREEVAL       } from './workflows/treeval'
+include { TREEVAL_RAPID } from './workflows/treeval_rapid'
 
 // WORKFLOW: Run main nf-core/treeval analysis pipeline
 workflow NFCORE_TREEVAL {
     TREEVAL ()
 }
 
+workflow NFCORE_TREEVAL_RAPID {
+    TREEVAL_RAPID ()
+}
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN ALL WORKFLOWS
@@ -49,10 +54,14 @@ workflow NFCORE_TREEVAL {
 // WORKFLOW: Execute a single named workflow for the pipeline
 // See: https://github.com/nf-core/rnaseq/issues/619
 //
-workflow {
+workflow FULL {
     NFCORE_TREEVAL ()
 }
 
+workflow RAPID {
+    NFCORE_TREEVAL_RAPID ()
+}
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     THE END