From 9b40147960b395bfb22d8855a44d184945181006 Mon Sep 17 00:00:00 2001
From: Anne Marie Noronha <anoronh4@users.noreply.github.com>
Date: Mon, 6 Mar 2023 18:53:59 -0500
Subject: [PATCH] Make merging processes obsolete by replacing nf-core
 star/align with an enhanced local version of star/align

---
 conf/modules.config                           | 127 +++++++++---------
 modules.json                                  |   5 -
 modules/{nf-core => local}/star/align/main.nf |  18 ++-
 .../{nf-core => local}/star/align/meta.yml    |   1 +
 subworkflows/local/align_reads.nf             |  31 +----
 subworkflows/local/fusion.nf                  |  14 +-
 subworkflows/local/merge_reads.nf             |  34 +----
 subworkflows/local/preprocess_reads.nf        |  19 ++-
 workflows/forte.nf                            |   2 +-
 9 files changed, 105 insertions(+), 146 deletions(-)
 rename modules/{nf-core => local}/star/align/main.nf (87%)
 rename modules/{nf-core => local}/star/align/meta.yml (99%)

diff --git a/conf/modules.config b/conf/modules.config
index 3b54f3a..5642e3f 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -150,25 +150,28 @@ process {
     }
 
     withName: STAR_FOR_ARRIBA {
-        ext.args = [
-            '--readFilesCommand zcat',
-            '--outSAMtype BAM Unsorted',
-            '--outSAMunmapped Within',
-            '--outBAMcompression 0',
-            '--outFilterMultimapNmax 50',
-            '--peOverlapNbasesMin 10',
-            '--alignSplicedMateMapLminOverLmate 0.5',
-            '--alignSJstitchMismatchNmax 5',
-            '-1 5 5',
-            '--chimSegmentMin 10',
-            '--chimOutType WithinBAM HardClip',
-            '--chimJunctionOverhangMin 10',
-            '--chimScoreDropMax 30',
-            '--chimScoreJunctionNonGTAG 0',
-            '--chimScoreSeparation 1',
-            '--chimSegmentReadGapMax 3',
-            '--chimMultimapNmax 50'
-        ].join(' ').trim()
+        ext.args = {
+            [
+                "--outSAMattrRGline ID:${meta.sample} SM:${meta.sample} PL:Illumina",
+                '--readFilesCommand zcat',
+                '--outSAMtype BAM Unsorted',
+                '--outSAMunmapped Within',
+                '--outBAMcompression 0',
+                '--outFilterMultimapNmax 50',
+                '--peOverlapNbasesMin 10',
+                '--alignSplicedMateMapLminOverLmate 0.5',
+                '--alignSJstitchMismatchNmax 5',
+                '-1 5 5',
+                '--chimSegmentMin 10',
+                '--chimOutType WithinBAM HardClip',
+                '--chimJunctionOverhangMin 10',
+                '--chimScoreDropMax 30',
+                '--chimScoreJunctionNonGTAG 0',
+                '--chimScoreSeparation 1',
+                '--chimSegmentReadGapMax 3',
+                '--chimMultimapNmax 50'
+            ].join(' ').trim()
+        }
         publishDir = [
             [
                 path: { "${params.outdir}/analysis/${meta.id}/arriba/STAR/log" },
@@ -186,30 +189,33 @@ process {
     }
 
     withName: STAR_FOR_STARFUSION {
-        ext.args = [
-            '--readFilesCommand zcat',
-            '--outSAMtype BAM Unsorted',
-            '--outReadsUnmapped None',
-            '--twopassMode Basic',
-            '--outSAMstrandField intronMotif',
-            '--outSAMunmapped Within',
-            '--chimSegmentMin 12',
-            '--chimJunctionOverhangMin 8',
-            '--chimOutJunctionFormat 1',
-            '--alignSJDBoverhangMin 10',
-            '--alignMatesGapMax 100000',
-            '--alignIntronMax 100000',
-            '--alignSJstitchMismatchNmax 5 -1 5 5',
-            '--chimMultimapScoreRange 3',
-            '--chimScoreJunctionNonGTAG -4',
-            '--chimMultimapNmax 20',
-            '--chimNonchimScoreDropMin 10',
-            '--peOverlapNbasesMin 12',
-            '--peOverlapMMp 0.1',
-            '--alignInsertionFlush Right',
-            '--alignSplicedMateMapLminOverLmate 0',
-            '--alignSplicedMateMapLmin 30'
-        ].join(' ').trim()
+        ext.args = {
+            [
+                "--outSAMattrRGline ID:${meta.sample} SM:${meta.sample} PL:Illumina",
+                '--readFilesCommand zcat',
+                '--outSAMtype None',
+                '--outReadsUnmapped None',
+                '--twopassMode Basic',
+                '--outSAMstrandField intronMotif',
+                '--outSAMunmapped Within',
+                '--chimSegmentMin 12',
+                '--chimJunctionOverhangMin 8',
+                '--chimOutJunctionFormat 1',
+                '--alignSJDBoverhangMin 10',
+                '--alignMatesGapMax 100000',
+                '--alignIntronMax 100000',
+                '--alignSJstitchMismatchNmax 5 -1 5 5',
+                '--chimMultimapScoreRange 3',
+                '--chimScoreJunctionNonGTAG -4',
+                '--chimMultimapNmax 20',
+                '--chimNonchimScoreDropMin 10',
+                '--peOverlapNbasesMin 12',
+                '--peOverlapMMp 0.1',
+                '--alignInsertionFlush Right',
+                '--alignSplicedMateMapLminOverLmate 0',
+                '--alignSplicedMateMapLmin 30'
+            ].join(' ').trim()
+        }
         publishDir = [
             [
                 path: { "${params.outdir}/analysis/${meta.id}/starfusion/STAR/log" },
@@ -227,19 +233,22 @@ process {
 
     withName: STAR_ALIGN {
         ext.prefix = { "$meta.sample" }
-        ext.args = [
-            '--quantMode GeneCounts',
-            '--twopassMode Basic',
-            '--outSAMtype BAM SortedByCoordinate',
-            '--readFilesCommand zcat',
-            '--runRNGseed 0',
-            '--outFilterMultimapNmax 20',
-            '--alignSJDBoverhangMin 1',
-            '--outSAMattributes NH HI AS NM MD',
-            '--quantTranscriptomeBan Singleend',
-            '--outSAMstrandField intronMotif',
-            params.save_unaligned ? '--outReadsUnmapped Fastx' : ''
-        ].join(' ').trim()
+        ext.args = {
+            [
+                "--outSAMattrRGline ${meta.read_group.collect{"ID:${it} SM:${meta.sample} PL:Illumina"}.join(" , ")}",
+                '--quantMode GeneCounts',
+                '--twopassMode Basic',
+                '--outSAMtype BAM SortedByCoordinate',
+                '--readFilesCommand zcat',
+                '--runRNGseed 0',
+                '--outFilterMultimapNmax 20',
+                '--alignSJDBoverhangMin 1',
+                '--outSAMattributes NH HI AS NM MD',
+                '--quantTranscriptomeBan Singleend',
+                '--outSAMstrandField intronMotif',
+                params.save_unaligned ? '--outReadsUnmapped Fastx' : ''
+            ].join(' ').trim()
+        }
         publishDir = [
             [
                 path: { "${params.outdir}/analysis/${meta.sample}/STAR/log" },
@@ -247,11 +256,7 @@ process {
                 pattern: '*.{out,tab}'
             ],
             [
-                path: {
-                    meta.fq_num.toInteger() > 1 ?
-                    "${params.outdir}/analysis/${meta.sample}/STAR/${meta.id}" :
-                    "${params.outdir}/analysis/${meta.sample}/STAR/"
-                },
+                path: {"${params.outdir}/analysis/${meta.sample}/STAR/"},
                 mode: params.publish_dir_mode,
                 pattern: '*.bam',
                 saveAs: { filename -> meta.fq_num.toInteger() > 1 ? null : filename }
diff --git a/modules.json b/modules.json
index 815a7c1..189fd59 100644
--- a/modules.json
+++ b/modules.json
@@ -75,11 +75,6 @@
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
                         "installed_by": ["modules"]
                     },
-                    "star/align": {
-                        "branch": "master",
-                        "git_sha": "cf5b9c30a2adacc581793afb79fae5f5b50bed01",
-                        "installed_by": ["modules"]
-                    },
                     "star/genomegenerate": {
                         "branch": "master",
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
diff --git a/modules/nf-core/star/align/main.nf b/modules/local/star/align/main.nf
similarity index 87%
rename from modules/nf-core/star/align/main.nf
rename to modules/local/star/align/main.nf
index 8b0f9d8..a210ab0 100644
--- a/modules/nf-core/star/align/main.nf
+++ b/modules/local/star/align/main.nf
@@ -11,17 +11,14 @@ process STAR_ALIGN {
     tuple val(meta), path(reads)
     path index
     path gtf
-    val star_ignore_sjdbgtf
-    val seq_platform
-    val seq_center
 
     output:
-    tuple val(meta), path('*d.out.bam')       , emit: bam
     tuple val(meta), path('*Log.final.out')   , emit: log_final
     tuple val(meta), path('*Log.out')         , emit: log_out
     tuple val(meta), path('*Log.progress.out'), emit: log_progress
     path  "versions.yml"                      , emit: versions
 
+    tuple val(meta), path('*d.out.bam')              , optional:true, emit: bam
     tuple val(meta), path('*sortedByCoord.out.bam')  , optional:true, emit: bam_sorted
     tuple val(meta), path('*toTranscriptome.out.bam'), optional:true, emit: bam_transcript
     tuple val(meta), path('*Aligned.unsort.out.bam') , optional:true, emit: bam_unsorted
@@ -36,20 +33,21 @@ process STAR_ALIGN {
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
-    def ignore_gtf      = star_ignore_sjdbgtf ? '' : "--sjdbGTFfile $gtf"
-    def seq_platform    = seq_platform ? "'PL:$seq_platform'" : ""
-    def seq_center      = seq_center ? "--outSAMattrRGline ID:$prefix 'CN:$seq_center' 'SM:$prefix' $seq_platform " : "--outSAMattrRGline ID:$prefix 'SM:$prefix' $seq_platform "
+    def reads1 = [], reads2 = [] // mate-1 / mate-2 files; each list is comma-joined for --readFilesIn
+    meta.single_end ? [reads].flatten().each{reads1 << it} : reads.eachWithIndex{ v, ix -> ( ix & 1 ? reads2 : reads1) << v }
+    def attrRG = args.contains("--outSAMattrRGline") ? '' : "--outSAMattrRGline ID:$prefix 'SM:$prefix'"
     def out_sam_type    = (args.contains('--outSAMtype')) ? '' : '--outSAMtype BAM Unsorted'
+    def gtf_param = gtf ? "--sjdbGTFfile $gtf" : ''
     def mv_unsorted_bam = (args.contains('--outSAMtype BAM Unsorted SortedByCoordinate')) ? "mv ${prefix}.Aligned.out.bam ${prefix}.Aligned.unsort.out.bam" : ''
     """
     STAR \\
         --genomeDir $index \\
-        --readFilesIn $reads  \\
+        --readFilesIn ${reads1.join(",")} ${reads2.join(",")} \\
         --runThreadN $task.cpus \\
         --outFileNamePrefix $prefix. \\
         $out_sam_type \\
-        $ignore_gtf \\
-        $seq_center \\
+        $attrRG \\
+        $gtf_param \\
         $args
 
     $mv_unsorted_bam
diff --git a/modules/nf-core/star/align/meta.yml b/modules/local/star/align/meta.yml
similarity index 99%
rename from modules/nf-core/star/align/meta.yml
rename to modules/local/star/align/meta.yml
index 7ee10f1..17b77db 100644
--- a/modules/nf-core/star/align/meta.yml
+++ b/modules/local/star/align/meta.yml
@@ -79,3 +79,4 @@ authors:
   - "@kevinmenden"
   - "@drpatelh"
   - "@praveenraj2018"
+  - "@anoronh4"
diff --git a/subworkflows/local/align_reads.nf b/subworkflows/local/align_reads.nf
index 13a00a4..6874881 100644
--- a/subworkflows/local/align_reads.nf
+++ b/subworkflows/local/align_reads.nf
@@ -1,5 +1,4 @@
-include { STAR_ALIGN       } from '../../modules/nf-core/star/align/main'
-include { SAMTOOLS_MERGE   } from '../../modules/nf-core/samtools/merge/main'
+include { STAR_ALIGN       } from '../../modules/local/star/align/main'
 include { UMITOOLS_DEDUP   } from '../../modules/nf-core/umitools/dedup/main'
 include {
     SAMTOOLS_INDEX;
@@ -20,34 +19,11 @@ workflow ALIGN_READS {
     STAR_ALIGN(
         reads,
         star_index,
-        gtf,
-        false,
-        false,
-        false
+        gtf
     )
     ch_versions = ch_versions.mix(STAR_ALIGN.out.versions.first())
 
-    star_align_bam = STAR_ALIGN.out.bam
-        .map{ meta, bam ->
-            def meta_clone = meta.clone().findAll { !["read_group","fastq_pair_id"].contains(it.key) }
-            meta_clone.id = meta.sample
-            [meta_clone, bam]
-        }.branch { meta, bam ->
-            needs_merge: meta.fq_num > 1
-            skips_merge: meta.fq_num == 1
-        }
-
-    SAMTOOLS_MERGE(
-        star_align_bam.needs_merge
-            .map{ meta, bam -> [groupKey(meta, meta.fq_num),bam] }
-            .groupTuple(),
-        [],
-        []
-    )
-    ch_versions = ch_versions.mix(SAMTOOLS_MERGE.out.versions.first())
-
-    merged_bam = star_align_bam.skips_merge
-        .mix(SAMTOOLS_MERGE.out.bam)
+    merged_bam = STAR_ALIGN.out.bam
 
     SAMTOOLS_INDEX(merged_bam)
     ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first())
@@ -69,7 +45,6 @@ workflow ALIGN_READS {
                 .filter{ meta, bam -> ! meta.has_umi }
         )
 
-
     emit:
     bam             = dedup_bam
     bam_withdup     = merged_bam
diff --git a/subworkflows/local/fusion.nf b/subworkflows/local/fusion.nf
index e801507..90fdf1c 100644
--- a/subworkflows/local/fusion.nf
+++ b/subworkflows/local/fusion.nf
@@ -1,6 +1,6 @@
-include { STAR_ALIGN as STAR_FOR_ARRIBA     } from '../../modules/nf-core/star/align/main'
+include { STAR_ALIGN as STAR_FOR_ARRIBA     } from '../../modules/local/star/align/main'
 include { ARRIBA                            } from '../../modules/nf-core/arriba/main'
-include { STAR_ALIGN as STAR_FOR_STARFUSION } from '../../modules/nf-core/star/align/main'
+include { STAR_ALIGN as STAR_FOR_STARFUSION } from '../../modules/local/star/align/main'
 include { STARFUSION                        } from '../../modules/local/starfusion/detect/main'
 include { FUSIONCATCHER_DETECT              } from '../../modules/local/fusioncatcher/detect/main'
 include { FUSIONREPORT                      } from '../../modules/local/fusionreport/run/main'
@@ -24,10 +24,7 @@ workflow FUSION {
     STAR_FOR_ARRIBA(
         reads,
         star_index,
-        gtf,
-        false,
-        false,
-        false
+        gtf
     )
     ch_versions = ch_versions.mix(STAR_FOR_ARRIBA.out.versions.first())
 
@@ -47,10 +44,7 @@ workflow FUSION {
         reads,
         // use the star index in the starfusion reference to ensure compatibility
         starfusion_ref.map{ file( it + "/ref_genome.fa.star.idx")},
-        gtf,
-        false,
-        false,
-        false
+        gtf
     )
     ch_versions = ch_versions.mix(STAR_FOR_STARFUSION.out.versions.first())
 
diff --git a/subworkflows/local/merge_reads.nf b/subworkflows/local/merge_reads.nf
index fef4343..53e2ccf 100644
--- a/subworkflows/local/merge_reads.nf
+++ b/subworkflows/local/merge_reads.nf
@@ -1,5 +1,4 @@
 include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/samtools/bam2fq/main'
-include { CAT_FASTQ       } from '../../modules/nf-core/cat/fastq/main'
 
 workflow MERGE_READS {
     take:
@@ -10,39 +9,18 @@ workflow MERGE_READS {
     ch_versions = Channel.empty()
 
     reads_ch = reads
-        .map{ meta, reads ->
-            def meta_clone = meta.clone().findAll { !["read_group","fastq_pair_id"].contains(it.key) }
-            meta_clone.id = meta.sample
-            [meta_clone, reads]
-        }.branch { meta, reads ->
-            needs_merge: ( meta.fq_num > 1 ) && ( ! ( meta.has_umi && params.dedup_umi_for_fusions ) )
-            needs_bam2fq: meta.has_umi && params.dedup_umi_for_fusions
-            skips_merge: true
-        }
-
+        .filter{ meta, reads -> ! ( meta.has_umi && params.dedup_umi_for_fusions) } // complement of the bam_ch filter
+
     bam_ch = bam
-        .branch { meta, bam ->
-            needs_bam2fq: meta.has_umi && params.dedup_umi_for_fusions
-            skips_bam2fq: true
-        }
-
-
-    CAT_FASTQ(
-        reads_ch.needs_merge
-            .map{ meta, reads -> [ groupKey(meta, meta.fq_num), reads ] }
-            .groupTuple()
-            .map{ meta, reads -> [ meta, reads.flatten() ] }
-    )
-    ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first())
+        .filter{ meta, bam -> meta.has_umi && params.dedup_umi_for_fusions }
 
     SAMTOOLS_BAM2FQ(
-        bam_ch.needs_bam2fq,
+        bam_ch, // already filtered to has_umi && params.dedup_umi_for_fusions
         true
     )
     ch_versions = ch_versions.mix(SAMTOOLS_BAM2FQ.out.versions.first())
 
-    merged_reads = reads_ch.skips_merge
-        .mix(CAT_FASTQ.out.reads)
+    merged_reads = reads_ch
         .mix(
             SAMTOOLS_BAM2FQ.out.reads
                 .map{ meta, reads ->
@@ -51,6 +29,6 @@ workflow MERGE_READS {
         )
 
     emit:
-    merged_reads = merged_reads
+    dedup_reads  = merged_reads
     ch_versions  = ch_versions
 }
diff --git a/subworkflows/local/preprocess_reads.nf b/subworkflows/local/preprocess_reads.nf
index 39db326..7fb8d53 100644
--- a/subworkflows/local/preprocess_reads.nf
+++ b/subworkflows/local/preprocess_reads.nf
@@ -18,7 +18,7 @@ workflow PREPROCESS_READS {
             def meta_clone = meta.clone()
             if (params.extract_fq_read_group) {
                 def rg_map = Utils.flowcellLaneFromFastq(reads[0])
-                meta_clone.read_group = "${meta.sample}@${rg_map["fcid"]}@${rg_map["lane"]}"
+                meta_clone.read_group = "${meta.sample}@${rg_map["fcid"]}@${rg_map["lane"]}@${meta.fastq_pair_id}"
                 meta_clone.id = meta_clone.read_group
             } else {
                 meta_clone.read_group = meta_clone.id
@@ -43,15 +43,28 @@ workflow PREPROCESS_READS {
         false,
         false
     )
+    ch_versions = ch_versions.mix(FASTP.out.versions.first())
     if (params.skip_trimming){
         trimmed_reads = extracted_reads
     } else {
         trimmed_reads = FASTP.out.reads
-        ch_versions = ch_versions.mix(FASTP.out.versions.first())
     }
 
+    trimmed_grouped_reads = trimmed_reads
+        .map{ meta, reads ->
+            def read_group = meta.read_group
+            def meta_clone = meta.clone().findAll { !["read_group","fastq_pair_id"].contains(it.key) } // drop per-fastq keys
+            meta_clone.id = meta.sample
+            [groupKey(meta_clone,meta.fq_num), reads, read_group] // group size = meta.fq_num
+        }.groupTuple()
+        .map{ meta, reads, read_group ->
+            meta = meta + [read_group:read_group] // re-attach the collected list of read groups
+            [meta, reads.flatten()]
+        }
+
     emit:
-    reads           = trimmed_reads
+    reads           = trimmed_grouped_reads
+    ungrouped_reads = trimmed_reads
     fastp_json      = FASTP.out.json
     ch_versions     = ch_versions
 }
diff --git a/workflows/forte.nf b/workflows/forte.nf
index 907e997..3950913 100644
--- a/workflows/forte.nf
+++ b/workflows/forte.nf
@@ -108,7 +108,7 @@ workflow FORTE {
     )
 
     FUSION(
-        MERGE_READS.out.merged_reads,
+        MERGE_READS.out.dedup_reads,
         PREPARE_REFERENCES.out.star_index,
         PREPARE_REFERENCES.out.gtf,
         PREPARE_REFERENCES.out.starfusion_ref,