From 9b40147960b395bfb22d8855a44d184945181006 Mon Sep 17 00:00:00 2001
From: Anne Marie Noronha
Date: Mon, 6 Mar 2023 18:53:59 -0500
Subject: [PATCH] made it so that merging processes were obsolete by replacing
 nf-core star/align with enhanced version of star/align

---
Review notes (ignored by `git am`, not part of the commit message):
- Line structure of this patch was restored; leading indentation follows the
  usual 4-space nf-core layout and should be checked against the target tree
  (apply with --ignore-whitespace if context does not match).
- Removed the stray `.view()` debug operator that terminated the
  trimmed_grouped_reads chain in subworkflows/local/preprocess_reads.nf.
- Normalised one whitespace-only added line in subworkflows/local/merge_reads.nf.
- NOTE(review): the STAR_ALIGN publishDir `saveAs` in conf/modules.config still
  returns null when meta.fq_num > 1. STAR_ALIGN now emits the per-sample BAM
  directly, so please confirm that BAM is still published for multi-FASTQ
  samples.

 conf/modules.config                           | 127 +++++++++---------
 modules.json                                  |   5 -
 modules/{nf-core => local}/star/align/main.nf |  18 ++-
 .../{nf-core => local}/star/align/meta.yml    |   1 +
 subworkflows/local/align_reads.nf             |  31 +----
 subworkflows/local/fusion.nf                  |  14 +-
 subworkflows/local/merge_reads.nf             |  34 +----
 subworkflows/local/preprocess_reads.nf        |  19 ++-
 workflows/forte.nf                            |   2 +-
 9 files changed, 105 insertions(+), 146 deletions(-)
 rename modules/{nf-core => local}/star/align/main.nf (87%)
 rename modules/{nf-core => local}/star/align/meta.yml (99%)

diff --git a/conf/modules.config b/conf/modules.config
index 3b54f3a..5642e3f 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -150,25 +150,28 @@ process {
     }
 
     withName: STAR_FOR_ARRIBA {
-        ext.args = [
-            '--readFilesCommand zcat',
-            '--outSAMtype BAM Unsorted',
-            '--outSAMunmapped Within',
-            '--outBAMcompression 0',
-            '--outFilterMultimapNmax 50',
-            '--peOverlapNbasesMin 10',
-            '--alignSplicedMateMapLminOverLmate 0.5',
-            '--alignSJstitchMismatchNmax 5',
-            '-1 5 5',
-            '--chimSegmentMin 10',
-            '--chimOutType WithinBAM HardClip',
-            '--chimJunctionOverhangMin 10',
-            '--chimScoreDropMax 30',
-            '--chimScoreJunctionNonGTAG 0',
-            '--chimScoreSeparation 1',
-            '--chimSegmentReadGapMax 3',
-            '--chimMultimapNmax 50'
-        ].join(' ').trim()
+        ext.args = {
+            [
+                "--outSAMattrRGline ID:${meta.sample} SM:${meta.sample} PL:Illumina",
+                '--readFilesCommand zcat',
+                '--outSAMtype BAM Unsorted',
+                '--outSAMunmapped Within',
+                '--outBAMcompression 0',
+                '--outFilterMultimapNmax 50',
+                '--peOverlapNbasesMin 10',
+                '--alignSplicedMateMapLminOverLmate 0.5',
+                '--alignSJstitchMismatchNmax 5',
+                '-1 5 5',
+                '--chimSegmentMin 10',
+                '--chimOutType WithinBAM HardClip',
+                '--chimJunctionOverhangMin 10',
+                '--chimScoreDropMax 30',
+                '--chimScoreJunctionNonGTAG 0',
+                '--chimScoreSeparation 1',
+                '--chimSegmentReadGapMax 3',
+                '--chimMultimapNmax 50'
+            ].join(' ').trim()
+        }
         publishDir = [
             [
                 path: { "${params.outdir}/analysis/${meta.id}/arriba/STAR/log" },
@@ -186,30 +189,33 @@ process {
     }
 
     withName: STAR_FOR_STARFUSION {
-        ext.args = [
-            '--readFilesCommand zcat',
-            '--outSAMtype BAM Unsorted',
-            '--outReadsUnmapped None',
-            '--twopassMode Basic',
-            '--outSAMstrandField intronMotif',
-            '--outSAMunmapped Within',
-            '--chimSegmentMin 12',
-            '--chimJunctionOverhangMin 8',
-            '--chimOutJunctionFormat 1',
-            '--alignSJDBoverhangMin 10',
-            '--alignMatesGapMax 100000',
-            '--alignIntronMax 100000',
-            '--alignSJstitchMismatchNmax 5 -1 5 5',
-            '--chimMultimapScoreRange 3',
-            '--chimScoreJunctionNonGTAG -4',
-            '--chimMultimapNmax 20',
-            '--chimNonchimScoreDropMin 10',
-            '--peOverlapNbasesMin 12',
-            '--peOverlapMMp 0.1',
-            '--alignInsertionFlush Right',
-            '--alignSplicedMateMapLminOverLmate 0',
-            '--alignSplicedMateMapLmin 30'
-        ].join(' ').trim()
+        ext.args = {
+            [
+                "--outSAMattrRGline ID:${meta.sample} SM:${meta.sample} PL:Illumina",
+                '--readFilesCommand zcat',
+                '--outSAMtype None',
+                '--outReadsUnmapped None',
+                '--twopassMode Basic',
+                '--outSAMstrandField intronMotif',
+                '--outSAMunmapped Within',
+                '--chimSegmentMin 12',
+                '--chimJunctionOverhangMin 8',
+                '--chimOutJunctionFormat 1',
+                '--alignSJDBoverhangMin 10',
+                '--alignMatesGapMax 100000',
+                '--alignIntronMax 100000',
+                '--alignSJstitchMismatchNmax 5 -1 5 5',
+                '--chimMultimapScoreRange 3',
+                '--chimScoreJunctionNonGTAG -4',
+                '--chimMultimapNmax 20',
+                '--chimNonchimScoreDropMin 10',
+                '--peOverlapNbasesMin 12',
+                '--peOverlapMMp 0.1',
+                '--alignInsertionFlush Right',
+                '--alignSplicedMateMapLminOverLmate 0',
+                '--alignSplicedMateMapLmin 30'
+            ].join(' ').trim()
+        }
         publishDir = [
             [
                 path: { "${params.outdir}/analysis/${meta.id}/starfusion/STAR/log" },
@@ -227,19 +233,22 @@ process {
 
     withName: STAR_ALIGN {
         ext.prefix = { "$meta.sample" }
-        ext.args = [
-            '--quantMode GeneCounts',
-            '--twopassMode Basic',
-            '--outSAMtype BAM SortedByCoordinate',
-            '--readFilesCommand zcat',
-            '--runRNGseed 0',
-            '--outFilterMultimapNmax 20',
-            '--alignSJDBoverhangMin 1',
-            '--outSAMattributes NH HI AS NM MD',
-            '--quantTranscriptomeBan Singleend',
-            '--outSAMstrandField intronMotif',
-            params.save_unaligned ? '--outReadsUnmapped Fastx' : ''
-        ].join(' ').trim()
+        ext.args = {
+            [
+                "--outSAMattrRGline ${meta.read_group.collect{"ID:${it} SM:${meta.sample} PL:Illumina"}.join(" , ")}",
+                '--quantMode GeneCounts',
+                '--twopassMode Basic',
+                '--outSAMtype BAM SortedByCoordinate',
+                '--readFilesCommand zcat',
+                '--runRNGseed 0',
+                '--outFilterMultimapNmax 20',
+                '--alignSJDBoverhangMin 1',
+                '--outSAMattributes NH HI AS NM MD',
+                '--quantTranscriptomeBan Singleend',
+                '--outSAMstrandField intronMotif',
+                params.save_unaligned ? '--outReadsUnmapped Fastx' : ''
+            ].join(' ').trim()
+        }
         publishDir = [
             [
                 path: { "${params.outdir}/analysis/${meta.sample}/STAR/log" },
@@ -247,11 +256,7 @@ process {
                 pattern: '*.{out,tab}'
             ],
             [
-                path: {
-                    meta.fq_num.toInteger() > 1 ?
-                        "${params.outdir}/analysis/${meta.sample}/STAR/${meta.id}" :
-                        "${params.outdir}/analysis/${meta.sample}/STAR/"
-                },
+                path: {"${params.outdir}/analysis/${meta.sample}/STAR/"},
                 mode: params.publish_dir_mode,
                 pattern: '*.bam',
                 saveAs: { filename -> meta.fq_num.toInteger() > 1 ? null : filename }
diff --git a/modules.json b/modules.json
index 815a7c1..189fd59 100644
--- a/modules.json
+++ b/modules.json
@@ -75,11 +75,6 @@
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
                         "installed_by": ["modules"]
                     },
-                    "star/align": {
-                        "branch": "master",
-                        "git_sha": "cf5b9c30a2adacc581793afb79fae5f5b50bed01",
-                        "installed_by": ["modules"]
-                    },
                     "star/genomegenerate": {
                         "branch": "master",
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
diff --git a/modules/nf-core/star/align/main.nf b/modules/local/star/align/main.nf
similarity index 87%
rename from modules/nf-core/star/align/main.nf
rename to modules/local/star/align/main.nf
index 8b0f9d8..a210ab0 100644
--- a/modules/nf-core/star/align/main.nf
+++ b/modules/local/star/align/main.nf
@@ -11,17 +11,14 @@ process STAR_ALIGN {
     tuple val(meta), path(reads)
     path index
     path gtf
-    val star_ignore_sjdbgtf
-    val seq_platform
-    val seq_center
 
     output:
-    tuple val(meta), path('*d.out.bam') , emit: bam
     tuple val(meta), path('*Log.final.out') , emit: log_final
     tuple val(meta), path('*Log.out') , emit: log_out
     tuple val(meta), path('*Log.progress.out'), emit: log_progress
     path "versions.yml" , emit: versions
 
+    tuple val(meta), path('*d.out.bam') , optional:true, emit: bam
     tuple val(meta), path('*sortedByCoord.out.bam') , optional:true, emit: bam_sorted
     tuple val(meta), path('*toTranscriptome.out.bam'), optional:true, emit: bam_transcript
     tuple val(meta), path('*Aligned.unsort.out.bam') , optional:true, emit: bam_unsorted
@@ -36,20 +33,21 @@ process STAR_ALIGN {
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
-    def ignore_gtf = star_ignore_sjdbgtf ? '' : "--sjdbGTFfile $gtf"
-    def seq_platform = seq_platform ? "'PL:$seq_platform'" : ""
-    def seq_center = seq_center ? "--outSAMattrRGline ID:$prefix 'CN:$seq_center' 'SM:$prefix' $seq_platform " : "--outSAMattrRGline ID:$prefix 'SM:$prefix' $seq_platform "
+    def reads1 = [], reads2 = []
+    meta.single_end ? reads.each{reads1 << it} : reads.eachWithIndex{ v, ix -> ( ix & 1 ? reads2 : reads1) << v }
+    def attrRG = args.contains("--outSAMattrRGline") ? '' : "--outSAMattrRGline ID:$prefix 'SM:$prefix'"
     def out_sam_type = (args.contains('--outSAMtype')) ? '' : '--outSAMtype BAM Unsorted'
+    def gtf_param = gtf ? "--sjdbGTFfile $gtf" : ''
     def mv_unsorted_bam = (args.contains('--outSAMtype BAM Unsorted SortedByCoordinate')) ? "mv ${prefix}.Aligned.out.bam ${prefix}.Aligned.unsort.out.bam" : ''
     """
     STAR \\
         --genomeDir $index \\
-        --readFilesIn $reads \\
+        --readFilesIn ${reads1.join(",")} ${reads2.join(",")} \\
         --runThreadN $task.cpus \\
         --outFileNamePrefix $prefix. \\
         $out_sam_type \\
-        $ignore_gtf \\
-        $seq_center \\
+        $attrRG \\
+        $gtf_param \\
         $args
 
     $mv_unsorted_bam
diff --git a/modules/nf-core/star/align/meta.yml b/modules/local/star/align/meta.yml
similarity index 99%
rename from modules/nf-core/star/align/meta.yml
rename to modules/local/star/align/meta.yml
index 7ee10f1..17b77db 100644
--- a/modules/nf-core/star/align/meta.yml
+++ b/modules/local/star/align/meta.yml
@@ -79,3 +79,4 @@ authors:
   - "@kevinmenden"
   - "@drpatelh"
   - "@praveenraj2018"
+  - "@anoronh4"
diff --git a/subworkflows/local/align_reads.nf b/subworkflows/local/align_reads.nf
index 13a00a4..6874881 100644
--- a/subworkflows/local/align_reads.nf
+++ b/subworkflows/local/align_reads.nf
@@ -1,5 +1,4 @@
-include { STAR_ALIGN } from '../../modules/nf-core/star/align/main'
-include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
+include { STAR_ALIGN } from '../../modules/local/star/align/main'
 include { UMITOOLS_DEDUP } from '../../modules/nf-core/umitools/dedup/main'
 include {
     SAMTOOLS_INDEX;
@@ -20,34 +19,11 @@ workflow ALIGN_READS {
     STAR_ALIGN(
         reads,
         star_index,
-        gtf,
-        false,
-        false,
-        false
+        gtf
     )
     ch_versions = ch_versions.mix(STAR_ALIGN.out.versions.first())
 
-    star_align_bam = STAR_ALIGN.out.bam
-        .map{ meta, bam ->
-            def meta_clone = meta.clone().findAll { !["read_group","fastq_pair_id"].contains(it.key) }
-            meta_clone.id = meta.sample
-            [meta_clone, bam]
-        }.branch { meta, bam ->
-            needs_merge: meta.fq_num > 1
-            skips_merge: meta.fq_num == 1
-        }
-
-    SAMTOOLS_MERGE(
-        star_align_bam.needs_merge
-            .map{ meta, bam -> [groupKey(meta, meta.fq_num),bam] }
-            .groupTuple(),
-        [],
-        []
-    )
-    ch_versions = ch_versions.mix(SAMTOOLS_MERGE.out.versions.first())
-
-    merged_bam = star_align_bam.skips_merge
-        .mix(SAMTOOLS_MERGE.out.bam)
+    merged_bam = STAR_ALIGN.out.bam
 
     SAMTOOLS_INDEX(merged_bam)
     ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first())
@@ -69,7 +45,6 @@ workflow ALIGN_READS {
             .filter{ meta, bam -> ! meta.has_umi }
     )
 
-
     emit:
     bam = dedup_bam
     bam_withdup = merged_bam
diff --git a/subworkflows/local/fusion.nf b/subworkflows/local/fusion.nf
index e801507..90fdf1c 100644
--- a/subworkflows/local/fusion.nf
+++ b/subworkflows/local/fusion.nf
@@ -1,6 +1,6 @@
-include { STAR_ALIGN as STAR_FOR_ARRIBA } from '../../modules/nf-core/star/align/main'
+include { STAR_ALIGN as STAR_FOR_ARRIBA } from '../../modules/local/star/align/main'
 include { ARRIBA } from '../../modules/nf-core/arriba/main'
-include { STAR_ALIGN as STAR_FOR_STARFUSION } from '../../modules/nf-core/star/align/main'
+include { STAR_ALIGN as STAR_FOR_STARFUSION } from '../../modules/local/star/align/main'
 include { STARFUSION } from '../../modules/local/starfusion/detect/main'
 include { FUSIONCATCHER_DETECT } from '../../modules/local/fusioncatcher/detect/main'
 include { FUSIONREPORT } from '../../modules/local/fusionreport/run/main'
@@ -24,10 +24,7 @@ workflow FUSION {
     STAR_FOR_ARRIBA(
         reads,
         star_index,
-        gtf,
-        false,
-        false,
-        false
+        gtf
     )
     ch_versions = ch_versions.mix(STAR_FOR_ARRIBA.out.versions.first())
 
@@ -47,10 +44,7 @@ workflow FUSION {
         reads,
         // use the star index in the starfusion reference to ensure compatibility
         starfusion_ref.map{ file( it + "/ref_genome.fa.star.idx")},
-        gtf,
-        false,
-        false,
-        false
+        gtf
     )
     ch_versions = ch_versions.mix(STAR_FOR_STARFUSION.out.versions.first())
 
diff --git a/subworkflows/local/merge_reads.nf b/subworkflows/local/merge_reads.nf
index fef4343..53e2ccf 100644
--- a/subworkflows/local/merge_reads.nf
+++ b/subworkflows/local/merge_reads.nf
@@ -1,5 +1,4 @@
 include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/samtools/bam2fq/main'
-include { CAT_FASTQ } from '../../modules/nf-core/cat/fastq/main'
 
 workflow MERGE_READS {
     take:
@@ -10,39 +9,18 @@ workflow MERGE_READS {
     ch_versions = Channel.empty()
 
     reads_ch = reads
-        .map{ meta, reads ->
-            def meta_clone = meta.clone().findAll { !["read_group","fastq_pair_id"].contains(it.key) }
-            meta_clone.id = meta.sample
-            [meta_clone, reads]
-        }.branch { meta, reads ->
-            needs_merge: ( meta.fq_num > 1 ) && ( ! ( meta.has_umi && params.dedup_umi_for_fusions ) )
-            needs_bam2fq: meta.has_umi && params.dedup_umi_for_fusions
-            skips_merge: true
-        }
-
+        .filter{ meta, reads -> ! ( meta.has_umi && params.dedup_umi_for_fusions) }
+
     bam_ch = bam
-        .branch { meta, bam ->
-            needs_bam2fq: meta.has_umi && params.dedup_umi_for_fusions
-            skips_bam2fq: true
-        }
-
-
-    CAT_FASTQ(
-        reads_ch.needs_merge
-            .map{ meta, reads -> [ groupKey(meta, meta.fq_num), reads ] }
-            .groupTuple()
-            .map{ meta, reads -> [ meta, reads.flatten() ] }
-    )
-    ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first())
+        .filter{ meta, bam -> meta.has_umi && params.dedup_umi_for_fusions }
 
     SAMTOOLS_BAM2FQ(
-        bam_ch.needs_bam2fq,
+        bam_ch,
         true
     )
     ch_versions = ch_versions.mix(SAMTOOLS_BAM2FQ.out.versions.first())
 
-    merged_reads = reads_ch.skips_merge
-        .mix(CAT_FASTQ.out.reads)
+    merged_reads = reads_ch
         .mix(
             SAMTOOLS_BAM2FQ.out.reads
                 .map{ meta, reads ->
@@ -51,6 +29,6 @@ workflow MERGE_READS {
         )
 
     emit:
-    merged_reads = merged_reads
+    dedup_reads = merged_reads
     ch_versions = ch_versions
 }
diff --git a/subworkflows/local/preprocess_reads.nf b/subworkflows/local/preprocess_reads.nf
index 39db326..7fb8d53 100644
--- a/subworkflows/local/preprocess_reads.nf
+++ b/subworkflows/local/preprocess_reads.nf
@@ -18,7 +18,7 @@ workflow PREPROCESS_READS {
             def meta_clone = meta.clone()
             if (params.extract_fq_read_group) {
                 def rg_map = Utils.flowcellLaneFromFastq(reads[0])
-                meta_clone.read_group = "${meta.sample}@${rg_map["fcid"]}@${rg_map["lane"]}"
+                meta_clone.read_group = "${meta.sample}@${rg_map["fcid"]}@${rg_map["lane"]}@${meta.fastq_pair_id}"
                 meta_clone.id = meta_clone.read_group
             } else {
                 meta_clone.read_group = meta_clone.id
@@ -43,15 +43,28 @@ workflow PREPROCESS_READS {
         false,
         false
     )
+    ch_versions = ch_versions.mix(FASTP.out.versions.first())
 
     if (params.skip_trimming){
         trimmed_reads = extracted_reads
     } else {
         trimmed_reads = FASTP.out.reads
-        ch_versions = ch_versions.mix(FASTP.out.versions.first())
     }
+    trimmed_grouped_reads = trimmed_reads
+        .map{ meta, reads ->
+            def read_group = meta.read_group
+            def meta_clone = meta.clone().findAll { !["read_group","fastq_pair_id"].contains(it.key) }
+            meta_clone.id = meta.sample
+            [groupKey(meta_clone,meta.fq_num), reads, read_group]
+        }.groupTuple()
+        .map{ meta, reads, read_group ->
+            meta = meta + [read_group:read_group]
+            [meta, reads.flatten()]
+        }
+
     emit:
-    reads = trimmed_reads
+    reads = trimmed_grouped_reads
+    ungrouped_reads = trimmed_reads
     fastp_json = FASTP.out.json
     ch_versions = ch_versions
 }
diff --git a/workflows/forte.nf b/workflows/forte.nf
index 907e997..3950913 100644
--- a/workflows/forte.nf
+++ b/workflows/forte.nf
@@ -108,7 +108,7 @@ workflow FORTE {
     )
 
     FUSION(
-        MERGE_READS.out.merged_reads,
+        MERGE_READS.out.dedup_reads,
         PREPARE_REFERENCES.out.star_index,
         PREPARE_REFERENCES.out.gtf,
         PREPARE_REFERENCES.out.starfusion_ref,