From b9681732df9231dbb5b4f9a89a8f76d5327394b7 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 28 Jun 2023 13:53:57 -0400 Subject: [PATCH] Add Kallisto quantification while removing deprecated option to use dedup reads for fusion calls --- conf/igenomes.config | 3 + conf/modules.config | 19 +++++++ modules.json | 10 ++++ modules/nf-core/kallisto/index/main.nf | 44 +++++++++++++++ modules/nf-core/kallisto/index/meta.yml | 31 +++++++++++ modules/nf-core/kallisto/quant/main.nf | 56 +++++++++++++++++++ modules/nf-core/kallisto/quant/meta.yml | 70 ++++++++++++++++++++++++ nextflow.config | 6 +- subworkflows/local/extract_dedup_fq.nf | 24 ++++++++ subworkflows/local/merge_reads.nf | 34 ------------ subworkflows/local/prepare_references.nf | 6 ++ subworkflows/local/quantification.nf | 23 ++++++-- workflows/forte.nf | 26 ++++++--- 13 files changed, 304 insertions(+), 48 deletions(-) create mode 100644 modules/nf-core/kallisto/index/main.nf create mode 100644 modules/nf-core/kallisto/index/meta.yml create mode 100644 modules/nf-core/kallisto/quant/main.nf create mode 100644 modules/nf-core/kallisto/quant/meta.yml create mode 100644 subworkflows/local/extract_dedup_fq.nf delete mode 100644 subworkflows/local/merge_reads.nf diff --git a/conf/igenomes.config b/conf/igenomes.config index 406ff36..4a1f98a 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -14,6 +14,7 @@ params { fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" refflat = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/refFlat.txt.gz" + cdna = "https://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.75.cdna.all.fa.gz" starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh37_gencode_v19_CTAT_lib_Mar012021.plug-n-play.tar.gz" arriba_blacklist = "/usr/local/var/lib/arriba/blacklist_hg19_hs37d5_GRCh37_v2.3.0.tsv.gz" arriba_known_fusions = "/usr/local/var/lib/arriba/known_fusions_hg19_hs37d5_GRCh37_v2.3.0.tsv.gz" @@ -41,6 +42,7 @@ params { arriba_blacklist = "/usr/local/var/lib/arriba/blacklist_hg38_GRCh38_v2.3.0.tsv.gz" arriba_known_fusions = "/usr/local/var/lib/arriba/known_fusions_hg38_GRCh38_v2.3.0.tsv.gz" arriba_protein_domains = "/usr/local/var/lib/arriba/protein_domains_hg38_GRCh38_v2.3.0.gff3" + cdna = "https://ftp.ensembl.org/pub/release-86/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" } 'smallGRCh37' { fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta" @@ -50,6 +52,7 @@ params { arriba_blacklist = "/usr/local/var/lib/arriba/blacklist_hg19_hs37d5_GRCh37_v2.3.0.tsv.gz" arriba_known_fusions = "/usr/local/var/lib/arriba/known_fusions_hg19_hs37d5_GRCh37_v2.3.0.tsv.gz" arriba_protein_domains = "/usr/local/var/lib/arriba/protein_domains_hg19_hs37d5_GRCh37_v2.3.0.gff3" + cdna = "ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.75.cdna.abinitio.fa.gz" } /* 'hg38' { diff --git a/conf/modules.config b/conf/modules.config index bee9070..8a47f9a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -344,6 +344,25 @@ process { time = { check_max( 20.h * task.attempt, 'time' ) } } + withName: KALLISTO_QUANT { + ext.args = { + [ + "--plain-text", + "--bias", + "-b 100", + meta.strand == "forward" ? + "--fr-stranded" : + ( + meta.strand == "reverse" ? + "--rf-stranded" : + "" + ) + ].join(" ") + } + ext.fragment_len = params.kallisto_fragment_len + ext.sd = params.kallisto_fragment_sd + } + withName: ARRIBA { ext.args = { [ diff --git a/modules.json b/modules.json index 17de5ad..2c0b137 100644 --- a/modules.json +++ b/modules.json @@ -45,6 +45,16 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "kallisto/index": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "kallisto/quant": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba", diff --git a/modules/nf-core/kallisto/index/main.nf b/modules/nf-core/kallisto/index/main.nf new file mode 100644 index 0000000..c866c2a --- /dev/null +++ b/modules/nf-core/kallisto/index/main.nf @@ -0,0 +1,44 @@ +process KALLISTO_INDEX { + tag "$fasta" + label 'process_medium' + + conda "bioconda::kallisto=0.46.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kallisto:0.46.2--h4f7b962_1' : + 'biocontainers/kallisto:0.46.2--h4f7b962_1' }" + + input: + path fasta + + output: + path "kallisto" , emit: idx + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + kallisto \\ + index \\ + $args \\ + -i kallisto \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto 2>&1) | sed 's/^kallisto //; s/Usage.*\$//') + END_VERSIONS + """ + + stub: + """ + touch kallisto + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto 2>&1) | sed 's/^kallisto //; s/Usage.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/kallisto/index/meta.yml b/modules/nf-core/kallisto/index/meta.yml new file mode 100644 index 0000000..47f40c9 --- /dev/null +++ b/modules/nf-core/kallisto/index/meta.yml @@ -0,0 +1,31 @@ +name: kallisto_index +description: Create kallisto index +keywords: + - index +tools: + - kallisto: + description: Quantifying abundances of transcripts from bulk and single-cell RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. + homepage: https://pachterlab.github.io/kallisto/ + documentation: https://pachterlab.github.io/kallisto/manual + tool_dev_url: https://github.com/pachterlab/kallisto + + licence: ["BSD-2-Clause"] + +input: + - fasta: + type: file + description: genome fasta file + pattern: "*.{fasta}" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - idx: + type: index + description: Kallisto genome index + pattern: "*.idx" + +authors: + - "@ggabernet" diff --git a/modules/nf-core/kallisto/quant/main.nf b/modules/nf-core/kallisto/quant/main.nf new file mode 100644 index 0000000..4644083 --- /dev/null +++ b/modules/nf-core/kallisto/quant/main.nf @@ -0,0 +1,56 @@ +process KALLISTO_QUANT { + tag "$meta.id" + label 'process_high' + + conda "bioconda::kallisto=0.48.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kallisto:0.48.0--h15996b6_2': + 'biocontainers/kallisto:0.48.0--h15996b6_2' }" + + input: + tuple val(meta), path(reads) + path index + path gtf + path chromosomes + + output: + tuple val(meta), path("abundance.tsv"), emit: abundance + tuple val(meta), path("abundance.h5") , emit: abundance_hdf5 + tuple val(meta), path("run_info.json"), emit: run_info + tuple val(meta), path("*.log.txt") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def single = meta.single_end ? "--single --fragment-length ${task.ext.fragment_len} --sd ${task.ext.sd}" : "" + def gtf_input = gtf ? "--gtf ${gtf}" : '' + def chromosomes_input = chromosomes ? "--chromosomes ${chromosomes}" : '' + """ + kallisto quant \\ + --threads ${task.cpus} \\ + --index ${index} \\ + ${gtf_input} \\ + ${chromosomes_input} \\ + ${single} \\ + ${args} \\ + -o . \\ + ${reads} 2> >(tee -a ${prefix}.log.txt >&2) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto version) | sed "s/kallisto, version //g" ) + END_VERSIONS + """ + + stub: + """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto version) | sed "s/kallisto, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kallisto/quant/meta.yml b/modules/nf-core/kallisto/quant/meta.yml new file mode 100644 index 0000000..e81dd4c --- /dev/null +++ b/modules/nf-core/kallisto/quant/meta.yml @@ -0,0 +1,70 @@ +name: "kallisto_quant" +description: Computes equivalence classes for reads and quantifies abundances +keywords: + - quant + - kallisto +tools: + - "kallisto": + description: "Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads." + homepage: https://pachterlab.github.io/kallisto/ + documentation: https://pachterlab.github.io/kallisto/manual + tool_dev_url: https://github.com/pachterlab/kallisto + doi: "10.1038/nbt.3519" + licence: "['BSD_2_clause']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fastq,fastq.gz}" + - index: + type: index + description: Kallisto genome index. + pattern: "*.idx" + - gtf: + type: file + description: Optional gtf file for translation of transcripts into genomic coordinates. + pattern: "*.gtf" + - chromosomes: + type: file + description: Optional tab separated file with chromosome names and lengths. + pattern: "*.tsv" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: File containing log information from running kallisto quant + pattern: "*.log.txt" + - abundance: + type: file + description: Plaintext file of the abundance estimates + pattern: "abundance.tsv" + - abundance_hdf5: + type: file + description: | + A HDF5 binary file containing run info, abundance estimates, bootstrap + estimates, and transcript length information + pattern: "abundance.h5" + - run_info: + type: file + description: A json file containing information about the run + pattern: "run_info.json" + +authors: + - "@anoronh4" diff --git a/nextflow.config b/nextflow.config index 3a131a0..234d21b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -29,7 +29,6 @@ params { save_align_intermeds = false // Fusion - dedup_umi_for_fusions = false run_oncokb_fusionannotator = false cosmic_usr = null fusion_report_cutoff = 1 @@ -39,6 +38,11 @@ params { // rseqc_modules can include ['bam_stat','inner_distance','infer_experiment','junction_annotation','junction_saturation','read_distribution','read_duplication','tin'] rseqc_modules = ['bam_stat','inner_distance','infer_experiment','junction_annotation','junction_saturation','read_distribution','read_duplication'] + // Quantification + dedup_umi_for_kallisto = true + kallisto_fragment_len = 500 + kallisto_fragment_sd = 150 + // MultiQC options multiqc_config = null multiqc_title = null diff --git a/subworkflows/local/extract_dedup_fq.nf b/subworkflows/local/extract_dedup_fq.nf new file mode 100644 index 0000000..1cdde72 --- /dev/null +++ b/subworkflows/local/extract_dedup_fq.nf @@ -0,0 +1,24 @@ +include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/samtools/bam2fq/main' + +workflow EXTRACT_DEDUP_FQ { + take: + bam + + main: + ch_versions = Channel.empty() + + SAMTOOLS_BAM2FQ( + bam, + true + ) + ch_versions = ch_versions.mix(SAMTOOLS_BAM2FQ.out.versions.first()) + + dedup_reads = SAMTOOLS_BAM2FQ.out.reads + .map{ meta, reads -> + [meta, reads.findAll{ !(it.getName().endsWith("singleton.fq.gz") || it.getName().endsWith("other.fq.gz")) }] + } + + emit: + dedup_reads = dedup_reads + ch_versions = ch_versions +} diff --git a/subworkflows/local/merge_reads.nf b/subworkflows/local/merge_reads.nf deleted file mode 100644 index 03d30af..0000000 --- a/subworkflows/local/merge_reads.nf +++ /dev/null @@ -1,34 +0,0 @@ -include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/samtools/bam2fq/main' - -workflow MERGE_READS { - take: - reads - bam - - main: - ch_versions = Channel.empty() - - reads_ch = reads - .filter{ meta, reads -> ! ( meta.has_umi && params.dedup_umi_for_fusions) } - - bam_ch = bam - .filter{ meta, bam -> meta.has_umi && params.dedup_umi_for_fusions } - - SAMTOOLS_BAM2FQ( - bam_ch, - true - ) - ch_versions = ch_versions.mix(SAMTOOLS_BAM2FQ.out.versions.first()) - - merged_reads = reads_ch - .mix( - SAMTOOLS_BAM2FQ.out.reads - .map{ meta, reads -> - [meta, reads.findAll{ !(it.getName().endsWith("singleton.fq.gz") || it.getName().endsWith("other.fq.gz")) }] - } - ) - - emit: - dedup_reads = merged_reads - ch_versions = ch_versions -} diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index a87fab7..79962e3 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -9,6 +9,8 @@ include { GUNZIP } from '../../modules/nf-core/gunzip/ma include { STARFUSION_DOWNLOAD } from '../../modules/local/starfusion/download/main' include { FUSIONCATCHER_DOWNLOAD } from '../../modules/local/fusioncatcher/download/main' include { FUSIONREPORT_DOWNLOAD } from '../../modules/local/fusionreport/download/main' +include { KALLISTO_INDEX } from '../../modules/local/kallisto/index/main' + workflow PREPARE_REFERENCES { @@ -70,6 +72,9 @@ workflow PREPARE_REFERENCES { //cosmic_passwd = params.cosmic_passwd ?: "" FUSIONREPORT_DOWNLOAD() + KALLISTO_INDEX(params.cdna) + ch_versions = ch_versions.mix(KALLISTO_INDEX.out.versions) + emit: star_index = star_index // Convert queue channel to value channel so it never gets poison pilled @@ -84,6 +89,7 @@ workflow PREPARE_REFERENCES { fusioncatcher_ref = fusioncatcher_ref fusion_report_db = FUSIONREPORT_DOWNLOAD.out.reference rseqc_bed = UCSC_GENEPREDTOBED.out.bed.map{it[1]}.first() + kallisto_index = KALLISTO_INDEX.out.idx ch_versions = ch_versions } diff --git a/subworkflows/local/quantification.nf b/subworkflows/local/quantification.nf index c1c8222..5d542d0 100644 --- a/subworkflows/local/quantification.nf +++ b/subworkflows/local/quantification.nf @@ -1,19 +1,34 @@ -include { HTSEQ_COUNT } from '../../modules/local/htseq/count/main' +include { HTSEQ_COUNT } from '../../modules/local/htseq/count/main' +include { KALLISTO_QUANT } from '../../modules/local/kallisto/quant/main' + workflow QUANTIFICATION { take: bam bai gtf + reads + kallisto_idx main: ch_versions = Channel.empty() - HTSEQ_COUNT(bam.join(bai,by:[0]),gtf) + HTSEQ_COUNT( + bam.join(bai,by:[0]), + gtf + ) ch_versions = ch_versions.mix(HTSEQ_COUNT.out.versions) + KALLISTO_QUANT( + reads, + kallisto_idx, + [], + [] + ) + ch_versions = ch_versions.mix(KALLISTO_QUANT.out.versions) + emit: - htseq_counts = HTSEQ_COUNT.out.counts - ch_versions = ch_versions + htseq_counts = HTSEQ_COUNT.out.counts + ch_versions = ch_versions } diff --git a/workflows/forte.nf b/workflows/forte.nf index 65721f0..be6a99e 100644 --- a/workflows/forte.nf +++ b/workflows/forte.nf @@ -52,12 +52,12 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/du include { PREPARE_REFERENCES } from '../subworkflows/local/prepare_references' include { PREPROCESS_READS } from '../subworkflows/local/preprocess_reads' include { ALIGN_READS } from '../subworkflows/local/align_reads' -include { MERGE_READS } from '../subworkflows/local/merge_reads' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { QC as QC_DUP ; QC as QC_DEDUP } from '../subworkflows/local/qc' +include { EXTRACT_DEDUP_FQ } from '../subworkflows/local/extract_dedup_fq' include { QUANTIFICATION } from '../subworkflows/local/quantification' include { FUSION } from '../subworkflows/local/fusion' @@ -103,21 +103,29 @@ workflow FORTE { ) ch_versions = ch_versions.mix(ALIGN_READS.out.ch_versions) + EXTRACT_DEDUP_FQ( + ALIGN_READS.out.bam + .filter{ meta, bam -> + meta.has_umi && params.dedup_umi_for_kallisto + } + ) + ch_versions = ch_versions.mix(EXTRACT_DEDUP_FQ.out.ch_versions) + QUANTIFICATION( ALIGN_READS.out.bam, ALIGN_READS.out.bai, - PREPARE_REFERENCES.out.gtf + PREPARE_REFERENCES.out.gtf, + GET_DEDUP_FQ.out.dedup_reads + .mix( + PREPROCESS_READS.out.reads + .filter{ meta, reads -> ! ( meta.has_umi && params.dedup_umi_for_kallisto ) } + ), + PREPARE_REFERENCES.out.kallisto_index ) ch_versions = ch_versions.mix(QUANTIFICATION.out.ch_versions) - - MERGE_READS( - PREPROCESS_READS.out.reads, - ALIGN_READS.out.bam - ) - FUSION( - MERGE_READS.out.dedup_reads, + PREPROCESS_READS.out.reads, PREPARE_REFERENCES.out.star_index, PREPARE_REFERENCES.out.gtf, PREPARE_REFERENCES.out.starfusion_ref,