From b9681732df9231dbb5b4f9a89a8f76d5327394b7 Mon Sep 17 00:00:00 2001
From: Anne Marie Noronha <anoronh4@users.noreply.github.com>
Date: Wed, 28 Jun 2023 13:53:57 -0400
Subject: [PATCH] Add Kallisto quantification while removing deprecated option
 to use dedup reads for fusion calls

---
 conf/igenomes.config                     |  3 +
 conf/modules.config                      | 19 +++++++
 modules.json                             | 10 ++++
 modules/nf-core/kallisto/index/main.nf   | 44 +++++++++++++++
 modules/nf-core/kallisto/index/meta.yml  | 31 +++++++++++
 modules/nf-core/kallisto/quant/main.nf   | 56 +++++++++++++++++++
 modules/nf-core/kallisto/quant/meta.yml  | 70 ++++++++++++++++++++++++
 nextflow.config                          |  6 +-
 subworkflows/local/extract_dedup_fq.nf   | 24 ++++++++
 subworkflows/local/merge_reads.nf        | 34 ------------
 subworkflows/local/prepare_references.nf |  6 ++
 subworkflows/local/quantification.nf     | 23 ++++++--
 workflows/forte.nf                       | 26 ++++++---
 13 files changed, 304 insertions(+), 48 deletions(-)
 create mode 100644 modules/nf-core/kallisto/index/main.nf
 create mode 100644 modules/nf-core/kallisto/index/meta.yml
 create mode 100644 modules/nf-core/kallisto/quant/main.nf
 create mode 100644 modules/nf-core/kallisto/quant/meta.yml
 create mode 100644 subworkflows/local/extract_dedup_fq.nf
 delete mode 100644 subworkflows/local/merge_reads.nf

diff --git a/conf/igenomes.config b/conf/igenomes.config
index 406ff36..4a1f98a 100644
--- a/conf/igenomes.config
+++ b/conf/igenomes.config
@@ -14,6 +14,7 @@ params {
             fasta          = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta"
             gtf            = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf"
             refflat        = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/refFlat.txt.gz"
+            cdna           = "https://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"
             starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh37_gencode_v19_CTAT_lib_Mar012021.plug-n-play.tar.gz"
             arriba_blacklist       = "/usr/local/var/lib/arriba/blacklist_hg19_hs37d5_GRCh37_v2.3.0.tsv.gz"
             arriba_known_fusions   = "/usr/local/var/lib/arriba/known_fusions_hg19_hs37d5_GRCh37_v2.3.0.tsv.gz"
@@ -41,6 +42,7 @@ params {
             arriba_blacklist       = "/usr/local/var/lib/arriba/blacklist_hg38_GRCh38_v2.3.0.tsv.gz"
             arriba_known_fusions   = "/usr/local/var/lib/arriba/known_fusions_hg38_GRCh38_v2.3.0.tsv.gz"
             arriba_protein_domains = "/usr/local/var/lib/arriba/protein_domains_hg38_GRCh38_v2.3.0.gff3"
+            cdna           = "https://ftp.ensembl.org/pub/release-86/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz"
         }
         'smallGRCh37' {
             fasta          = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta"
@@ -50,6 +52,7 @@ params {
             arriba_blacklist       = "/usr/local/var/lib/arriba/blacklist_hg19_hs37d5_GRCh37_v2.3.0.tsv.gz"
             arriba_known_fusions   = "/usr/local/var/lib/arriba/known_fusions_hg19_hs37d5_GRCh37_v2.3.0.tsv.gz"
             arriba_protein_domains = "/usr/local/var/lib/arriba/protein_domains_hg19_hs37d5_GRCh37_v2.3.0.gff3"
+            cdna           = "ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.75.cdna.abinitio.fa.gz"
         }
 /*
         'hg38' {
diff --git a/conf/modules.config b/conf/modules.config
index bee9070..8a47f9a 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -344,6 +344,25 @@ process {
         time     = { check_max( 20.h * task.attempt, 'time' ) }
     }
 
+    withName: KALLISTO_QUANT {
+        ext.args = {
+            [
+                "--plain-text",
+                "--bias",
+                "-b 100",
+                meta.strand == "forward" ?
+                    "--fr-stranded" :
+                    (
+                        meta.strand == "reverse" ?
+                        "--rf-stranded" :
+                        ""
+                    )
+            ].join(" ")
+        }
+        ext.fragment_len = params.kallisto_fragment_len
+        ext.sd = params.kallisto_fragment_sd
+    }
+
     withName: ARRIBA {
         ext.args = {
             [
diff --git a/modules.json b/modules.json
index 17de5ad..2c0b137 100644
--- a/modules.json
+++ b/modules.json
@@ -45,6 +45,16 @@
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
                         "installed_by": ["modules"]
                     },
+                    "kallisto/index": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
+                    },
+                    "kallisto/quant": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
+                    },
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba",
diff --git a/modules/nf-core/kallisto/index/main.nf b/modules/nf-core/kallisto/index/main.nf
new file mode 100644
index 0000000..c866c2a
--- /dev/null
+++ b/modules/nf-core/kallisto/index/main.nf
@@ -0,0 +1,44 @@
+process KALLISTO_INDEX {
+    tag "$fasta"
+    label 'process_medium'
+
+    conda "bioconda::kallisto=0.46.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/kallisto:0.46.2--h4f7b962_1' :
+        'biocontainers/kallisto:0.46.2--h4f7b962_1' }"
+
+    input:
+    path fasta
+
+    output:
+    path "kallisto" , emit: idx
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    kallisto \\
+        index \\
+        $args \\
+        -i kallisto \\
+        $fasta
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        kallisto: \$(echo \$(kallisto 2>&1) | sed 's/^kallisto //; s/Usage.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch kallisto
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        kallisto: \$(echo \$(kallisto 2>&1) | sed 's/^kallisto //; s/Usage.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/kallisto/index/meta.yml b/modules/nf-core/kallisto/index/meta.yml
new file mode 100644
index 0000000..47f40c9
--- /dev/null
+++ b/modules/nf-core/kallisto/index/meta.yml
@@ -0,0 +1,31 @@
+name: kallisto_index
+description: Create kallisto index
+keywords:
+  - index
+tools:
+  - kallisto:
+      description: Quantifying abundances of transcripts from bulk and single-cell RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads.
+      homepage: https://pachterlab.github.io/kallisto/
+      documentation: https://pachterlab.github.io/kallisto/manual
+      tool_dev_url: https://github.com/pachterlab/kallisto
+
+      licence: ["BSD-2-Clause"]
+
+input:
+  - fasta:
+      type: file
+      description: genome fasta file
+      pattern: "*.{fasta}"
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - idx:
+      type: index
+      description: Kallisto genome index
+      pattern: "*.idx"
+
+authors:
+  - "@ggabernet"
diff --git a/modules/nf-core/kallisto/quant/main.nf b/modules/nf-core/kallisto/quant/main.nf
new file mode 100644
index 0000000..4644083
--- /dev/null
+++ b/modules/nf-core/kallisto/quant/main.nf
@@ -0,0 +1,56 @@
+process KALLISTO_QUANT {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::kallisto=0.48.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/kallisto:0.48.0--h15996b6_2':
+        'biocontainers/kallisto:0.48.0--h15996b6_2' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path index
+    path gtf
+    path chromosomes
+
+    output:
+    tuple val(meta), path("abundance.tsv"), emit: abundance
+    tuple val(meta), path("abundance.h5") , emit: abundance_hdf5
+    tuple val(meta), path("run_info.json"), emit: run_info
+    tuple val(meta), path("*.log.txt")    , emit: log
+    path "versions.yml"                   , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def single = meta.single_end ? "--single --fragment-length ${task.ext.fragment_len} --sd ${task.ext.sd}" : ""
+    def gtf_input = gtf ? "--gtf ${gtf}" : ''
+    def chromosomes_input = chromosomes ? "--chromosomes ${chromosomes}" : ''
+    """
+    kallisto quant \\
+            --threads ${task.cpus} \\
+            --index ${index} \\
+            ${gtf_input} \\
+            ${chromosomes_input} \\
+            ${single} \\
+            ${args} \\
+            -o . \\
+            ${reads} 2> >(tee -a ${prefix}.log.txt >&2)
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        kallisto: \$(echo \$(kallisto version) | sed "s/kallisto, version //g" )
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        kallisto: \$(echo \$(kallisto version) | sed "s/kallisto, version //g" )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/kallisto/quant/meta.yml b/modules/nf-core/kallisto/quant/meta.yml
new file mode 100644
index 0000000..e81dd4c
--- /dev/null
+++ b/modules/nf-core/kallisto/quant/meta.yml
@@ -0,0 +1,70 @@
+name: "kallisto_quant"
+description: Computes equivalence classes for reads and quantifies abundances
+keywords:
+  - quant
+  - kallisto
+tools:
+  - "kallisto":
+      description: "Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads."
+      homepage: https://pachterlab.github.io/kallisto/
+      documentation: https://pachterlab.github.io/kallisto/manual
+      tool_dev_url: https://github.com/pachterlab/kallisto
+      doi: "10.1038/nbt.3519"
+      licence: "['BSD_2_clause']"
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+      pattern: "*.{fastq,fastq.gz}"
+  - index:
+      type: index
+      description: Kallisto genome index.
+      pattern: "*.idx"
+  - gtf:
+      type: file
+      description: Optional gtf file for translation of transcripts into genomic coordinates.
+      pattern: "*.gtf"
+  - chromosomes:
+      type: file
+      description: Optional tab separated file with chromosome names and lengths.
+      pattern: "*.tsv"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - log:
+      type: file
+      description: File containing log information from running kallisto quant
+      pattern: "*.log.txt"
+  - abundance:
+      type: file
+      description: Plaintext file of the abundance estimates
+      pattern: "abundance.tsv"
+  - abundance_hdf5:
+      type: file
+      description: |
+        A HDF5 binary file containing run info, abundance estimates, bootstrap
+        estimates, and transcript length information
+      pattern: "abundance.h5"
+  - run_info:
+      type: file
+      description: A json file containing information about the run
+      pattern: "run_info.json"
+
+authors:
+  - "@anoronh4"
diff --git a/nextflow.config b/nextflow.config
index 3a131a0..234d21b 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -29,7 +29,6 @@ params {
     save_align_intermeds       = false
 
     // Fusion
-    dedup_umi_for_fusions      = false
     run_oncokb_fusionannotator = false
     cosmic_usr                 = null
     fusion_report_cutoff       = 1
@@ -39,6 +38,11 @@ params {
     // rseqc_modules can include ['bam_stat','inner_distance','infer_experiment','junction_annotation','junction_saturation','read_distribution','read_duplication','tin']
     rseqc_modules              = ['bam_stat','inner_distance','infer_experiment','junction_annotation','junction_saturation','read_distribution','read_duplication']
 
+    // Quantification
+    dedup_umi_for_kallisto     = true
+    kallisto_fragment_len      = 500
+    kallisto_fragment_sd       = 150
+
     // MultiQC options
     multiqc_config             = null
     multiqc_title              = null
diff --git a/subworkflows/local/extract_dedup_fq.nf b/subworkflows/local/extract_dedup_fq.nf
new file mode 100644
index 0000000..1cdde72
--- /dev/null
+++ b/subworkflows/local/extract_dedup_fq.nf
@@ -0,0 +1,24 @@
+include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/samtools/bam2fq/main'
+
+workflow EXTRACT_DEDUP_FQ {
+    take:
+    bam
+
+    main:
+    ch_versions = Channel.empty()
+
+    SAMTOOLS_BAM2FQ(
+        bam,
+        true
+    )
+    ch_versions = ch_versions.mix(SAMTOOLS_BAM2FQ.out.versions.first())
+
+    dedup_reads = SAMTOOLS_BAM2FQ.out.reads
+        .map{ meta, reads ->
+            [meta, reads.findAll{ !(it.getName().endsWith("singleton.fq.gz") || it.getName().endsWith("other.fq.gz")) }]
+        }
+
+    emit:
+    dedup_reads  = dedup_reads
+    ch_versions  = ch_versions
+}
diff --git a/subworkflows/local/merge_reads.nf b/subworkflows/local/merge_reads.nf
deleted file mode 100644
index 03d30af..0000000
--- a/subworkflows/local/merge_reads.nf
+++ /dev/null
@@ -1,34 +0,0 @@
-include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/samtools/bam2fq/main'
-
-workflow MERGE_READS {
-    take:
-    reads
-    bam
-
-    main:
-    ch_versions = Channel.empty()
-
-    reads_ch = reads
-        .filter{ meta, reads -> ! ( meta.has_umi && params.dedup_umi_for_fusions) }
-
-    bam_ch = bam
-        .filter{ meta, bam -> meta.has_umi && params.dedup_umi_for_fusions }
-
-    SAMTOOLS_BAM2FQ(
-        bam_ch,
-        true
-    )
-    ch_versions = ch_versions.mix(SAMTOOLS_BAM2FQ.out.versions.first())
-
-    merged_reads = reads_ch
-        .mix(
-            SAMTOOLS_BAM2FQ.out.reads
-                .map{ meta, reads ->
-                    [meta, reads.findAll{ !(it.getName().endsWith("singleton.fq.gz") || it.getName().endsWith("other.fq.gz")) }]
-                }
-        )
-
-    emit:
-    dedup_reads  = merged_reads
-    ch_versions  = ch_versions
-}
diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf
index a87fab7..79962e3 100644
--- a/subworkflows/local/prepare_references.nf
+++ b/subworkflows/local/prepare_references.nf
@@ -9,6 +9,8 @@ include { GUNZIP                         } from '../../modules/nf-core/gunzip/ma
 include { STARFUSION_DOWNLOAD            } from '../../modules/local/starfusion/download/main'
 include { FUSIONCATCHER_DOWNLOAD         } from '../../modules/local/fusioncatcher/download/main'
 include { FUSIONREPORT_DOWNLOAD          } from '../../modules/local/fusionreport/download/main'
+include { KALLISTO_INDEX                 } from '../../modules/local/kallisto/index/main'
+
 
 workflow PREPARE_REFERENCES {
 
@@ -70,6 +72,9 @@ workflow PREPARE_REFERENCES {
     //cosmic_passwd = params.cosmic_passwd ?: ""
     FUSIONREPORT_DOWNLOAD()
 
+    KALLISTO_INDEX(params.cdna)
+    ch_versions = ch_versions.mix(KALLISTO_INDEX.out.versions)
+
     emit:
     star_index         = star_index
     // Convert queue channel to value channel so it never gets poison pilled
@@ -84,6 +89,7 @@ workflow PREPARE_REFERENCES {
     fusioncatcher_ref  = fusioncatcher_ref
     fusion_report_db   = FUSIONREPORT_DOWNLOAD.out.reference
     rseqc_bed          = UCSC_GENEPREDTOBED.out.bed.map{it[1]}.first()
+    kallisto_index     = KALLISTO_INDEX.out.idx
     ch_versions        = ch_versions
 
 }
diff --git a/subworkflows/local/quantification.nf b/subworkflows/local/quantification.nf
index c1c8222..5d542d0 100644
--- a/subworkflows/local/quantification.nf
+++ b/subworkflows/local/quantification.nf
@@ -1,19 +1,34 @@
-include { HTSEQ_COUNT } from '../../modules/local/htseq/count/main'
+include { HTSEQ_COUNT    } from '../../modules/local/htseq/count/main'
+include { KALLISTO_QUANT } from '../../modules/local/kallisto/quant/main'
+
 
 workflow QUANTIFICATION {
     take:
     bam
     bai
     gtf
+    reads
+    kallisto_idx
 
     main:
 
     ch_versions = Channel.empty()
 
-    HTSEQ_COUNT(bam.join(bai,by:[0]),gtf)
+    HTSEQ_COUNT(
+        bam.join(bai,by:[0]),
+        gtf
+    )
     ch_versions = ch_versions.mix(HTSEQ_COUNT.out.versions)
 
+    KALLISTO_QUANT(
+        reads,
+        kallisto_idx,
+        [],
+        []
+    )
+    ch_versions = ch_versions.mix(KALLISTO_QUANT.out.versions)
+
     emit:
-    htseq_counts = HTSEQ_COUNT.out.counts
-    ch_versions = ch_versions
+    htseq_counts    = HTSEQ_COUNT.out.counts
+    ch_versions     = ch_versions
 }
diff --git a/workflows/forte.nf b/workflows/forte.nf
index 65721f0..be6a99e 100644
--- a/workflows/forte.nf
+++ b/workflows/forte.nf
@@ -52,12 +52,12 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS       } from '../modules/nf-core/custom/du
 include { PREPARE_REFERENCES                } from '../subworkflows/local/prepare_references'
 include { PREPROCESS_READS                  } from '../subworkflows/local/preprocess_reads'
 include { ALIGN_READS                       } from '../subworkflows/local/align_reads'
-include { MERGE_READS                       } from '../subworkflows/local/merge_reads'
 include { MULTIQC                           } from '../modules/nf-core/multiqc/main'
 include {
     QC as QC_DUP ;
     QC as QC_DEDUP
 } from '../subworkflows/local/qc'
+include { EXTRACT_DEDUP_FQ                  } from '../subworkflows/local/extract_dedup_fq'
 include { QUANTIFICATION                    } from '../subworkflows/local/quantification'
 include { FUSION                            } from '../subworkflows/local/fusion'
 
@@ -103,21 +103,29 @@ workflow FORTE {
     )
     ch_versions = ch_versions.mix(ALIGN_READS.out.ch_versions)
 
+    EXTRACT_DEDUP_FQ(
+        ALIGN_READS.out.bam
+            .filter{ meta, bam ->
+                meta.has_umi && params.dedup_umi_for_kallisto
+            }
+    )
+    ch_versions = ch_versions.mix(EXTRACT_DEDUP_FQ.out.ch_versions)
+
     QUANTIFICATION(
         ALIGN_READS.out.bam,
         ALIGN_READS.out.bai,
-        PREPARE_REFERENCES.out.gtf
+        PREPARE_REFERENCES.out.gtf,
+        GET_DEDUP_FQ.out.dedup_reads
+            .mix(
+                PREPROCESS_READS.out.reads
+                    .filter{ meta, reads -> ! ( meta.has_umi && params.dedup_umi_for_kallisto ) }
+            ),
+        PREPARE_REFERENCES.out.kallisto_index
     )
     ch_versions = ch_versions.mix(QUANTIFICATION.out.ch_versions)
 
-
-    MERGE_READS(
-        PREPROCESS_READS.out.reads,
-        ALIGN_READS.out.bam
-    )
-
     FUSION(
-        MERGE_READS.out.dedup_reads,
+        PREPROCESS_READS.out.reads,
         PREPARE_REFERENCES.out.star_index,
         PREPARE_REFERENCES.out.gtf,
         PREPARE_REFERENCES.out.starfusion_ref,