add featurecounts to pipeline

mskcc · Aug 31, 2023 · 97fbb3b · 97fbb3b
1 parent ca13b87
commit 97fbb3b
Show file tree

Hide file tree

Showing 11 changed files with 286 additions and 20 deletions.
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -7,14 +7,23 @@ use_filename_as_sample_name:
   - kallisto
 
 custom_data:
-  my_genstats:
+  htseq_expression_genstats:
     plot_type: "generalstats"
     pconfig:
       - GeneCount:
           title: "Gene Count"
           format: "{:,.0f}"
           description: "HTSeq: Number of genes with detected expression"
+  kallisto_expression_genstats:
+    plot_type: "generalstats"
+    pconfig:
+      - GeneCount:
+          title: "Gene Count"
+          format: "{:,.0f}"
+          description: "Kallisto: Number of genes with detected expression"
 
 sp:
-  my_genstats:
+  htseq_expression_genstats:
     fn: "*.htseq.summary.txt"
+  kallisto_expression_genstats:
+    fn: "*.kallisto.customsummary.txt"
diff --git a/bin/count_features.R b/bin/count_features.R
@@ -0,0 +1,70 @@
+#!/usr/local/bin/Rscript
+
+# __author__      = "Anne Marie Noronha"
+# __email__       = "noronhaa@mskcc.org"
+# __version__     = "0.0.1"
+
+usage <- function() {
+    message("Usage:")
+    message("count_features.R --abundance <abundance.tsv> --gtf <annotation.gtf> --sample <sampleid>")
+}
+
+args = commandArgs(TRUE)
+
+if (is.null(args) | length(args)<1) {
+    usage()
+    quit()
+}
+
+#' Parse out options from a string without recourse to optparse
+#'
+#' @param x Long-form argument list like --opt1 val1 --opt2 val2
+#'
+#' @return named list of options and values similar to optparse
+
+parse_args <- function(x){
+    args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1]
+    args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE))
+    # Ensure the option vectors are length 2 (key/ value) to catch empty ones
+    args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z})
+
+    parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1])))
+    parsed_args[! is.na(parsed_args)]
+}
+
+args_opt <- parse_args(paste(args,collapse=" "))
+
+abundance <- read.table(args_opt$abundance, header=T)
+gtf <- args_opt$gtf
+Sample <- args_opt$sample
+
+gtf_df <- as.data.frame(rtracklayer::import(gtf))
+
+gtf_df <- unique(gtf_df[,c("transcript_id","gene_id")])
+
+abundance <- merge(
+    abundance,
+    gtf_df,
+    by.x="target_id",
+    by.y="transcript_id",
+    all.x = T,
+    all.y =F
+)
+TranscriptCount <- dim(abundance[abundance$est_counts > 0,])[[1]]
+
+abundance_gene <- aggregate(
+    x = abundance$est_counts,
+    by = list(abundance$gene_id),
+    FUN = sum
+)
+GeneCount <- dim(abundance_gene[abundance_gene$x > 0,])[[1]]
+
+out_df <- data.frame(Sample, TranscriptCount, GeneCount)
+write.table(
+    out_df,
+    paste0(Sample,".kallisto.customsummary.txt"),
+    sep="\t",
+    row.names = F,
+    append = F,
+    quote = F
+)
diff --git a/conf/modules.config b/conf/modules.config
@@ -398,10 +398,15 @@ process {
     }
 
     withName: HTSEQ_COUNT {
+        ext.when = params.expression_quantifier.replaceAll("\\s","").toLowerCase().split(",").contains("htseq")
         ext.args = { "-s ${meta.single_end || meta.strand == "forward" ? "yes" : meta.strand == "reverse" ? "reverse" : "no" } -r pos" }
         time     = { check_max( 20.h * task.attempt, 'time' ) }
     }
 
+    withName: SUBREAD_FEATURECOUNTS {
+        ext.when = ext.when = params.expression_quantifier.replaceAll("\\s","").toLowerCase().split(",").contains("featurecounts")
+    }
+
     withName: KALLISTO_QUANT {
         ext.prefix = { "$meta.sample" }
         ext.args = {

diff --git a/modules.json b/modules.json
@@ -141,6 +141,12 @@
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
                         "installed_by": ["modules"]
                     },
+                    "subread/featurecounts": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/subread/featurecounts/subread-featurecounts.diff"
+                    },
                     "ucsc/gtftogenepred": {
                         "branch": "master",
                         "git_sha": "511909dedab2a9955f2c3eae88abd3733edfd349",

diff --git a/modules/local/count_features/main.nf b/modules/local/count_features/main.nf
@@ -0,0 +1,47 @@
+process COUNT_FEATURES {
+    tag "$meta.id"
+    label "process_low"
+
+/// must be using singularity 3.7+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bioconductor-rtracklayer:1.60.0--r43ha9d7317_0' :
+        'https://depot.galaxyproject.org/singularity/bioconductor-rtracklayer:1.60.0--r43ha9d7317_0' }"
+
+    input:
+    tuple val(meta), path(abundance)
+    path(gtf)
+
+    output:
+    tuple val(meta), path("*.kallisto.customsummary.txt"), emit: kallisto_count_feature
+    path "versions.yml"                                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def sample = "${meta.sample}"
+    """
+    count_features.R \\
+        --abundance ${abundance} \\
+        --gtf ${gtf} \\
+        --sample ${sample}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        R: \$(R --version | head -n1)
+        count_features.R: 0.0.1
+    END_VERSIONS
+    """
+
+    stub:
+    def sample = meta.sample ?: ''
+    """
+    touch ${sample}.kallisto.customsummary.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        R: \$(R --version | head -n1)
+        add_flags_and_cluster_information.R: 0.0.1
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/subread/featurecounts/main.nf b/modules/nf-core/subread/featurecounts/main.nf
diff --git a/modules/nf-core/subread/featurecounts/meta.yml b/modules/nf-core/subread/featurecounts/meta.yml
diff --git a/modules/nf-core/subread/featurecounts/subread-featurecounts.diff b/modules/nf-core/subread/featurecounts/subread-featurecounts.diff
diff --git a/nextflow.config b/nextflow.config
@@ -43,6 +43,8 @@ params {
     dedup_umi_for_kallisto     = true
     kallisto_fragment_len      = 500
     kallisto_fragment_sd       = 150
+    // choose "htseq" or "featurecounts" or "htseq,featurecounts"
+    expression_quantifier      = "featurecounts"
 
     // MultiQC options
     multiqc_config             = null

diff --git a/subworkflows/local/quantification.nf b/subworkflows/local/quantification.nf
@@ -1,5 +1,7 @@
-include { HTSEQ_COUNT    } from '../../modules/local/htseq/count/main'
-include { KALLISTO_QUANT } from '../../modules/nf-core/kallisto/quant/main'
+include { HTSEQ_COUNT           } from '../../modules/local/htseq/count/main'
+include { KALLISTO_QUANT        } from '../../modules/nf-core/kallisto/quant/main'
+include { SUBREAD_FEATURECOUNTS } from '../../modules/nf-core/subread/featurecounts/main'
+include { COUNT_FEATURES           } from '../../modules/local/count_features/main'
 
 
 workflow QUANTIFICATION {
@@ -14,11 +16,19 @@ workflow QUANTIFICATION {
 
     ch_versions = Channel.empty()
 
+
     HTSEQ_COUNT(
         bam.join(bai,by:[0]),
         gtf
     )
-    ch_versions = ch_versions.mix(HTSEQ_COUNT.out.versions)
+    ch_versions   = ch_versions.mix(HTSEQ_COUNT.out.versions)
+
+
+    SUBREAD_FEATURECOUNTS(
+        bam,
+        gtf
+    )
+    ch_versions = ch_versions.mix(SUBREAD_FEATURECOUNTS.out.versions)
 
     KALLISTO_QUANT(
         reads,
@@ -28,9 +38,18 @@ workflow QUANTIFICATION {
     )
     ch_versions = ch_versions.mix(KALLISTO_QUANT.out.versions)
 
+    COUNT_FEATURES(
+        KALLISTO_QUANT.out.abundance,
+        gtf
+    )
+
+
     emit:
-    htseq_counts    = HTSEQ_COUNT.out.counts
-    htseq_summary   = HTSEQ_COUNT.out.summary
-    kallisto_log    = KALLISTO_QUANT.out.log
-    ch_versions     = ch_versions
+    htseq_counts           = HTSEQ_COUNT.out.counts
+    htseq_summary          = HTSEQ_COUNT.out.summary
+    featurecounts_counts   = SUBREAD_FEATURECOUNTS.out.counts
+    featurecounts_summary  = SUBREAD_FEATURECOUNTS.out.summary
+    kallisto_log           = KALLISTO_QUANT.out.log
+    kallisto_count_feature = COUNT_FEATURES.out.kallisto_count_feature
+    ch_versions            = ch_versions
 }
diff --git a/workflows/forte.nf b/workflows/forte.nf
@@ -142,14 +142,10 @@ workflow FORTE {
         ALIGN_READS.out.bam_dedup,
         ALIGN_READS.out.bai_dedup,
         QUANTIFICATION.out.kallisto_log
+            .mix(QUANTIFICATION.out.kallisto_count_feature)
             .filter{meta, log ->
                 meta.has_umi && params.dedup_umi_for_kallisto
-            }.mix(ALIGN_READS.out.umitools_dedup_log)
-            .mix(
-                QUANTIFICATION.out.htseq_counts
-                    .mix(QUANTIFICATION.out.htseq_summary)
-                    .filter{ meta, htseq_file -> meta.has_umi }
-            ),
+            }.mix(ALIGN_READS.out.umitools_dedup_log),
         PREPARE_REFERENCES.out.refflat,
         PREPARE_REFERENCES.out.rrna_interval_list,
         PREPARE_REFERENCES.out.rseqc_bed,
@@ -163,13 +159,10 @@ workflow FORTE {
         ALIGN_READS.out.bam_withdup,
         ALIGN_READS.out.bai_withdup,
         PREPROCESS_READS.out.fastp_json
-            .mix(
-                QUANTIFICATION.out.htseq_counts
-                    .mix(QUANTIFICATION.out.htseq_summary)
-                    .filter{ meta, htseq_file -> ! meta.has_umi }
-            ).mix(ALIGN_READS.out.star_log_final)
+            .mix(ALIGN_READS.out.star_log_final)
             .mix(
                 QUANTIFICATION.out.kallisto_log
+                    .mix(QUANTIFICATION.out.kallisto_count_feature)
                     .filter{meta, log ->
                         ! (meta.has_umi && params.dedup_umi_for_kallisto)
                     }