diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 470e326..ce46189 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -7,14 +7,23 @@ use_filename_as_sample_name: - kallisto custom_data: - my_genstats: + htseq_expression_genstats: plot_type: "generalstats" pconfig: - GeneCount: title: "Gene Count" format: "{:,.0f}" description: "HTSeq: Number of genes with detected expression" + kallisto_expression_genstats: + plot_type: "generalstats" + pconfig: + - GeneCount: + title: "Gene Count" + format: "{:,.0f}" + description: "Kallisto: Number of genes with detected expression" sp: - my_genstats: + htseq_expression_genstats: fn: "*.htseq.summary.txt" + kallisto_expression_genstats: + fn: "*.kallisto.customsummary.txt" diff --git a/bin/count_features.R b/bin/count_features.R new file mode 100755 index 0000000..a75b404 --- /dev/null +++ b/bin/count_features.R @@ -0,0 +1,70 @@ +#!/usr/local/bin/Rscript + +# __author__ = "Anne Marie Noronha" +# __email__ = "noronhaa@mskcc.org" +# __version__ = "0.0.1" + +usage <- function() { + message("Usage:") + message("count_features.R --abundance --gtf --sample ") +} + +args = commandArgs(TRUE) + +if (is.null(args) | length(args)<1) { + usage() + quit() +} + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1]))) + parsed_args[! is.na(parsed_args)] +} + +args_opt <- parse_args(paste(args,collapse=" ")) + +abundance <- read.table(args_opt$abundance, header=T) +gtf <- args_opt$gtf +Sample <- args_opt$sample + +gtf_df <- as.data.frame(rtracklayer::import(gtf)) + +gtf_df <- unique(gtf_df[,c("transcript_id","gene_id")]) + +abundance <- merge( + abundance, + gtf_df, + by.x="target_id", + by.y="transcript_id", + all.x = T, + all.y =F +) +TranscriptCount <- dim(abundance[abundance$est_counts > 0,])[[1]] + +abundance_gene <- aggregate( + x = abundance$est_counts, + by = list(abundance$gene_id), + FUN = sum +) +GeneCount <- dim(abundance_gene[abundance_gene$x > 0,])[[1]] + +out_df <- data.frame(Sample, TranscriptCount, GeneCount) +write.table( + out_df, + paste0(Sample,".kallisto.customsummary.txt"), + sep="\t", + row.names = F, + append = F, + quote = F +) diff --git a/conf/modules.config b/conf/modules.config index 2be37bf..0628835 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -398,10 +398,15 @@ process { } withName: HTSEQ_COUNT { + ext.when = params.expression_quantifier.replaceAll("\\s","").toLowerCase().split(",").contains("htseq") ext.args = { "-s ${meta.single_end || meta.strand == "forward" ? "yes" : meta.strand == "reverse" ? "reverse" : "no" } -r pos" } time = { check_max( 20.h * task.attempt, 'time' ) } } + withName: SUBREAD_FEATURECOUNTS { + ext.when = ext.when = params.expression_quantifier.replaceAll("\\s","").toLowerCase().split(",").contains("featurecounts") + } + withName: KALLISTO_QUANT { ext.prefix = { "$meta.sample" } ext.args = { diff --git a/modules.json b/modules.json index eb6cb4c..fa4c97f 100644 --- a/modules.json +++ b/modules.json @@ -141,6 +141,12 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "subread/featurecounts": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"], + "patch": "modules/nf-core/subread/featurecounts/subread-featurecounts.diff" + }, "ucsc/gtftogenepred": { "branch": "master", "git_sha": "511909dedab2a9955f2c3eae88abd3733edfd349", diff --git a/modules/local/count_features/main.nf b/modules/local/count_features/main.nf new file mode 100755 index 0000000..075a6ae --- /dev/null +++ b/modules/local/count_features/main.nf @@ -0,0 +1,47 @@ +process COUNT_FEATURES { + tag "$meta.id" + label "process_low" + +/// must be using singularity 3.7+ + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-rtracklayer:1.60.0--r43ha9d7317_0' : + 'https://depot.galaxyproject.org/singularity/bioconductor-rtracklayer:1.60.0--r43ha9d7317_0' }" + + input: + tuple val(meta), path(abundance) + path(gtf) + + output: + tuple val(meta), path("*.kallisto.customsummary.txt"), emit: kallisto_count_feature + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def sample = "${meta.sample}" + """ + count_features.R \\ + --abundance ${abundance} \\ + --gtf ${gtf} \\ + --sample ${sample} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + count_features.R: 0.0.1 + END_VERSIONS + """ + + stub: + def sample = meta.sample ?: '' + """ + touch ${sample}.kallisto.customsummary.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + add_flags_and_cluster_information.R: 0.0.1 + END_VERSIONS + """ +} diff --git a/modules/nf-core/subread/featurecounts/main.nf b/modules/nf-core/subread/featurecounts/main.nf new file mode 100644 index 0000000..1af3b0b --- /dev/null +++ b/modules/nf-core/subread/featurecounts/main.nf @@ -0,0 +1,48 @@ +process SUBREAD_FEATURECOUNTS { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::subread=2.0.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/subread:2.0.1--hed695b0_0' : + 'biocontainers/subread:2.0.1--hed695b0_0' }" + + input: + tuple val(meta), path(bams) + path(annotation) + + output: + tuple val(meta), path("*featureCounts.txt") , emit: counts + tuple val(meta), path("*featureCounts.txt.summary"), emit: summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired_end = meta.single_end ? '' : '-p' + + def strandedness = 0 + if (meta.strandedness == 'forward') { + strandedness = 1 + } else if (meta.strandedness == 'reverse') { + strandedness = 2 + } + """ + featureCounts \\ + $args \\ + $paired_end \\ + -T $task.cpus \\ + -a $annotation \\ + -s $strandedness \\ + -o ${prefix}.featureCounts.txt \\ + ${bams.join(' ')} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + subread: \$( echo \$(featureCounts -v 2>&1) | sed -e "s/featureCounts v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/subread/featurecounts/meta.yml b/modules/nf-core/subread/featurecounts/meta.yml new file mode 100644 index 0000000..cf02f1e --- /dev/null +++ b/modules/nf-core/subread/featurecounts/meta.yml @@ -0,0 +1,52 @@ +name: subread_featurecounts +description: Count reads that map to genomic features +keywords: + - counts + - fasta + - genome + - reference + +tools: + - featurecounts: + description: featureCounts is a highly efficient general-purpose read summarization program that counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations. It can be used to count both RNA-seq and genomic DNA-seq reads. + homepage: http://bioinf.wehi.edu.au/featureCounts/ + documentation: http://bioinf.wehi.edu.au/subread-package/SubreadUsersGuide.pdf + doi: "10.1093/bioinformatics/btt656" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/SAM file containing read alignments + pattern: "*.{bam}" + - annotation: + type: file + description: Genomic features annotation in GTF or SAF + pattern: "*.{gtf,saf}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - counts: + type: file + description: Counts of reads mapping to features + pattern: "*featureCounts.txt" + - summary: + type: file + description: Summary log file + pattern: "*.featureCounts.txt.summary" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@ntoda03" diff --git a/modules/nf-core/subread/featurecounts/subread-featurecounts.diff b/modules/nf-core/subread/featurecounts/subread-featurecounts.diff new file mode 100644 index 0000000..01ac7dd --- /dev/null +++ b/modules/nf-core/subread/featurecounts/subread-featurecounts.diff @@ -0,0 +1,15 @@ +Changes in module 'nf-core/subread/featurecounts' +--- modules/nf-core/subread/featurecounts/main.nf ++++ modules/nf-core/subread/featurecounts/main.nf +@@ -8,7 +8,8 @@ + 'biocontainers/subread:2.0.1--hed695b0_0' }" + + input: +- tuple val(meta), path(bams), path(annotation) ++ tuple val(meta), path(bams) ++ path(annotation) + + output: + tuple val(meta), path("*featureCounts.txt") , emit: counts + +************************************************************ diff --git a/nextflow.config b/nextflow.config index 5c026f9..45127a6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,6 +43,8 @@ params { dedup_umi_for_kallisto = true kallisto_fragment_len = 500 kallisto_fragment_sd = 150 + // choose "htseq" or "featurecounts" or "htseq,featurecounts" + expression_quantifier = "featurecounts" // MultiQC options multiqc_config = null diff --git a/subworkflows/local/quantification.nf b/subworkflows/local/quantification.nf index 7aa0f01..4d8fbb1 100644 --- a/subworkflows/local/quantification.nf +++ b/subworkflows/local/quantification.nf @@ -1,5 +1,7 @@ -include { HTSEQ_COUNT } from '../../modules/local/htseq/count/main' -include { KALLISTO_QUANT } from '../../modules/nf-core/kallisto/quant/main' +include { HTSEQ_COUNT } from '../../modules/local/htseq/count/main' +include { KALLISTO_QUANT } from '../../modules/nf-core/kallisto/quant/main' +include { SUBREAD_FEATURECOUNTS } from '../../modules/nf-core/subread/featurecounts/main' +include { COUNT_FEATURES } from '../../modules/local/count_features/main' workflow QUANTIFICATION { @@ -14,11 +16,19 @@ workflow QUANTIFICATION { ch_versions = Channel.empty() + HTSEQ_COUNT( bam.join(bai,by:[0]), gtf ) - ch_versions = ch_versions.mix(HTSEQ_COUNT.out.versions) + ch_versions = ch_versions.mix(HTSEQ_COUNT.out.versions) + + + SUBREAD_FEATURECOUNTS( + bam, + gtf + ) + ch_versions = ch_versions.mix(SUBREAD_FEATURECOUNTS.out.versions) KALLISTO_QUANT( reads, @@ -28,9 +38,18 @@ workflow QUANTIFICATION { ) ch_versions = ch_versions.mix(KALLISTO_QUANT.out.versions) + COUNT_FEATURES( + KALLISTO_QUANT.out.abundance, + gtf + ) + + emit: - htseq_counts = HTSEQ_COUNT.out.counts - htseq_summary = HTSEQ_COUNT.out.summary - kallisto_log = KALLISTO_QUANT.out.log - ch_versions = ch_versions + htseq_counts = HTSEQ_COUNT.out.counts + htseq_summary = HTSEQ_COUNT.out.summary + featurecounts_counts = SUBREAD_FEATURECOUNTS.out.counts + featurecounts_summary = SUBREAD_FEATURECOUNTS.out.summary + kallisto_log = KALLISTO_QUANT.out.log + kallisto_count_feature = COUNT_FEATURES.out.kallisto_count_feature + ch_versions = ch_versions } diff --git a/workflows/forte.nf b/workflows/forte.nf index aacc4cc..fb5a93f 100644 --- a/workflows/forte.nf +++ b/workflows/forte.nf @@ -142,14 +142,10 @@ workflow FORTE { ALIGN_READS.out.bam_dedup, ALIGN_READS.out.bai_dedup, QUANTIFICATION.out.kallisto_log + .mix(QUANTIFICATION.out.kallisto_count_feature) .filter{meta, log -> meta.has_umi && params.dedup_umi_for_kallisto - }.mix(ALIGN_READS.out.umitools_dedup_log) - .mix( - QUANTIFICATION.out.htseq_counts - .mix(QUANTIFICATION.out.htseq_summary) - .filter{ meta, htseq_file -> meta.has_umi } - ), + }.mix(ALIGN_READS.out.umitools_dedup_log), PREPARE_REFERENCES.out.refflat, PREPARE_REFERENCES.out.rrna_interval_list, PREPARE_REFERENCES.out.rseqc_bed, @@ -163,13 +159,10 @@ workflow FORTE { ALIGN_READS.out.bam_withdup, ALIGN_READS.out.bai_withdup, PREPROCESS_READS.out.fastp_json - .mix( - QUANTIFICATION.out.htseq_counts - .mix(QUANTIFICATION.out.htseq_summary) - .filter{ meta, htseq_file -> ! meta.has_umi } - ).mix(ALIGN_READS.out.star_log_final) + .mix(ALIGN_READS.out.star_log_final) .mix( QUANTIFICATION.out.kallisto_log + .mix(QUANTIFICATION.out.kallisto_count_feature) .filter{meta, log -> ! (meta.has_umi && params.dedup_umi_for_kallisto) }