From b7f2c8315f025e336011a6b12cbf9421336fd2a4 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Fri, 29 Dec 2023 12:51:18 -0500 Subject: [PATCH] add processes/scripts to generate metafusion gene.info file and gene.bed files --- bin/final_generate_v75_gene_bed.R | 106 +++++++++++------- bin/make_gene_info_for_forte.R | 84 ++++++++++---- conf/igenomes.config | 4 - conf/modules.config | 20 +++- modules.json | 5 + modules/local/metafusion/genebed/main.nf | 47 ++++++++ modules/local/metafusion/geneinfo/main.nf | 50 +++++++++ modules/local/metafusion/{ => run}/main.nf | 2 +- .../nf-core/agat/spaddintrons/environment.yml | 7 ++ modules/nf-core/agat/spaddintrons/main.nf | 51 +++++++++ modules/nf-core/agat/spaddintrons/meta.yml | 43 +++++++ .../agat/spaddintrons/tests/main.nf.test | 61 ++++++++++ .../agat/spaddintrons/tests/main.nf.test.snap | 40 +++++++ .../nf-core/agat/spaddintrons/tests/tags.yml | 2 + subworkflows/local/fusion.nf | 5 +- subworkflows/local/prepare_references.nf | 30 +++-- workflows/forte.nf | 1 + 17 files changed, 478 insertions(+), 80 deletions(-) mode change 100644 => 100755 bin/final_generate_v75_gene_bed.R create mode 100644 modules/local/metafusion/genebed/main.nf create mode 100644 modules/local/metafusion/geneinfo/main.nf rename modules/local/metafusion/{ => run}/main.nf (98%) create mode 100644 modules/nf-core/agat/spaddintrons/environment.yml create mode 100644 modules/nf-core/agat/spaddintrons/main.nf create mode 100644 modules/nf-core/agat/spaddintrons/meta.yml create mode 100644 modules/nf-core/agat/spaddintrons/tests/main.nf.test create mode 100644 modules/nf-core/agat/spaddintrons/tests/main.nf.test.snap create mode 100644 modules/nf-core/agat/spaddintrons/tests/tags.yml diff --git a/bin/final_generate_v75_gene_bed.R b/bin/final_generate_v75_gene_bed.R old mode 100644 new mode 100755 index 57ecbe9..46a5d15 --- a/bin/final_generate_v75_gene_bed.R +++ b/bin/final_generate_v75_gene_bed.R @@ -1,4 +1,5 @@ +#!/usr/local/bin/Rscript # __author__ = "Alexandria Dymun" # __email__ = "pintoa1@mskcc.org" # __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" @@ -6,13 +7,24 @@ # __status__ = "Dev" +suppressPackageStartupMessages({ + library(plyr) + library(dplyr) + library(data.table) + library(stringr) +}) -library(dplyr) -library(data.table) -library(stringr) -gtf <- rtracklayer::import('/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf') -gtf_df <- as.data.frame(gtf) +usage <- function() { + message("Usage:") + message("final_generate_v75_gene_bed.R ") +} +args = commandArgs(TRUE) + +if (length(args)!=2) { + usage() + quit() +} # Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75 # Add introns to gtf, convert to gff3 @@ -20,28 +32,34 @@ gtf_df <- as.data.frame(gtf) # -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\ # /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\ # -o genes.INTRONS.gff3" -# gff2bed < genes.INTRONS.gff3 > genes.INTRONS.agat.bed +gtf <- rtracklayer::import(args[1]) +gtf_df <- as.data.frame(gtf) -total.introns.bed <- fread(file="/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/genes.INTRONS.agat.bed", header = FALSE, stringsAsFactors = F, sep="\t", na.strings = "",data.table = F) -colnames(total.introns.bed) <- c("chr","start","end","gene_id","tmp","strand","gene_biotype","type","V9","description") -total.introns.bed$transcript_id <- gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"transcript_id=",n=2)[,2]) -total.introns.bed$gene_name <-gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"gene_name=",n=2)[,2]) +file.to_write <- args[2] -transcript_ids <- unique(total.introns.bed$transcript_id) -file.to_write <- "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/cleaned_metafusion_v75_gene.bed" +gtf_df <- gtf_df %>% + rename( + chr = seqnames + ) %>% + select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% + filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) -if(file.exists(file.to_write) ) {file.remove(file.to_write)} -#START CLOCK: THE INDEXING TAKES A LONG TIME, LIKE 5 HOURS +#START CLOCK ptm <- proc.time() +print(ptm) # Index each transcript feature, incrementing when an intron is passed ## metafusion expects exon count 0 to (N(exons)-1) ## Forward strand: Exon 0 == Exon 1 ### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT -for (id in transcript_ids){ - transcript <- total.introns.bed[total.introns.bed$transcript_id == id,] + +print(dim(gtf_df)) +print(length(unique(gtf_df$transcript_id))) + +modify_transcript <- function(transcript){ + # Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another if ("CDS" %in% transcript$type){ transcript <- transcript[!transcript$type == "exon",] @@ -51,7 +69,7 @@ for (id in transcript_ids){ # Index features idx <- 0 for (i in 1:nrow(transcript)){ - transcript$idx [i]<- idx + transcript$idx[i]<- idx if (transcript$type[i] == "intron"){ idx <- idx + 1 } @@ -68,7 +86,8 @@ for (id in transcript_ids){ #Add "chr" prefix to chromosomes transcript$chr <- sapply("chr", paste0, transcript$chr) #Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later - if ("CDS" %in% unique(transcript$type)){transcript[transcript$type == "CDS",]$type <- "cds"} + transcript <- transcript %>% mutate(type = as.character(type)) + transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type)) ## DETERMING UTR3 and UTR5 ### INSTEAD OF START AND STOP, USE CDS LOCATIONS AND STRAND INFORMATION..... if ("UTR" %in% unique(transcript$type)){ @@ -79,35 +98,42 @@ for (id in transcript_ids){ transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr5" transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr3" }else { + #Reverse strand start_coding <- max(transcript[transcript$type == "cds","end"]) stop_coding <- min(transcript[transcript$type == "cds","start"]) transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr3" transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5" } } - transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )] - write.table(transcript, file.to_write, append=TRUE, sep="\t", quote=F, row.names=F, col.names=F) + #### Any exon that remains after teh cds change, is likely and untranslated region. change below + + # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) + #Forward strand + transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5" + #Reverse strand + transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3" + #transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )] + expected_types <- c("cds","intron","utr3","utr5") + transcript <- transcript[transcript$type %in% c(expected_types),] + return(transcript) } -time <- proc.time() - ptm -time -# -# user system elapsed -# 16657.116 32.227 16741.382 - - -new.bed <- fread(file.to_write,data.table = F) -colnames(new.bed) <- c("chr","start","end","transcript_id","type","idx","strand","gene_name","gene_id") - -#### Any exon that remains after teh cds change, is likely and untranslated region. change below - -# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) -#Forward strand -new.bed$type[new.bed$strand == "f" & new.bed$type == "exon" ] <- "utr5" -#Reverse strand -new.bed$type[new.bed$strand == "r" & new.bed$type == "exon"]<- "utr3" +if(file.exists(file.to_write) ) {file.remove(file.to_write)} -expected_types <- c("cds","intron","utr3","utr5") -new.bed.ready <- new.bed[new.bed$type %in% c(expected_types),] +gtf_df_modified <- gtf_df %>% + group_by(transcript_id,.drop = FALSE) %>% + group_modify(~ modify_transcript(.x)) %>% + select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>% + arrange(chr,start,end) -write.table(new.bed.ready, "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/v75_gene.bed", sep="\t", quote=F, row.names=F, col.names=F) +time <- proc.time() - ptm +print(time) + +write.table( + gtf_df_modified, + file.to_write, + sep="\t", + quote=F, + row.names=F, + col.names=F +) diff --git a/bin/make_gene_info_for_forte.R b/bin/make_gene_info_for_forte.R index 1ef403c..2ab3dfd 100644 --- a/bin/make_gene_info_for_forte.R +++ b/bin/make_gene_info_for_forte.R @@ -1,4 +1,4 @@ - +#!/usr/local/bin/Rscript # __author__ = "Alexandria Dymun" # __email__ = "pintoa1@mskcc.org" # __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" @@ -6,22 +6,53 @@ # __status__ = "Dev" +suppressPackageStartupMessages({ + library(dplyr) + library(stringr) +}) -library(dplyr) -library(stringr) -library(argparse) - -opt = commandArgs(TRUE) - -parser=ArgumentParser() -parser$add_argument("-p",'--primary_gtf',type="character",default = NULL,help = "Primary GTF, should match your bed file and arriba. Assumes ARRIBA is on primary gtf") -parser$add_argument("-c",'--fc_custom_bed_gene_names',type="character",default = NULL,help = "Fusioncatcher custom genes bed file") -parser$add_argument("-s",'--star_fusion_ref',type="character",default = NULL,help = "StarFusion GTF") -parser$add_argument("-f",'--fusioncatcher_ref',type="character",default = NULL,help = "Fusioncatcher GTF") -parser$add_argument("-o",'--outputDir',type="character",default = NULL,help = "outputDirectory to write gene_info and excess gene list") - -opt=parser$parse_args() - +usage <- function() { + message("Usage:") + message("make_gene_info_for_forte.R --primary_gtf --fc_custom_bed_gene_names --star_fusion_ref --fusioncatcher_ref --outputDir ") +} + +args = commandArgs(TRUE) + +if (is.null(args) | length(args)<1) { + usage() + quit() +} + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1]))) + parsed_args[! is.na(parsed_args)] +} + +opt <- parse_args(paste(args,collapse=" ")) + +required_args <- c( + "primary_gtf", + "fc_custom_bed_gene_names", + "star_fusion_ref", + "fusioncatcher_ref", + "outputDir" +) +if (length(setdiff(required_args,names(opt))) > 0) { + message("Missing required arguments") + usage() + quit() +} ### primary gtf is v75, also used in arriba primary_gtf <- as.data.frame(rtracklayer::import(opt$primary_gtf)) @@ -59,7 +90,7 @@ versioned_gtf <-unlist(sapply(names(unique_id_to_names)[names(unique_id_to_names })) -add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){ +add_these_excess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){ add_symbols_and_ids <- unique_id_to_names[[name]] add_symbols_and_ids <- add_symbols_and_ids[!add_symbols_and_ids$gene_id %in% gene_info$gene_id,] if(name %in% versioned_gtf){ @@ -70,7 +101,7 @@ add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names })) # Excess genes being added (genes will be flagged as gene not in v75) -gene_info <- rbind(gene_info,add_these_exess_gene_ids) +gene_info <- rbind(gene_info,add_these_excess_gene_ids) gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F) @@ -79,5 +110,18 @@ gene_info$Symbol <- gene_info$gene_name gene_info <- gene_info[,c("Symbol","Synonyms")] -write.table(gene_info,paste0(opt$outputDir,"/gene.info"),sep ="\t",quote = F,row.names = F) -write.table(add_these_exess_gene_ids,paste0(opt$outputDir,"/excess_gene_ids.txt",sep ="\t",quote = F,row.names = F) +write.table( + gene_info, + paste0(opt$outputDir,"/gene.info"), + sep ="\t", + quote = F, + row.names = F + +) +write.table( + add_these_excess_gene_ids, + paste0(opt$outputDir,"/excess_gene_ids.txt"), + sep ="\t", + quote = F, + row.names = F +) diff --git a/conf/igenomes.config b/conf/igenomes.config index 9cea106..c618acc 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -31,8 +31,6 @@ params { } } metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/blocklist_breakpoints.bedpe.gz" - metafusion_gene_bed = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/v75_gene.bed.gz" - metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/gene_info_20230714.txt" ensembl_version = 75 } 'GRCh38' { @@ -49,8 +47,6 @@ params { starfusion_url = null cdna = "http://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/cdna/Sars_cov_2.ASM985889v3.cdna.all.fa.gz" metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/blocklist_breakpoints.bedpe" - metafusion_gene_bed = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/v75_gene.bed" - metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/gene_info_20230714.txt" ensembl_version = 75 } diff --git a/conf/modules.config b/conf/modules.config index 07afea1..144a14f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -186,7 +186,7 @@ process { ] } - withName: METAFUSION { + withName: METAFUSION_RUN { ext.args = "--num_tools=${params.fusion_tool_cutoff}" publishDir = [ path: { "$params.outdir/analysis/${meta.id}/metafusion/intermediates" }, @@ -196,6 +196,24 @@ process { ] } + withName: 'METAFUSION_GENEBED' { + storeDir = { "${params.reference_base}/${params.genome}/metafusion/genebed" } + publishDir = [ + enabled: false, + ] + } + withName: 'METAFUSION_GENEINFO' { + storeDir = { "${params.reference_base}/${params.genome}/metafusion/geneinfo" } + publishDir = [ + enabled: false, + ] + } + withName: 'AGAT_SPADDINTRONS' { + storeDir = { "${params.reference_base}/${params.genome}/metafusion/introns" } + publishDir = [ + enabled: false, + ] + } withName: ADD_FLAG { publishDir = [ diff --git a/modules.json b/modules.json index c43247b..fdb0a02 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "agat/spaddintrons": { + "branch": "master", + "git_sha": "6898156da3604a6bdf26c36036053a970050fea0", + "installed_by": ["modules"] + }, "arriba": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", diff --git a/modules/local/metafusion/genebed/main.nf b/modules/local/metafusion/genebed/main.nf new file mode 100644 index 0000000..5f8b95a --- /dev/null +++ b/modules/local/metafusion/genebed/main.nf @@ -0,0 +1,47 @@ +process METAFUSION_GENEBED { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'ghcr.io/rocker-org/devcontainer/tidyverse:4': + 'ghcr.io/rocker-org/devcontainer/tidyverse:4' }" + + input: + tuple val(meta), path(gff) + + output: + tuple val(meta), path("*.metafusion.gene.bed"), emit: metafusion_gene_bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + final_generate_v75_gene_bed.R \\ + $gff \\ + ${prefix}.metafusion.gene.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + final_generate_v75_gene_bed.R: 0.0.1 + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.metafusion.gene.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + final_generate_v75_gene_bed.R: 0.0.1 + END_VERSIONS + """ +} diff --git a/modules/local/metafusion/geneinfo/main.nf b/modules/local/metafusion/geneinfo/main.nf new file mode 100644 index 0000000..e6eb2f9 --- /dev/null +++ b/modules/local/metafusion/geneinfo/main.nf @@ -0,0 +1,50 @@ +process METAFUSION_GENEINFO { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'ghcr.io/rocker-org/devcontainer/tidyverse:4': + 'ghcr.io/rocker-org/devcontainer/tidyverse:4' }" + + input: + tuple val(meta), path(gtf), path(starfusion_ref), path(fusioncatcher_ref) + + output: + tuple val(meta), path("gene.info"), emit: metafusion_gene_info + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + make_gene_info_for_forte.R \\ + --primary_gtf $gtf \\ + --fc_custom_bed_gene_names $fusioncatcher_ref/custom_genes.bed \\ + --star_fusion_ref $starfusion_ref/ref_annot.gtf \\ + --fusioncatcher_ref $fusioncatcher_ref/organism.gtf \\ + --outputDir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + make_gene_info_for_forte.R: 0.0.1 + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch gene.info + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + make_gene_info_for_forte.R: 0.0.1 + END_VERSIONS + """ +} diff --git a/modules/local/metafusion/main.nf b/modules/local/metafusion/run/main.nf similarity index 98% rename from modules/local/metafusion/main.nf rename to modules/local/metafusion/run/main.nf index 40b90ee..07382ee 100644 --- a/modules/local/metafusion/main.nf +++ b/modules/local/metafusion/run/main.nf @@ -1,4 +1,4 @@ -process METAFUSION { +process METAFUSION_RUN { tag "$meta.id" label "process_low" diff --git a/modules/nf-core/agat/spaddintrons/environment.yml b/modules/nf-core/agat/spaddintrons/environment.yml new file mode 100644 index 0000000..6940adf --- /dev/null +++ b/modules/nf-core/agat/spaddintrons/environment.yml @@ -0,0 +1,7 @@ +name: agat_spaddintrons +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::agat=1.2.0" diff --git a/modules/nf-core/agat/spaddintrons/main.nf b/modules/nf-core/agat/spaddintrons/main.nf new file mode 100644 index 0000000..b27289c --- /dev/null +++ b/modules/nf-core/agat/spaddintrons/main.nf @@ -0,0 +1,51 @@ +process AGAT_SPADDINTRONS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/agat:1.2.0--pl5321hdfd78af_0': + 'biocontainers/agat:1.2.0--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(gff) + path config + + output: + tuple val(meta), path("${output}"), emit: gff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def config_param = config ? "--config $config" : "" + def prefix = meta.id ?: gff.getBaseName() + output = "${prefix}.intron.gff" + """ + agat_sp_add_introns.pl \\ + --gff $gff \\ + $config_param \\ + --out $output \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_sp_add_introns.pl --help | sed '3!d; s/.*v//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = meta.id ?: gff.getBaseName() + output = "${prefix}.intron.gff" + """ + touch ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_sp_add_introns.pl --help | sed '3!d; s/.*v//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/agat/spaddintrons/meta.yml b/modules/nf-core/agat/spaddintrons/meta.yml new file mode 100644 index 0000000..0d7ce8c --- /dev/null +++ b/modules/nf-core/agat/spaddintrons/meta.yml @@ -0,0 +1,43 @@ +name: agat_spaddintrons +description: Add intron features to gtf/gff file without intron features. +keywords: + - gtf + - gff + - introns +tools: + - agat: + description: "Another Gff Analysis Toolkit (AGAT). Suite of tools to handle gene annotations in any GTF/GFF format." + homepage: "https://agat.readthedocs.io/en/latest/" + documentation: "https://agat.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/NBISweden/AGAT" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gff: + type: file + description: Input gtf/gff file + pattern: "*.{gff,gff3,gtf}" + - config: + type: file + description: Optional input agat config file + pattern: "*.{yaml,yml}" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - gff: + type: file + description: Output gff3 file with introns + pattern: "*.gff" + +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/nf-core/agat/spaddintrons/tests/main.nf.test b/modules/nf-core/agat/spaddintrons/tests/main.nf.test new file mode 100644 index 0000000..d5c5c72 --- /dev/null +++ b/modules/nf-core/agat/spaddintrons/tests/main.nf.test @@ -0,0 +1,61 @@ +nextflow_process { + + name "Test Process AGAT_SPADDINTRONS" + script "../main.nf" + process "AGAT_SPADDINTRONS" + + tag "modules" + tag "modules_nfcore" + tag "agat" + tag "agat/spaddintrons" + + test("homo_sapiens - gtf") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("homo_sapiens - gtf - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.gff.collect { file(it[1]).getName() } + + process.out.versions ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/agat/spaddintrons/tests/main.nf.test.snap b/modules/nf-core/agat/spaddintrons/tests/main.nf.test.snap new file mode 100644 index 0000000..8be0484 --- /dev/null +++ b/modules/nf-core/agat/spaddintrons/tests/main.nf.test.snap @@ -0,0 +1,40 @@ +{ + "homo_sapiens - gtf": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.intron.gff:md5,d4b5b806927c7ad2ad6146231f1c4def" + ] + ], + "1": [ + "versions.yml:md5,b12cd5686fe12bbe48e1abedef784585" + ], + "gff": [ + [ + { + "id": "test" + }, + "test.intron.gff:md5,d4b5b806927c7ad2ad6146231f1c4def" + ] + ], + "versions": [ + "versions.yml:md5,b12cd5686fe12bbe48e1abedef784585" + ] + } + ], + "timestamp": "2023-12-27T19:20:32.415824613" + }, + "homo_sapiens - gtf - stub": { + "content": [ + [ + "test.intron.gff", + "versions.yml:md5,b12cd5686fe12bbe48e1abedef784585" + ] + ], + "timestamp": "2023-12-27T19:20:46.113839009" + } +} \ No newline at end of file diff --git a/modules/nf-core/agat/spaddintrons/tests/tags.yml b/modules/nf-core/agat/spaddintrons/tests/tags.yml new file mode 100644 index 0000000..a47bd9b --- /dev/null +++ b/modules/nf-core/agat/spaddintrons/tests/tags.yml @@ -0,0 +1,2 @@ +agat/spaddintrons: + - "modules/nf-core/agat/spaddintrons/**" diff --git a/subworkflows/local/fusion.nf b/subworkflows/local/fusion.nf index 8c8bb3b..fcc02c4 100644 --- a/subworkflows/local/fusion.nf +++ b/subworkflows/local/fusion.nf @@ -9,7 +9,7 @@ include { TO_CFF as ARRIBA_TO_CFF } from '../../modules/local/convert_ include { TO_CFF as FUSIONCATCHER_TO_CFF } from '../../modules/local/convert_to_cff/main' include { TO_CFF as STARFUSION_TO_CFF } from '../../modules/local/convert_to_cff/main' include { CAT_CAT as MERGE_CFF } from '../../modules/nf-core/cat/cat/main' -include { METAFUSION } from '../../modules/local/metafusion/main' +include { METAFUSION_RUN } from '../../modules/local/metafusion/run/main' include { ADD_FLAG } from '../../modules/local/add_flags/main' include { CFF_ANNOTATE as CFF_FINALIZE } from '../../modules/local/cff_annotate/main' @@ -25,6 +25,7 @@ workflow FUSION { agfusion_db pyensembl_cache gene_bed + gene_info blocklist arriba_blacklist arriba_known_fusions @@ -34,7 +35,7 @@ workflow FUSION { ch_versions = Channel.empty() fasta = params.fasta //gene_bed = params.metafusion_gene_bed - gene_info = params.metafusion_gene_info + //gene_info = params.metafusion_gene_info //blocklist = params.metafusion_blocklist STAR_FOR_ARRIBA( diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index 69895df..6995e6c 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -15,7 +15,9 @@ include { FUSIONCATCHER_DOWNLOAD } from '../../modules/local/fusioncatch include { ARRIBA_DOWNLOAD } from '../../modules/local/arriba/download/main' include { KALLISTO_INDEX } from '../../modules/nf-core/kallisto/index/main' include { AGFUSION_DOWNLOAD } from '../../modules/local/agfusion/download/main' - +include { AGAT_SPADDINTRONS } from '../../modules/local/agat/spaddintrons/main' +include { METAFUSION_GENEBED } from '../../modules/local/metafusion/genebed/main' +include { METAFUSION_GENEINFO } from '../../modules/local/metafusion/geneinfo/main' workflow PREPARE_REFERENCES { @@ -36,13 +38,6 @@ workflow PREPARE_REFERENCES { metafusion_blocklist = params.metafusion_blocklist } - if (params.metafusion_gene_bed.endsWith(".gz")){ - GUNZIP_METAFUSIONGENEBED([[:],params.metafusion_gene_bed]) - metafusion_gene_bed = GUNZIP_METAFUSIONGENEBED.out.gunzip.map{ it[1] }.first() - } else { - metafusion_gene_bed = params.metafusion_gene_bed - } - STAR_GENOMEGENERATE(params.fasta,gtf) ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) star_index = STAR_GENOMEGENERATE.out.index @@ -87,11 +82,21 @@ workflow PREPARE_REFERENCES { fusioncatcher_ref = Channel.empty() } - //cosmic_usr = params.cosmic_usr ?: "" - //cosmic_passwd = params.cosmic_passwd ?: "" - ARRIBA_DOWNLOAD() + AGAT_SPADDINTRONS( + [[:],gtf], + [] + ) + + METAFUSION_GENEBED( + AGAT_SPADDINTRONS.out.gff + ) + + METAFUSION_GENEINFO( + [[:],gtf, starfusion_ref,fusioncatcher_ref] + ) + AGFUSION_DOWNLOAD( params.ensembl_version, params.genome @@ -118,7 +123,8 @@ workflow PREPARE_REFERENCES { agfusion_db = AGFUSION_DOWNLOAD.out.agfusion_db pyensembl_cache = AGFUSION_DOWNLOAD.out.pyensembl_cache metafusion_blocklist = metafusion_blocklist - metafusion_gene_bed = metafusion_gene_bed + metafusion_gene_bed = METAFUSION_GENEBED.out.metafusion_gene_bed + metafusion_gene_info = METAFUSION_GENEINFO.out.metafusion_gene_info arriba_blacklist = ARRIBA_DOWNLOAD.out.blacklist arriba_known_fusions = ARRIBA_DOWNLOAD.out.known_fusions arriba_protein_domains = ARRIBA_DOWNLOAD.out.protein_domains diff --git a/workflows/forte.nf b/workflows/forte.nf index 432a86e..ea56c0a 100644 --- a/workflows/forte.nf +++ b/workflows/forte.nf @@ -135,6 +135,7 @@ workflow FORTE { PREPARE_REFERENCES.out.agfusion_db, PREPARE_REFERENCES.out.pyensembl_cache, PREPARE_REFERENCES.out.metafusion_gene_bed, + PREPARE_REFERENCES.out.metafusion_gene_info, PREPARE_REFERENCES.out.metafusion_blocklist, workflow.profile.toString().split(",").contains("test") ? [] : PREPARE_REFERENCES.out.arriba_blacklist, workflow.profile.toString().split(",").contains("test") ? [] : PREPARE_REFERENCES.out.arriba_known_fusions,