From b7f2c8315f025e336011a6b12cbf9421336fd2a4 Mon Sep 17 00:00:00 2001
From: Anne Marie Noronha <anoronh4@users.noreply.github.com>
Date: Fri, 29 Dec 2023 12:51:18 -0500
Subject: [PATCH] add processes/scripts to generate metafusion gene.info file
 and gene.bed files

---
 bin/final_generate_v75_gene_bed.R             | 106 +++++++++++-------
 bin/make_gene_info_for_forte.R                |  84 ++++++++++----
 conf/igenomes.config                          |   4 -
 conf/modules.config                           |  20 +++-
 modules.json                                  |   5 +
 modules/local/metafusion/genebed/main.nf      |  47 ++++++++
 modules/local/metafusion/geneinfo/main.nf     |  50 +++++++++
 modules/local/metafusion/{ => run}/main.nf    |   2 +-
 .../nf-core/agat/spaddintrons/environment.yml |   7 ++
 modules/nf-core/agat/spaddintrons/main.nf     |  51 +++++++++
 modules/nf-core/agat/spaddintrons/meta.yml    |  43 +++++++
 .../agat/spaddintrons/tests/main.nf.test      |  61 ++++++++++
 .../agat/spaddintrons/tests/main.nf.test.snap |  40 +++++++
 .../nf-core/agat/spaddintrons/tests/tags.yml  |   2 +
 subworkflows/local/fusion.nf                  |   5 +-
 subworkflows/local/prepare_references.nf      |  30 +++--
 workflows/forte.nf                            |   1 +
 17 files changed, 478 insertions(+), 80 deletions(-)
 mode change 100644 => 100755 bin/final_generate_v75_gene_bed.R
 create mode 100644 modules/local/metafusion/genebed/main.nf
 create mode 100644 modules/local/metafusion/geneinfo/main.nf
 rename modules/local/metafusion/{ => run}/main.nf (98%)
 create mode 100644 modules/nf-core/agat/spaddintrons/environment.yml
 create mode 100644 modules/nf-core/agat/spaddintrons/main.nf
 create mode 100644 modules/nf-core/agat/spaddintrons/meta.yml
 create mode 100644 modules/nf-core/agat/spaddintrons/tests/main.nf.test
 create mode 100644 modules/nf-core/agat/spaddintrons/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/agat/spaddintrons/tests/tags.yml

diff --git a/bin/final_generate_v75_gene_bed.R b/bin/final_generate_v75_gene_bed.R
old mode 100644
new mode 100755
index 57ecbe9..46a5d15
--- a/bin/final_generate_v75_gene_bed.R
+++ b/bin/final_generate_v75_gene_bed.R
@@ -1,4 +1,5 @@
+#!/usr/local/bin/Rscript
 
 # __author__      = "Alexandria Dymun"
 # __email__       = "pintoa1@mskcc.org"
 # __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)"
@@ -6,13 +7,24 @@
 # __status__      = "Dev"
 
 
+suppressPackageStartupMessages({
+    library(plyr)
+    library(dplyr)
+    library(data.table)
+    library(stringr)
+})
 
-library(dplyr)
-library(data.table)
-library(stringr)
-gtf <- rtracklayer::import('/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf')
-gtf_df <- as.data.frame(gtf)
+usage <- function() {
+    message("Usage:")
+    message("final_generate_v75_gene_bed.R <in.gff> <out.bed>")
+}
 
+args <- commandArgs(TRUE)
+# Exactly two positional arguments are required: <in.gff> <out.bed>
+if (length(args) != 2) {
+    usage()
+    quit(save = "no", status = 1)  # usage error: exit non-zero so the task fails
+}
 
 # Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75
 # Add introns to gtf, convert to gff3
@@ -20,28 +32,34 @@ gtf_df <- as.data.frame(gtf)
 # -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0  \\
 # /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\
 # -o genes.INTRONS.gff3"
-# gff2bed < genes.INTRONS.gff3 > genes.INTRONS.agat.bed
 
+gtf <- rtracklayer::import(args[1])
+gtf_df <- as.data.frame(gtf)
 
-total.introns.bed <- fread(file="/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/genes.INTRONS.agat.bed", header = FALSE, stringsAsFactors = F, sep="\t", na.strings = "",data.table = F)
-colnames(total.introns.bed) <- c("chr","start","end","gene_id","tmp","strand","gene_biotype","type","V9","description")
-total.introns.bed$transcript_id <- gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"transcript_id=",n=2)[,2])
-total.introns.bed$gene_name <-gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"gene_name=",n=2)[,2])
+file.to_write <- args[2]
 
-transcript_ids <- unique(total.introns.bed$transcript_id)
-file.to_write <- "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/cleaned_metafusion_v75_gene.bed"
+gtf_df <- gtf_df %>%
+    rename(
+        chr = seqnames
+    ) %>%
+    select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>%
+    filter(type %in% c("exon","intron","UTR","CDS","cds","utr"))
 
-if(file.exists(file.to_write) ) {file.remove(file.to_write)}
 
-#START CLOCK: THE INDEXING TAKES A LONG TIME, LIKE 5 HOURS
+#START CLOCK
 ptm <- proc.time()
+print(ptm)
 
 # Index each transcript feature, incrementing when an intron is passed
 ## metafusion expects exon count 0 to (N(exons)-1)
 ## Forward strand: Exon 0 == Exon 1
 ### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT
-for (id in transcript_ids){
-    transcript <- total.introns.bed[total.introns.bed$transcript_id == id,]
+
+print(dim(gtf_df))
+print(length(unique(gtf_df$transcript_id)))
+
+modify_transcript <- function(transcript){
+
     # Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another
     if ("CDS" %in% transcript$type){
         transcript <- transcript[!transcript$type == "exon",]
@@ -51,7 +69,7 @@ for (id in transcript_ids){
     # Index features
     idx <- 0
     for (i in 1:nrow(transcript)){
-        transcript$idx [i]<- idx
+        transcript$idx[i]<- idx
         if (transcript$type[i] == "intron"){
             idx <- idx + 1
         }
@@ -68,7 +86,8 @@ for (id in transcript_ids){
     #Add "chr" prefix to chromosomes
     transcript$chr <- sapply("chr", paste0,  transcript$chr)
     #Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later
-    if ("CDS" %in% unique(transcript$type)){transcript[transcript$type == "CDS",]$type <- "cds"}
+    transcript <- transcript %>% mutate(type = as.character(type))
+    transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type))
     ## DETERMING UTR3 and UTR5
     ### INSTEAD OF START AND STOP, USE CDS LOCATIONS AND STRAND INFORMATION.....
     if ("UTR" %in% unique(transcript$type)){
@@ -79,35 +98,42 @@ for (id in transcript_ids){
             transcript$type[transcript$end <= start_coding &  transcript$type == "UTR"] <- "utr5"
             transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr3"
         }else {
+            #Reverse strand
             start_coding <- max(transcript[transcript$type == "cds","end"])
             stop_coding <- min(transcript[transcript$type == "cds","start"])
             transcript$type[transcript$end <= start_coding &  transcript$type == "UTR"] <- "utr3"
             transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5"
         }
     }
-    transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )]
-    write.table(transcript, file.to_write, append=TRUE, sep="\t", quote=F,  row.names=F, col.names=F)
+    #### Any exon that remains after the cds change is likely an untranslated region. Change below
+
+    # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
+    #Forward strand
+    transcript$type[transcript$strand == "f" &  transcript$type == "exon" ] <- "utr5"
+    #Reverse strand
+    transcript$type[transcript$strand == "r" &  transcript$type == "exon"]<- "utr3"
+    #transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )]
+    expected_types <- c("cds","intron","utr3","utr5")
+    transcript <- transcript[transcript$type %in% c(expected_types),]
+    return(transcript)
 }
 
-time <- proc.time() - ptm
-time
-#
-# user    system   elapsed
-# 16657.116    32.227 16741.382
-
-
-new.bed <- fread(file.to_write,data.table = F)
-colnames(new.bed) <- c("chr","start","end","transcript_id","type","idx","strand","gene_name","gene_id")
-
-#### Any exon that remains after teh cds change, is likely and untranslated region. change below
-
-# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
-#Forward strand
-new.bed$type[new.bed$strand == "f" &  new.bed$type == "exon" ] <- "utr5"
-#Reverse strand
-new.bed$type[new.bed$strand == "r" &  new.bed$type == "exon"]<- "utr3"
+if(file.exists(file.to_write) ) {file.remove(file.to_write)}
 
-expected_types <- c("cds","intron","utr3","utr5")
-new.bed.ready <- new.bed[new.bed$type %in% c(expected_types),]
+gtf_df_modified <- gtf_df %>%
+    group_by(transcript_id,.drop = FALSE) %>%
+    group_modify(~ modify_transcript(.x)) %>%
+    select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>%
+    arrange(chr,start,end)
 
-write.table(new.bed.ready, "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/v75_gene.bed",  sep="\t", quote=F,  row.names=F, col.names=F)
+time <- proc.time() - ptm
+print(time)
+
+write.table(
+    gtf_df_modified,
+    file.to_write,
+    sep="\t",
+    quote=F,
+    row.names=F,
+    col.names=F
+)
diff --git a/bin/make_gene_info_for_forte.R b/bin/make_gene_info_for_forte.R
index 1ef403c..2ab3dfd 100644
--- a/bin/make_gene_info_for_forte.R
+++ b/bin/make_gene_info_for_forte.R
@@ -1,4 +1,4 @@
-
+#!/usr/local/bin/Rscript
 # __author__      = "Alexandria Dymun"
 # __email__       = "pintoa1@mskcc.org"
 # __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)"
@@ -6,22 +6,53 @@
 # __status__      = "Dev"
 
 
+suppressPackageStartupMessages({
+    library(dplyr)
+    library(stringr)
+})
 
-library(dplyr)
-library(stringr)
-library(argparse)
-
-opt = commandArgs(TRUE)
-
-parser=ArgumentParser()
-parser$add_argument("-p",'--primary_gtf',type="character",default = NULL,help = "Primary GTF, should match your bed file and arriba. Assumes ARRIBA is on primary gtf")
-parser$add_argument("-c",'--fc_custom_bed_gene_names',type="character",default = NULL,help = "Fusioncatcher custom genes bed file")
-parser$add_argument("-s",'--star_fusion_ref',type="character",default = NULL,help = "StarFusion GTF")
-parser$add_argument("-f",'--fusioncatcher_ref',type="character",default = NULL,help = "Fusioncatcher GTF")
-parser$add_argument("-o",'--outputDir',type="character",default = NULL,help = "outputDirectory to write gene_info and excess gene list")
-
-opt=parser$parse_args()
-
+usage <- function() {
+    message("Usage:")
+    message("make_gene_info_for_forte.R --primary_gtf <file.gtf> --fc_custom_bed_gene_names <custom_genes.bed> --star_fusion_ref <ref_annot.gtf> --fusioncatcher_ref <organism.gtf> --outputDir <outputpath>")
+}
+
+args <- commandArgs(TRUE)
+# Called with no arguments at all: print usage and exit non-zero.
+if (is.null(args) || length(args) < 1) {
+    usage()
+    quit(save = "no", status = 1)
+}
+
+#' Parse out options from a string without recourse to optparse
+#'
+#' @param x Long-form argument list like --opt1 val1 --opt2 val2
+#'
+#' @return named list of options and values similar to optparse
+
+parse_args <- function(x){
+    args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1]
+    args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE))
+    # Ensure the option vectors are length 2 (key/ value) to catch empty ones
+    args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z})
+
+    parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1])))
+    parsed_args[! is.na(parsed_args)]
+}
+
+opt <- parse_args(paste(args,collapse=" "))
+
+required_args <- c(
+    "primary_gtf",
+    "fc_custom_bed_gene_names",
+    "star_fusion_ref",
+    "fusioncatcher_ref",
+    "outputDir"
+)
+if (length(setdiff(required_args, names(opt))) > 0) {
+    message("Missing required arguments")
+    usage()
+    quit(save = "no", status = 1)  # exit non-zero so the calling process is marked failed
+}
 
 ### primary gtf is v75, also used in arriba
 primary_gtf <-  as.data.frame(rtracklayer::import(opt$primary_gtf))
@@ -59,7 +90,7 @@ versioned_gtf <-unlist(sapply(names(unique_id_to_names)[names(unique_id_to_names
 }))
 
 
-add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){
+add_these_excess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){
     add_symbols_and_ids <- unique_id_to_names[[name]]
     add_symbols_and_ids <- add_symbols_and_ids[!add_symbols_and_ids$gene_id %in% gene_info$gene_id,]
     if(name %in% versioned_gtf){
@@ -70,7 +101,7 @@ add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names
 
 }))
 # Excess genes being added (genes will be flagged as gene not in v75)
-gene_info <- rbind(gene_info,add_these_exess_gene_ids)
+gene_info <- rbind(gene_info,add_these_excess_gene_ids)
 
 gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F)
 
@@ -79,5 +110,18 @@ gene_info$Symbol <- gene_info$gene_name
 
 gene_info <- gene_info[,c("Symbol","Synonyms")]
 
-write.table(gene_info,paste0(opt$outputDir,"/gene.info"),sep ="\t",quote = F,row.names = F)
-write.table(add_these_exess_gene_ids,paste0(opt$outputDir,"/excess_gene_ids.txt",sep ="\t",quote = F,row.names = F)
+write.table(
+    gene_info,
+    paste0(opt$outputDir,"/gene.info"),
+    sep ="\t",
+    quote = F,
+    row.names = F
+
+)
+write.table(
+    add_these_excess_gene_ids,
+    paste0(opt$outputDir,"/excess_gene_ids.txt"),
+    sep ="\t",
+    quote = F,
+    row.names = F
+)
diff --git a/conf/igenomes.config b/conf/igenomes.config
index 9cea106..c618acc 100644
--- a/conf/igenomes.config
+++ b/conf/igenomes.config
@@ -31,8 +31,6 @@ params {
                 }
             }
             metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/blocklist_breakpoints.bedpe.gz"
-            metafusion_gene_bed  = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/v75_gene.bed.gz"
-            metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/gene_info_20230714.txt"
             ensembl_version = 75
         }
         'GRCh38' {
@@ -49,8 +47,6 @@ params {
             starfusion_url = null
             cdna           = "http://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/cdna/Sars_cov_2.ASM985889v3.cdna.all.fa.gz"
             metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/blocklist_breakpoints.bedpe"
-            metafusion_gene_bed  = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/v75_gene.bed"
-            metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/gene_info_20230714.txt"
             ensembl_version = 75
 
         }
diff --git a/conf/modules.config b/conf/modules.config
index 07afea1..144a14f 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -186,7 +186,7 @@ process {
         ]
     }
 
-    withName: METAFUSION {
+    withName: METAFUSION_RUN {
         ext.args = "--num_tools=${params.fusion_tool_cutoff}"
         publishDir = [
             path: { "$params.outdir/analysis/${meta.id}/metafusion/intermediates" },
@@ -196,6 +196,24 @@ process {
 
         ]
     }
+    withName: 'METAFUSION_GENEBED' {
+        storeDir = { "${params.reference_base}/${params.genome}/metafusion/genebed" }
+        publishDir = [
+            enabled: false,
+        ]
+    }
+    withName: 'METAFUSION_GENEINFO' {
+        storeDir = { "${params.reference_base}/${params.genome}/metafusion/geneinfo" }
+        publishDir = [
+            enabled: false,
+        ]
+    }
+    withName: 'AGAT_SPADDINTRONS' {
+        storeDir = { "${params.reference_base}/${params.genome}/metafusion/introns" }
+        publishDir = [
+            enabled: false,
+        ]
+    }
 
     withName: ADD_FLAG {
         publishDir = [
diff --git a/modules.json b/modules.json
index c43247b..fdb0a02 100644
--- a/modules.json
+++ b/modules.json
@@ -5,6 +5,11 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
+                    "agat/spaddintrons": {
+                        "branch": "master",
+                        "git_sha": "6898156da3604a6bdf26c36036053a970050fea0",
+                        "installed_by": ["modules"]
+                    },
                     "arriba": {
                         "branch": "master",
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
diff --git a/modules/local/metafusion/genebed/main.nf b/modules/local/metafusion/genebed/main.nf
new file mode 100644
index 0000000..5f8b95a
--- /dev/null
+++ b/modules/local/metafusion/genebed/main.nf
@@ -0,0 +1,47 @@
+process METAFUSION_GENEBED {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'ghcr.io/rocker-org/devcontainer/tidyverse:4':
+        'ghcr.io/rocker-org/devcontainer/tidyverse:4' }"
+
+    input:
+    tuple val(meta), path(gff)
+
+    output:
+    tuple val(meta), path("*.metafusion.gene.bed"), emit: metafusion_gene_bed
+    path "versions.yml"                           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    final_generate_v75_gene_bed.R \\
+        $gff \\
+        ${prefix}.metafusion.gene.bed
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        R: \$(R --version | head -n1)
+        final_generate_v75_gene_bed.R: 0.0.1
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.metafusion.gene.bed
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        R: \$(R --version | head -n1)
+        final_generate_v75_gene_bed.R: 0.0.1
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/metafusion/geneinfo/main.nf b/modules/local/metafusion/geneinfo/main.nf
new file mode 100644
index 0000000..e6eb2f9
--- /dev/null
+++ b/modules/local/metafusion/geneinfo/main.nf
@@ -0,0 +1,50 @@
+process METAFUSION_GENEINFO {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'ghcr.io/rocker-org/devcontainer/tidyverse:4':
+        'ghcr.io/rocker-org/devcontainer/tidyverse:4' }"
+
+    input:
+    tuple val(meta), path(gtf), path(starfusion_ref), path(fusioncatcher_ref)
+
+    output:
+    tuple val(meta), path("gene.info"), emit: metafusion_gene_info
+    path "versions.yml"               , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    make_gene_info_for_forte.R \\
+        --primary_gtf $gtf \\
+        --fc_custom_bed_gene_names $fusioncatcher_ref/custom_genes.bed \\
+        --star_fusion_ref $starfusion_ref/ref_annot.gtf \\
+        --fusioncatcher_ref $fusioncatcher_ref/organism.gtf \\
+        --outputDir .
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        R: \$(R --version | head -n1)
+        make_gene_info_for_forte.R: 0.0.1
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch gene.info
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        R: \$(R --version | head -n1)
+        make_gene_info_for_forte.R: 0.0.1
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/metafusion/main.nf b/modules/local/metafusion/run/main.nf
similarity index 98%
rename from modules/local/metafusion/main.nf
rename to modules/local/metafusion/run/main.nf
index 40b90ee..07382ee 100644
--- a/modules/local/metafusion/main.nf
+++ b/modules/local/metafusion/run/main.nf
@@ -1,4 +1,4 @@
-process METAFUSION {
+process METAFUSION_RUN {
     tag "$meta.id"
     label "process_low"
 
diff --git a/modules/nf-core/agat/spaddintrons/environment.yml b/modules/nf-core/agat/spaddintrons/environment.yml
new file mode 100644
index 0000000..6940adf
--- /dev/null
+++ b/modules/nf-core/agat/spaddintrons/environment.yml
@@ -0,0 +1,7 @@
+name: agat_spaddintrons
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - "bioconda::agat=1.2.0"
diff --git a/modules/nf-core/agat/spaddintrons/main.nf b/modules/nf-core/agat/spaddintrons/main.nf
new file mode 100644
index 0000000..b27289c
--- /dev/null
+++ b/modules/nf-core/agat/spaddintrons/main.nf
@@ -0,0 +1,51 @@
+process AGAT_SPADDINTRONS {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/agat:1.2.0--pl5321hdfd78af_0':
+        'biocontainers/agat:1.2.0--pl5321hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(gff)
+    path config
+
+    output:
+    tuple val(meta), path("${output}"), emit: gff
+    path "versions.yml"               , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def config_param = config ? "--config $config" : ""
+    def prefix = meta.id ?: gff.getBaseName()
+    output = "${prefix}.intron.gff"
+    """
+    agat_sp_add_introns.pl \\
+        --gff $gff \\
+        $config_param \\
+        --out $output \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        agat: \$(agat_sp_add_introns.pl --help | sed '3!d; s/.*v//')
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = meta.id ?: gff.getBaseName()
+    output = "${prefix}.intron.gff"
+    """
+    touch ${output}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        agat: \$(agat_sp_add_introns.pl --help | sed '3!d; s/.*v//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/agat/spaddintrons/meta.yml b/modules/nf-core/agat/spaddintrons/meta.yml
new file mode 100644
index 0000000..0d7ce8c
--- /dev/null
+++ b/modules/nf-core/agat/spaddintrons/meta.yml
@@ -0,0 +1,43 @@
+name: agat_spaddintrons
+description: Add intron features to gtf/gff file without intron features.
+keywords:
+  - gtf
+  - gff
+  - introns
+tools:
+  - agat:
+      description: "Another Gff Analysis Toolkit (AGAT). Suite of tools to handle gene annotations in any GTF/GFF format."
+      homepage: "https://agat.readthedocs.io/en/latest/"
+      documentation: "https://agat.readthedocs.io/en/latest/"
+      tool_dev_url: "https://github.com/NBISweden/AGAT"
+      licence: ["GPL v3"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - gff:
+      type: file
+      description: Input gtf/gff file
+      pattern: "*.{gff,gff3,gtf}"
+  - config:
+      type: file
+      description: Optional input agat config file
+      pattern: "*.{yaml,yml}"
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - gff:
+      type: file
+      description: Output gff3 file with introns
+      pattern: "*.gff"
+
+authors:
+  - "@anoronh4"
+maintainers:
+  - "@anoronh4"
diff --git a/modules/nf-core/agat/spaddintrons/tests/main.nf.test b/modules/nf-core/agat/spaddintrons/tests/main.nf.test
new file mode 100644
index 0000000..d5c5c72
--- /dev/null
+++ b/modules/nf-core/agat/spaddintrons/tests/main.nf.test
@@ -0,0 +1,61 @@
+nextflow_process {
+
+    name "Test Process AGAT_SPADDINTRONS"
+    script "../main.nf"
+    process "AGAT_SPADDINTRONS"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "agat"
+    tag "agat/spaddintrons"
+
+    test("homo_sapiens - gtf") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test' ],
+                    file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)
+                ]
+                input[1] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("homo_sapiens - gtf - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test' ],
+                    file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)
+                ]
+                input[1] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.gff.collect { file(it[1]).getName() } +
+                                process.out.versions ).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/agat/spaddintrons/tests/main.nf.test.snap b/modules/nf-core/agat/spaddintrons/tests/main.nf.test.snap
new file mode 100644
index 0000000..8be0484
--- /dev/null
+++ b/modules/nf-core/agat/spaddintrons/tests/main.nf.test.snap
@@ -0,0 +1,40 @@
+{
+    "homo_sapiens - gtf": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.intron.gff:md5,d4b5b806927c7ad2ad6146231f1c4def"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,b12cd5686fe12bbe48e1abedef784585"
+                ],
+                "gff": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.intron.gff:md5,d4b5b806927c7ad2ad6146231f1c4def"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,b12cd5686fe12bbe48e1abedef784585"
+                ]
+            }
+        ],
+        "timestamp": "2023-12-27T19:20:32.415824613"
+    },
+    "homo_sapiens - gtf - stub": {
+        "content": [
+            [
+                "test.intron.gff",
+                "versions.yml:md5,b12cd5686fe12bbe48e1abedef784585"
+            ]
+        ],
+        "timestamp": "2023-12-27T19:20:46.113839009"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/agat/spaddintrons/tests/tags.yml b/modules/nf-core/agat/spaddintrons/tests/tags.yml
new file mode 100644
index 0000000..a47bd9b
--- /dev/null
+++ b/modules/nf-core/agat/spaddintrons/tests/tags.yml
@@ -0,0 +1,2 @@
+agat/spaddintrons:
+  - "modules/nf-core/agat/spaddintrons/**"
diff --git a/subworkflows/local/fusion.nf b/subworkflows/local/fusion.nf
index 8c8bb3b..fcc02c4 100644
--- a/subworkflows/local/fusion.nf
+++ b/subworkflows/local/fusion.nf
@@ -9,7 +9,7 @@ include { TO_CFF as ARRIBA_TO_CFF           } from '../../modules/local/convert_
 include { TO_CFF as FUSIONCATCHER_TO_CFF    } from '../../modules/local/convert_to_cff/main'
 include { TO_CFF as STARFUSION_TO_CFF       } from '../../modules/local/convert_to_cff/main'
 include { CAT_CAT as MERGE_CFF              } from '../../modules/nf-core/cat/cat/main'
-include { METAFUSION                        } from '../../modules/local/metafusion/main'
+include { METAFUSION_RUN as METAFUSION      } from '../../modules/local/metafusion/run/main' // call sites in this file are not renamed by this patch
 include { ADD_FLAG                          } from '../../modules/local/add_flags/main'
 include { CFF_ANNOTATE as CFF_FINALIZE      } from '../../modules/local/cff_annotate/main'
 
@@ -25,6 +25,7 @@ workflow FUSION {
     agfusion_db
     pyensembl_cache
     gene_bed
+    gene_info
     blocklist
     arriba_blacklist
     arriba_known_fusions
@@ -34,7 +35,7 @@ workflow FUSION {
     ch_versions = Channel.empty()
     fasta = params.fasta
     //gene_bed = params.metafusion_gene_bed
-    gene_info = params.metafusion_gene_info
+    //gene_info = params.metafusion_gene_info
     //blocklist = params.metafusion_blocklist
 
     STAR_FOR_ARRIBA(
diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf
index 69895df..6995e6c 100644
--- a/subworkflows/local/prepare_references.nf
+++ b/subworkflows/local/prepare_references.nf
@@ -15,7 +15,9 @@ include { FUSIONCATCHER_DOWNLOAD         } from '../../modules/local/fusioncatch
 include { ARRIBA_DOWNLOAD                } from '../../modules/local/arriba/download/main'
 include { KALLISTO_INDEX                 } from '../../modules/nf-core/kallisto/index/main'
 include { AGFUSION_DOWNLOAD              } from '../../modules/local/agfusion/download/main'
-
+include { AGAT_SPADDINTRONS              } from '../../modules/nf-core/agat/spaddintrons/main'
+include { METAFUSION_GENEBED             } from '../../modules/local/metafusion/genebed/main'
+include { METAFUSION_GENEINFO            } from '../../modules/local/metafusion/geneinfo/main'
 
 workflow PREPARE_REFERENCES {
 
@@ -36,13 +38,6 @@ workflow PREPARE_REFERENCES {
         metafusion_blocklist = params.metafusion_blocklist
     }
 
-    if (params.metafusion_gene_bed.endsWith(".gz")){
-        GUNZIP_METAFUSIONGENEBED([[:],params.metafusion_gene_bed])
-        metafusion_gene_bed = GUNZIP_METAFUSIONGENEBED.out.gunzip.map{ it[1] }.first()
-    } else {
-        metafusion_gene_bed = params.metafusion_gene_bed
-    }
-
     STAR_GENOMEGENERATE(params.fasta,gtf)
     ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
     star_index = STAR_GENOMEGENERATE.out.index
@@ -87,11 +82,21 @@ workflow PREPARE_REFERENCES {
         fusioncatcher_ref = Channel.empty()
     }
 
-    //cosmic_usr = params.cosmic_usr ?: ""
-    //cosmic_passwd = params.cosmic_passwd ?: ""
-
     ARRIBA_DOWNLOAD()
 
+    AGAT_SPADDINTRONS(
+        [[:],gtf],
+        []
+    )
+
+    METAFUSION_GENEBED(
+        AGAT_SPADDINTRONS.out.gff
+    )
+
+    METAFUSION_GENEINFO(
+        [[:],gtf, starfusion_ref,fusioncatcher_ref]
+    )
+
     AGFUSION_DOWNLOAD(
         params.ensembl_version,
         params.genome
@@ -118,7 +123,8 @@ workflow PREPARE_REFERENCES {
     agfusion_db        = AGFUSION_DOWNLOAD.out.agfusion_db
     pyensembl_cache    = AGFUSION_DOWNLOAD.out.pyensembl_cache
     metafusion_blocklist = metafusion_blocklist
-    metafusion_gene_bed = metafusion_gene_bed
+    metafusion_gene_bed = METAFUSION_GENEBED.out.metafusion_gene_bed.map{ it[1] }.first() // drop meta: emit a bare path value as before
+    metafusion_gene_info = METAFUSION_GENEINFO.out.metafusion_gene_info.map{ it[1] }.first()
     arriba_blacklist   = ARRIBA_DOWNLOAD.out.blacklist
     arriba_known_fusions = ARRIBA_DOWNLOAD.out.known_fusions
     arriba_protein_domains = ARRIBA_DOWNLOAD.out.protein_domains
diff --git a/workflows/forte.nf b/workflows/forte.nf
index 432a86e..ea56c0a 100644
--- a/workflows/forte.nf
+++ b/workflows/forte.nf
@@ -135,6 +135,7 @@ workflow FORTE {
         PREPARE_REFERENCES.out.agfusion_db,
         PREPARE_REFERENCES.out.pyensembl_cache,
         PREPARE_REFERENCES.out.metafusion_gene_bed,
+        PREPARE_REFERENCES.out.metafusion_gene_info,
         PREPARE_REFERENCES.out.metafusion_blocklist,
         workflow.profile.toString().split(",").contains("test") ? [] : PREPARE_REFERENCES.out.arriba_blacklist,
         workflow.profile.toString().split(",").contains("test") ? [] : PREPARE_REFERENCES.out.arriba_known_fusions,