Skip to content

Commit

Permalink
add processes/scripts to generate metafusion gene.info file and gene.…
Browse files Browse the repository at this point in the history
…bed files
  • Loading branch information
anoronh4 committed Dec 29, 2023
1 parent 79f1bd2 commit b7f2c83
Show file tree
Hide file tree
Showing 17 changed files with 478 additions and 80 deletions.
106 changes: 66 additions & 40 deletions bin/final_generate_v75_gene_bed.R
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,47 +1,65 @@

#!/usr/local/bin/Rscript
# __author__ = "Alexandria Dymun"
# __email__ = "pintoa1@mskcc.org"
# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)"
# __version__ = "0.0.1"
# __status__ = "Dev"


suppressPackageStartupMessages({
library(plyr)
library(dplyr)
library(data.table)
library(stringr)
})

library(dplyr)
library(data.table)
library(stringr)
gtf <- rtracklayer::import('/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf')
gtf_df <- as.data.frame(gtf)
usage <- function() {
message("Usage:")
message("final_generate_v75_gene_bed.R <in.gff> <out.bed>")
}

args = commandArgs(TRUE)

if (length(args)!=2) {
usage()
quit()
}

# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75
# Add introns to gtf, convert to gff3
# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\
# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\
# /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\
# -o genes.INTRONS.gff3"
# gff2bed < genes.INTRONS.gff3 > genes.INTRONS.agat.bed

gtf <- rtracklayer::import(args[1])
gtf_df <- as.data.frame(gtf)

total.introns.bed <- fread(file="/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/genes.INTRONS.agat.bed", header = FALSE, stringsAsFactors = F, sep="\t", na.strings = "",data.table = F)
colnames(total.introns.bed) <- c("chr","start","end","gene_id","tmp","strand","gene_biotype","type","V9","description")
total.introns.bed$transcript_id <- gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"transcript_id=",n=2)[,2])
total.introns.bed$gene_name <-gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"gene_name=",n=2)[,2])
file.to_write <- args[2]

transcript_ids <- unique(total.introns.bed$transcript_id)
file.to_write <- "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/cleaned_metafusion_v75_gene.bed"
gtf_df <- gtf_df %>%
rename(
chr = seqnames
) %>%
select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>%
filter(type %in% c("exon","intron","UTR","CDS","cds","utr"))

if(file.exists(file.to_write) ) {file.remove(file.to_write)}

#START CLOCK: THE INDEXING TAKES A LONG TIME, LIKE 5 HOURS
#START CLOCK
ptm <- proc.time()
print(ptm)

# Index each transcript feature, incrementing when an intron is passed
## metafusion expects exon count 0 to (N(exons)-1)
## Forward strand: Exon 0 == Exon 1
### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT
for (id in transcript_ids){
transcript <- total.introns.bed[total.introns.bed$transcript_id == id,]

print(dim(gtf_df))
print(length(unique(gtf_df$transcript_id)))

modify_transcript <- function(transcript){

# Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another
if ("CDS" %in% transcript$type){
transcript <- transcript[!transcript$type == "exon",]
Expand All @@ -51,7 +69,7 @@ for (id in transcript_ids){
# Index features
idx <- 0
for (i in 1:nrow(transcript)){
transcript$idx [i]<- idx
transcript$idx[i]<- idx
if (transcript$type[i] == "intron"){
idx <- idx + 1
}
Expand All @@ -68,7 +86,8 @@ for (id in transcript_ids){
#Add "chr" prefix to chromosomes
transcript$chr <- sapply("chr", paste0, transcript$chr)
#Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later
if ("CDS" %in% unique(transcript$type)){transcript[transcript$type == "CDS",]$type <- "cds"}
transcript <- transcript %>% mutate(type = as.character(type))
transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type))
## DETERMING UTR3 and UTR5
### INSTEAD OF START AND STOP, USE CDS LOCATIONS AND STRAND INFORMATION.....
if ("UTR" %in% unique(transcript$type)){
Expand All @@ -79,35 +98,42 @@ for (id in transcript_ids){
transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr5"
transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr3"
}else {
#Reverse strand
start_coding <- max(transcript[transcript$type == "cds","end"])
stop_coding <- min(transcript[transcript$type == "cds","start"])
transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr3"
transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5"
}
}
transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )]
write.table(transcript, file.to_write, append=TRUE, sep="\t", quote=F, row.names=F, col.names=F)
#### Any exon that remains after teh cds change, is likely and untranslated region. change below

# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
#Forward strand
transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5"
#Reverse strand
transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3"
#transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )]
expected_types <- c("cds","intron","utr3","utr5")
transcript <- transcript[transcript$type %in% c(expected_types),]
return(transcript)
}

time <- proc.time() - ptm
time
#
# user system elapsed
# 16657.116 32.227 16741.382


new.bed <- fread(file.to_write,data.table = F)
colnames(new.bed) <- c("chr","start","end","transcript_id","type","idx","strand","gene_name","gene_id")

#### Any exon that remains after teh cds change, is likely and untranslated region. change below

# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
#Forward strand
new.bed$type[new.bed$strand == "f" & new.bed$type == "exon" ] <- "utr5"
#Reverse strand
new.bed$type[new.bed$strand == "r" & new.bed$type == "exon"]<- "utr3"
if(file.exists(file.to_write) ) {file.remove(file.to_write)}

expected_types <- c("cds","intron","utr3","utr5")
new.bed.ready <- new.bed[new.bed$type %in% c(expected_types),]
gtf_df_modified <- gtf_df %>%
group_by(transcript_id,.drop = FALSE) %>%
group_modify(~ modify_transcript(.x)) %>%
select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>%
arrange(chr,start,end)

write.table(new.bed.ready, "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/v75_gene.bed", sep="\t", quote=F, row.names=F, col.names=F)
time <- proc.time() - ptm
print(time)

write.table(
gtf_df_modified,
file.to_write,
sep="\t",
quote=F,
row.names=F,
col.names=F
)
84 changes: 64 additions & 20 deletions bin/make_gene_info_for_forte.R
Original file line number Diff line number Diff line change
@@ -1,27 +1,58 @@

#!/usr/local/bin/Rscript
# __author__ = "Alexandria Dymun"
# __email__ = "pintoa1@mskcc.org"
# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)"
# __version__ = "0.0.1"
# __status__ = "Dev"


suppressPackageStartupMessages({
library(dplyr)
library(stringr)
})

library(dplyr)
library(stringr)
library(argparse)

opt = commandArgs(TRUE)

parser=ArgumentParser()
parser$add_argument("-p",'--primary_gtf',type="character",default = NULL,help = "Primary GTF, should match your bed file and arriba. Assumes ARRIBA is on primary gtf")
parser$add_argument("-c",'--fc_custom_bed_gene_names',type="character",default = NULL,help = "Fusioncatcher custom genes bed file")
parser$add_argument("-s",'--star_fusion_ref',type="character",default = NULL,help = "StarFusion GTF")
parser$add_argument("-f",'--fusioncatcher_ref',type="character",default = NULL,help = "Fusioncatcher GTF")
parser$add_argument("-o",'--outputDir',type="character",default = NULL,help = "outputDirectory to write gene_info and excess gene list")

opt=parser$parse_args()

usage <- function() {
message("Usage:")
message("make_gene_info_for_forte.R --primary_gtf <file.gtf> --fc_custom_bed_gene_names <custom_genes.bed> --star_fusion_ref <ref_annot.gtf> --fusioncatcher_ref <organism.gtf> --outputDir <outputpath>")
}

args = commandArgs(TRUE)

if (is.null(args) | length(args)<1) {
usage()
quit()
}

#' Parse out options from a string without recourse to optparse
#'
#' @param x Long-form argument list like --opt1 val1 --opt2 val2
#'
#' @return named list of options and values similar to optparse

parse_args <- function(x){
args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1]
args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE))
# Ensure the option vectors are length 2 (key/ value) to catch empty ones
args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z})

parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1])))
parsed_args[! is.na(parsed_args)]
}

opt <- parse_args(paste(args,collapse=" "))

required_args <- c(
"primary_gtf",
"fc_custom_bed_gene_names",
"star_fusion_ref",
"fusioncatcher_ref",
"outputDir"
)
if (length(setdiff(required_args,names(opt))) > 0) {
message("Missing required arguments")
usage()
quit()
}

### primary gtf is v75, also used in arriba
primary_gtf <- as.data.frame(rtracklayer::import(opt$primary_gtf))
Expand Down Expand Up @@ -59,7 +90,7 @@ versioned_gtf <-unlist(sapply(names(unique_id_to_names)[names(unique_id_to_names
}))


add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){
add_these_excess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){
add_symbols_and_ids <- unique_id_to_names[[name]]
add_symbols_and_ids <- add_symbols_and_ids[!add_symbols_and_ids$gene_id %in% gene_info$gene_id,]
if(name %in% versioned_gtf){
Expand All @@ -70,7 +101,7 @@ add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names

}))
# Excess genes being added (genes will be flagged as gene not in v75)
gene_info <- rbind(gene_info,add_these_exess_gene_ids)
gene_info <- rbind(gene_info,add_these_excess_gene_ids)

gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F)

Expand All @@ -79,5 +110,18 @@ gene_info$Symbol <- gene_info$gene_name

gene_info <- gene_info[,c("Symbol","Synonyms")]

write.table(gene_info,paste0(opt$outputDir,"/gene.info"),sep ="\t",quote = F,row.names = F)
write.table(add_these_exess_gene_ids,paste0(opt$outputDir,"/excess_gene_ids.txt",sep ="\t",quote = F,row.names = F)
write.table(
gene_info,
paste0(opt$outputDir,"/gene.info"),
sep ="\t",
quote = F,
row.names = F

)
write.table(
add_these_excess_gene_ids,
paste0(opt$outputDir,"/excess_gene_ids.txt"),
sep ="\t",
quote = F,
row.names = F
)
4 changes: 0 additions & 4 deletions conf/igenomes.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ params {
}
}
metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/blocklist_breakpoints.bedpe.gz"
metafusion_gene_bed = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/v75_gene.bed.gz"
metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/gene_info_20230714.txt"
ensembl_version = 75
}
'GRCh38' {
Expand All @@ -49,8 +47,6 @@ params {
starfusion_url = null
cdna = "http://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/cdna/Sars_cov_2.ASM985889v3.cdna.all.fa.gz"
metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/blocklist_breakpoints.bedpe"
metafusion_gene_bed = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/v75_gene.bed"
metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/gene_info_20230714.txt"
ensembl_version = 75

}
Expand Down
20 changes: 19 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ process {
]
}

withName: METAFUSION {
withName: METAFUSION_RUN {
ext.args = "--num_tools=${params.fusion_tool_cutoff}"
publishDir = [
path: { "$params.outdir/analysis/${meta.id}/metafusion/intermediates" },
Expand All @@ -196,6 +196,24 @@ process {

]
}
withName: 'METAFUSION_GENEBED' {
storeDir = { "${params.reference_base}/${params.genome}/metafusion/genebed" }
publishDir = [
enabled: false,
]
}
withName: 'METAFUSION_GENEINFO' {
storeDir = { "${params.reference_base}/${params.genome}/metafusion/geneinfo" }
publishDir = [
enabled: false,
]
}
withName: 'AGAT_SPADDINTRONS' {
storeDir = { "${params.reference_base}/${params.genome}/metafusion/introns" }
publishDir = [
enabled: false,
]
}

withName: ADD_FLAG {
publishDir = [
Expand Down
5 changes: 5 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
"agat/spaddintrons": {
"branch": "master",
"git_sha": "6898156da3604a6bdf26c36036053a970050fea0",
"installed_by": ["modules"]
},
"arriba": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
Expand Down
47 changes: 47 additions & 0 deletions modules/local/metafusion/genebed/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
process METAFUSION_GENEBED {
tag "$meta.id"
label 'process_low'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'ghcr.io/rocker-org/devcontainer/tidyverse:4':
'ghcr.io/rocker-org/devcontainer/tidyverse:4' }"

input:
tuple val(meta), path(gff)

output:
tuple val(meta), path("*.metafusion.gene.bed"), emit: metafusion_gene_bed
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
final_generate_v75_gene_bed.R \\
$gff \\
${prefix}.metafusion.gene.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
R: \$(R --version | head -n1)
final_generate_v75_gene_bed.R: 0.0.1
END_VERSIONS
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch ${prefix}.metafusion.gene.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
R: \$(R --version | head -n1)
final_generate_v75_gene_bed.R: 0.0.1
END_VERSIONS
"""
}
Loading

0 comments on commit b7f2c83

Please # to comment.