Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
EC2 Default User committed Apr 3, 2024
1 parent e634817 commit 9e131c7
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 0 deletions.
139 changes: 139 additions & 0 deletions bin/final_generate_v75_gene_bed.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@

#!/usr/local/bin/Rscript
# __author__ = "Alexandria Dymun"
# __email__ = "pintoa1@mskcc.org"
# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)"
# __version__ = "0.0.1"
# __status__ = "Dev"


suppressPackageStartupMessages({
library(plyr)
library(dplyr)
library(data.table)
library(stringr)
})

usage <- function() {
message("Usage:")
message("final_generate_v75_gene_bed.R <in.gff> <out.bed>")
}

args = commandArgs(TRUE)

if (length(args)!=2) {
usage()
quit()
}

# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75
# Add introns to gtf, convert to gff3
# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\
# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\
# /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\
# -o genes.INTRONS.gff3"

gtf <- rtracklayer::import(args[1])
gtf_df <- as.data.frame(gtf)

file.to_write <- args[2]

gtf_df <- gtf_df %>%
rename(
chr = seqnames
) %>%
select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>%
filter(type %in% c("exon","intron","UTR","CDS","cds","utr"))


#START CLOCK
ptm <- proc.time()
print(ptm)

# Index each transcript feature, incrementing when an intron is passed
## metafusion expects exon count 0 to (N(exons)-1)
## Forward strand: Exon 0 == Exon 1
### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT

print(dim(gtf_df))
print(length(unique(gtf_df$transcript_id)))

modify_transcript <- function(transcript){

# Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another
if ("CDS" %in% transcript$type){
transcript <- transcript[!transcript$type == "exon",]
}
# Order features by increasing bp
transcript <- transcript[order(transcript$start, decreasing = FALSE),]
# Index features
idx <- 0
for (i in 1:nrow(transcript)){
transcript$idx[i]<- idx
if (transcript$type[i] == "intron"){
idx <- idx + 1
}
}
# REFORMAT TRANSCRIPT
#Change strand info (+ --> f, - --> r)
if (unique(transcript$strand) == "+"){
transcript$strand <- 'f'
} else if (unique(transcript$strand) == "-"){
transcript$strand <- 'r'
} else {
errorCondition("Strand info for this transcript is inconsistent")
}
#Add "chr" prefix to chromosomes
transcript$chr <- sapply("chr", paste0, transcript$chr)
#Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later
transcript <- transcript %>% mutate(type = as.character(type))
transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type))
## DETERMING UTR3 and UTR5
### INSTEAD OF START AND STOP, USE CDS LOCATIONS AND STRAND INFORMATION.....
if ("UTR" %in% unique(transcript$type)){
if( unique(transcript$strand) == "f"){
#Forward strand
start_coding <- min(transcript[transcript$type == "cds","start"])
stop_coding <- max(transcript[transcript$type == "cds","end"])
transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr5"
transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr3"
}else {
#Reverse strand
start_coding <- max(transcript[transcript$type == "cds","end"])
stop_coding <- min(transcript[transcript$type == "cds","start"])
transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr3"
transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5"
}
}
#### Any exon that remains after teh cds change, is likely and untranslated region. change below

# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
#Forward strand
transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5"
#Reverse strand
transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3"
#transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )]
expected_types <- c("cds","intron","utr3","utr5")
transcript <- transcript[transcript$type %in% c(expected_types),]
return(transcript)
}

if(file.exists(file.to_write) ) {file.remove(file.to_write)}

gtf_df_modified <- gtf_df %>%
group_by(transcript_id,.drop = FALSE) %>%
group_modify(~ modify_transcript(.x)) %>%
select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>%
arrange(chr,start,end)

time <- proc.time() - ptm
print(time)

write.table(
gtf_df_modified,
file.to_write,
sep="\t",
quote=F,
row.names=F,
col.names=F
)
8 changes: 8 additions & 0 deletions modules/local/prepare_rrna/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: gawk
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- anaconda::gawk=5.1.0
- bioconda::grep=3.4
6 changes: 6 additions & 0 deletions modules/local/prepare_rrna/main.nf
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
process PREPARE_RRNA {
label 'process_single'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
'docker.io/ubuntu:20.04' }"

input:
path(gtf)
Expand Down

0 comments on commit 9e131c7

Please # to comment.