Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Update to handle capture data #60

Merged
merged 21 commits into from
Mar 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 24 additions & 8 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,33 @@ Suggests:
testthat
biocViews:
Imports:
metaviz,
RMariaDB,
broom,
circlize,
ComplexHeatmap,
config,
cowplot,
data.table,
DBI,
maftools,
ggbeeswarm,
SRAdb,
dbplyr,
dplyr,
g3viz,
GenomeInfoDb,
circlize,
ggbeeswarm,
ggplot2,
ggrepel,
ggsci,
ggthemes,
grid,
maftools,
metaviz,
reshape2,
RMariaDB,
rtracklayer,
config,
tidyverse
S4Vectors,
SRAdb,
stats,
tidyverse,
workflowr
VignetteBuilder: knitr
Depends:
R (>= 2.10)
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ export(get_cn_states)
export(get_coding_ssm)
export(get_coding_ssm_status)
export(get_combined_sv)
export(get_excluded_samples)
export(get_gambl_colours)
export(get_gambl_metadata)
export(get_gambl_outcomes)
Expand All @@ -50,6 +51,7 @@ export(get_merged_result)
export(get_mutation_frequency_bin_matrix)
export(get_sample_cn_segments)
export(get_ssm_by_gene)
export(get_ssm_by_patients)
export(get_ssm_by_region)
export(get_ssm_by_regions)
export(get_ssm_by_sample)
Expand Down Expand Up @@ -82,6 +84,7 @@ export(sv_to_custom_track)
export(theme_Morons)
export(tidy_gene_expression)
export(trim_scale_expression)
export(web_initialize_gambl_site)
import(ComplexHeatmap)
import(DBI)
import(RMariaDB)
Expand All @@ -105,3 +108,4 @@ import(reshape2)
import(rtracklayer)
import(stats)
import(tidyverse)
import(workflowr)
89 changes: 62 additions & 27 deletions R/annotation.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,57 +2,92 @@
#' Annotate and auto-drop a MAF data frame with existing blacklists to remove variants that would be dropped during the merge process
#'
#' @param mutations_df
#' @param seq_type The seq_type of your mutations if you prefer to apply only the corresponding blacklist (default is to use all available)
#' @param unix_group
#' @param tool_name
#' @param flavour Set to "clustered" if you want to use the blacklist for the new and improved SLMS-3 outputs (otherwise leave empty)
#' @param genome_build The genome build projection for the variants you want (grch37 is the only one currently supported)
#' @param genome_build The genome build projection for the variants you are working with (default is grch37)
#' @param project_base Optional: A full path to the directory that your blacklist_file_pattern is relative to
#' @param blacklist_file_pattern Optional: A string that contains the relative path to your blacklist file from after the project_base (i.e. results) with any wildcards surrounded with curly braces
#' @param drop_threshold The minimum count from one of the blacklists to drop a variant
#' @param verbose For debugging, print out a bunch of possibly useful information
#' @param invert USE WITH CAUTION! This returns only the variants that would be dropped in the process (opposite of what you want, probably)
#'
#' @return A MAF format data frame with two new columns indicating the number of occurrences of each variant in the two blacklists
#' @export
#'
#' @examples deblacklisted_maf_df = annotate_ssm_blacklist(original_maf_df)
annotate_ssm_blacklist = function(mutations_df,
unix_group="gambl",
seq_type,
tool_name="slms_3",
tool_version="1.0",
annotator_name="vcf2maf",
annotator_version="1.2",
flavour="",
genome_build="grch37",
project_base,
blacklist_file_template,
drop_threshold = 4,
return_blacklist = FALSE){
if(flavour=="clustered"){
native_blacklist_path = paste0(config::get("project_base"),unix_group,"/",tool_name,"-",tool_version,"_",annotator_name,"-",annotator_version,"/level_3/variants_",genome_build,"_native_clean_blacklist.txt")
lifted_blacklist_path = paste0(config::get("project_base"),unix_group,"/",tool_name,"-",tool_version,"_",annotator_name,"-",annotator_version,"/level_3/variants_",genome_build, "_lifted_clean_blacklist.txt")
return_blacklist = FALSE,
verbose = FALSE,
invert = FALSE){
if(missing(seq_type)){
message("User must specify seq_type of the mutations to select the right blacklist file. More than one seq_type can be specified as a vector if desired.")
return()
}
projection = genome_build
if(missing(blacklist_file_template)){
blacklist_template = config::get("resources")$blacklist$template
}else{
annotator_version="1.2"
native_blacklist_path = paste0(config::get("project_base"),unix_group,"/",tool_name,"-",tool_version,"_",annotator_name,"-",annotator_version,"/level_3/variants_",genome_build,"_native_clean_blacklist.txt")
lifted_blacklist_path = paste0(config::get("project_base"),unix_group,"/",tool_name,"-",tool_version,"_",annotator_name,"-",annotator_version,"/level_3/variants_",genome_build, "_lifted_clean_blacklist.txt")
blacklist_template = blacklist_file_template
}
lifted_blacklist=read_tsv(lifted_blacklist_path,col_names = c("chrpos","lifted_blacklist_count"),show_col_types = FALSE)
native_blacklist=read_tsv(native_blacklist_path,col_names = c("chrpos","native_blacklist_count"),show_col_types = FALSE)
native_blacklist = native_blacklist %>% separate(chrpos,into=c("Chromosome","Start_Position"),sep=":")
lifted_blacklist = lifted_blacklist %>% separate(chrpos,into=c("Chromosome","Start_Position"),sep=":")

lifted_blacklist = mutate(lifted_blacklist,Start_Position = as.integer(Start_Position))
native_blacklist = mutate(native_blacklist,Start_Position = as.integer(Start_Position))
if(missing(project_base)){
project_base = config::get("project_base")
}
blacklist_files = glue(blacklist_template)
blacklist_list = list()
for(b in blacklist_files){
full_path = paste0(project_base,b)
lifted_blacklist=read_tsv(full_path,col_names = c("chrpos","blacklist_count"),show_col_types = FALSE)
lifted_blacklist = lifted_blacklist %>% separate(chrpos,into=c("Chromosome","Start_Position"),sep=":")
blacklist_list[[b]] = lifted_blacklist
}
combined_blacklist = do.call("rbind",blacklist_list)
# Collapse variant counts per Start_Position
combined_blacklist = mutate(combined_blacklist,Start_Position = as.integer(Start_Position)) %>%
group_by(Start_Position, Chromosome) %>%
summarize(blacklist_count = sum(blacklist_count)) %>%
ungroup()
if(return_blacklist){
return(native_blacklist)
return(combined_blacklist)
}
#join using chromosome and position
print(mutations_df)
print(native_blacklist)
if(genome_build == "hg38"){
native_blacklist = mutate(native_blacklist,Chromosome = paste0("chr",Chromosome))
lifted_blacklist = mutate(lifted_blacklist,Chromosome = paste0("chr",Chromosome))
if(verbose){
print(head(mutations_df))
print(head(combined_blacklist))
}
if(str_detect(mutations_df$Chromosome, "chr")[1]){
combined_blacklist = mutate(combined_blacklist,Chromosome = paste0("chr",Chromosome))

}
mutations_df = left_join(mutations_df,combined_blacklist,by=c("Chromosome", "Start_Position")) %>%
mutate(blacklist_count = replace_na(blacklist_count, 0))


dropped = dplyr::filter(mutations_df,blacklist_count > drop_threshold)
if(verbose){
if(length(dropped) > 0 ){
ndrop = length(dropped$Tumor_Sample_Barcode)
message(paste(ndrop,"variants were dropped"))
} else {
message("0 variants were dropped")
}

}
mutations_df = left_join(mutations_df,native_blacklist,by=c("Chromosome", "Start_Position"))
mutations_df = left_join(mutations_df,lifted_blacklist,by=c("Chromosome", "Start_Position"))
#drop anything that exceeds our threshold but keep NA
mutations_df = dplyr::filter(mutations_df,is.na(native_blacklist_count) | native_blacklist_count < drop_threshold)
mutations_df = dplyr::filter(mutations_df,is.na(lifted_blacklist_count) | lifted_blacklist_count < drop_threshold)
mutations_df = dplyr::filter(mutations_df,is.na(blacklist_count) | blacklist_count < drop_threshold)
if(invert){
return(dropped)
}
return(mutations_df)
}

Expand Down
Loading