Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Rmorin dev #21

Merged
merged 6 commits into from
Jul 13, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ export(tidy_gene_expression)
import(ComplexHeatmap)
import(DBI)
import(RMariaDB)
import(S4Vectors)
import(SRAdb)
import(circlize)
import(config)
Expand All @@ -68,5 +69,6 @@ import(dplyr)
import(ggrepel)
import(grid)
import(metaviz)
import(rtracklayer)
import(stats)
import(tidyverse)
102 changes: 74 additions & 28 deletions R/database.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
#' @param seq_type_filter Filtering criteria (default: all genomes)
#' @param tissue_status_filter Filtering criteria (default: only tumour genomes)
#' @param case_set optional short name for a pre-defined set of cases avoiding any
#' @param remove_benchmarking By default the FFPE benchmarking duplicate samples will be dropped
#' @param with_outcomes Optionally join to gambl outcome data
#' @param from_flatfile New default is to use the metadata in the flatfiles from your clone of the repo. Can be over-ridden to use the database
#' embargoed cases (current options: 'BLGSP-study', 'FL-DLBCL-study', 'DLBCL-unembargoed)
#'
#' @return A data frame with metadata for each biopsy in GAMBL
Expand All @@ -20,18 +23,34 @@
#' # override default filters and request metadata for samples other than tumour genomes, e.g. also get the normals
#' only_normal_metadata = get_gambl_metadata(tissue_status_filter = c('tumour','normal'))
get_gambl_metadata = function(seq_type_filter = "genome",
tissue_status_filter=c("tumour"), case_set, remove_benchmarking = TRUE, with_outcomes=FALSE){
db=config::get("database_name")
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
sample_meta = dplyr::tbl(con,"sample_metadata")
tissue_status_filter=c("tumour"),
case_set, remove_benchmarking = TRUE, with_outcomes=FALSE,from_flatfile=TRUE){
outcome_table = get_gambl_outcomes(from_flatfile=from_flatfile) %>% dplyr::select(-sex)

if(from_flatfile){
base = config::get("repo_base")
sample_flatfile = paste0(base,config::get("table_flatfiles")$samples)
sample_meta = read_tsv(sample_flatfile,guess_max=100000)
biopsy_flatfile = paste0(base,config::get("table_flatfiles")$biopsies)
biopsy_meta = read_tsv(biopsy_flatfile,guess_max=100000)

}else{
db=config::get("database_name")
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
sample_meta = dplyr::tbl(con,"sample_metadata") %>% as.data.frame()
biopsy_meta = dplyr::tbl(con,"biopsy_metadata") %>% as.data.frame()
DBI::dbDisconnect(con)
}
sample_meta_normal_genomes = sample_meta %>% dplyr::filter(seq_type == "genome" & tissue_status=="normal") %>%
dplyr::select(patient_id,sample_id) %>% as.data.frame() %>% dplyr::rename("normal_sample_id"="sample_id")
dplyr::select(patient_id,sample_id) %>% as.data.frame() %>% dplyr::rename("normal_sample_id"="sample_id")

sample_meta = sample_meta %>% dplyr::filter(seq_type == seq_type_filter & tissue_status %in% tissue_status_filter & bam_available == 1)
sample_meta = sample_meta %>% dplyr::filter(seq_type == seq_type_filter & tissue_status %in% tissue_status_filter & bam_available %in% c(1,"TRUE"))

#if we only care about genomes, we can drop/filter anything that isn't a tumour genome
#The key for joining this table to the mutation information is to use sample_id. Think of this as equivalent to a library_id. It will differ depending on what assay was done to the sample.
biopsy_meta = dplyr::tbl(con,"biopsy_metadata") %>% dplyr::select(-patient_id) %>% dplyr::select(-pathology) %>% dplyr::select(-time_point) %>% dplyr::select(-EBV_status_inf) #drop duplicated columns

biopsy_meta = biopsy_meta %>% dplyr::select(-patient_id) %>% dplyr::select(-pathology) %>% dplyr::select(-time_point) %>% dplyr::select(-EBV_status_inf) #drop duplicated columns

all_meta = dplyr::left_join(sample_meta,biopsy_meta,by="biopsy_id") %>% as.data.frame()
all_meta = all_meta %>% mutate(bcl2_ba=ifelse(bcl2_ba=="POS_BCC","POS",bcl2_ba))
Copy link

@bcollinge bcollinge Jul 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth cleaning up all the FISH columns here?

all_meta = all_meta %>% mutate_at(vars(starts_with(c("myc", "bcl2", "bcl6")) & ends_with(c("_ba", "_cn"))), 
          ~str_remove(., "_.*") %>% str_replace(., "^NORM$|^NOMR$", "NORMAL")) 

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good point but I think it should actually be done in GAMBL rather than GAMBLR so the metadata is clean before this package sees it. I suggest discussing how to tackle this with @lkhilton

if(seq_type_filter == "genome" & length(tissue_status_filter) == 1 & tissue_status_filter[1] == "tumour"){
Expand Down Expand Up @@ -176,12 +195,12 @@ get_gambl_metadata = function(seq_type_filter = "genome",
TRUE ~ 50
))
if(with_outcomes){
outcome_table = get_gambl_outcomes() %>% dplyr::select(-sex)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it looks like the outcome_table is dropped here, but below (line 199) used in left_join. Is there other place the outcomes are obtained from?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved this line to the top of the function. It's always called but not always joined. I changed the default to always join it because I see no reason why not to. Thoughts?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found it now, see it - line 28. Missed it at first

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it makes sense to always join with outcomes by default, so all are returned at once 👍


all_meta = left_join(all_meta,outcome_table,by="patient_id") %>%
mutate(age_group = case_when(cohort=="BL_Adult"~"Adult_BL",cohort=="BL_Pediatric" | cohort == "BL_ICGC" ~ "BL_Pediatric", TRUE ~ "Other"))

}
DBI::dbDisconnect(con)

return(all_meta)
}

Expand Down Expand Up @@ -252,10 +271,17 @@ add_icgc_metadata = function(incoming_metadata){
#'
#' @examples
#' outcome_df = get_gambl_outcomes()
get_gambl_outcomes = function(patient_ids,time_unit="year",censor_cbioportal=FALSE,complete_missing=FALSE){
db=config::get("database_name")
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
all_outcome = dplyr::tbl(con,"outcome_metadata") %>% as.data.frame()
get_gambl_outcomes = function(patient_ids,time_unit="year",censor_cbioportal=FALSE,complete_missing=FALSE,from_flatfile=FALSE){
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default for from_flatfile here is set to FALSE, but in the get_metadata default is TRUE. Do you want to default it to TRUE here as well, so the default values are consistent between functions?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't really matter because the only call to get_gambl_outcomes in the code is part of get_gambl_metadata and it passes the desired value of this variable along with it. In the long run I'd actually like to move away from using the database for these tables so I'll make the default false.

if(from_flatfile){
outcome_flatfile = paste0(config::get("repo_base"),config::get("table_flatfiles")$outcomes)
all_outcome = read_tsv(outcome_flatfile)

}else{
db=config::get("database_name")
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
all_outcome = dplyr::tbl(con,"outcome_metadata") %>% as.data.frame()
DBI::dbDisconnect(con)
}
if(!missing(patient_ids)){
all_outcome = all_outcome %>% dplyr::filter(patient_id %in% patient_ids)
if(complete_missing){
Expand Down Expand Up @@ -285,7 +311,7 @@ get_gambl_outcomes = function(patient_ids,time_unit="year",censor_cbioportal=FAL
all_outcome = all_outcome %>% mutate(all_outcome,DFS_MONTHS=PFS_MONTHS)
}
all_outcome = all_outcome %>% mutate(is_adult = ifelse(age < 20, "Pediatric","Adult"))
DBI::dbDisconnect(con)

return(all_outcome)
}

Expand Down Expand Up @@ -735,6 +761,7 @@ get_ssm_by_region = function(chromosome,qstart,qend,
#' @param exclude_cohort Supply this to exclude mutations from one or more cohorts in a list
#' @param limit_pathology Supply this to restrict mutations to one pathology
#' @param basic_columns Set to TRUE to override the default behaviour of returning only the first 45 columns of MAF data
#' @param from_flatfile Set to TRUE to obtain mutations from a local flatfile instead of the database. This can be more efficient and is currently the only option for users who do not have ICGC data access.
#'
#' @return A data frame containing all the MAF data columns (one row per mutation)
#' @export
Expand All @@ -744,18 +771,13 @@ get_ssm_by_region = function(chromosome,qstart,qend,
#' #basic usage
#' maf_data = get_coding_ssm(limit_cohort=c("BL_ICGC"))
#' maf_data = get_coding_ssm(limit_samples=my_sample_ids)
get_coding_ssm = function(limit_cohort,exclude_cohort,limit_pathology,limit_samples,basic_columns=TRUE){
table_name = config::get("results_tables")$ssm
db=config::get("database_name")
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
get_coding_ssm = function(limit_cohort,exclude_cohort,
limit_pathology,limit_samples,basic_columns=TRUE,
from_flatfile=FALSE,groups=c("gambl","icgc_dart")){
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also I think might be helpful to harmonize the defaults with other functions

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strongly disagree. Easier to explain in a conversation. Keeping the metadata in the database has proven to be very problematic due to the ongoing change in the order and number of columns. I'd like to always use the database for all functions except for metadata so the defaults shouldn't be harmonized.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, yes, the metadata is a subject to constant updates. I got it now that defaults for flat files should be different here

coding_class = c("Frame_Shift_Del","Frame_Shift_Ins","In_Frame_Del","In_Frame_Ins","Missense_Mutation","Nonsense_Mutation","Nonstop_Mutation","Silent","Splice_Region","Splice_Site","Targeted_Region","Translation_Start_Site")
sample_meta = dplyr::tbl(con,"sample_metadata") %>% dplyr::filter(seq_type == "genome" & tissue_status == "tumour")
biopsy_meta = dplyr::tbl(con,"biopsy_metadata") %>% dplyr::select(-patient_id) %>%
dplyr::select(-pathology) %>% dplyr::select(-time_point) %>% dplyr::select(-EBV_status_inf) #drop duplicated columns
all_meta = left_join(sample_meta,biopsy_meta,by="biopsy_id") %>%
as.data.frame()

all_meta= get_gambl_metadata(from_flatfile=from_flatfile)
#do all remaining filtering on the metadata then add the remaining sample_id to the query
all_meta = all_meta %>% dplyr::filter(unix_group %in% groups)
if(!missing(limit_cohort)){
all_meta = all_meta %>% dplyr::filter(cohort %in% limit_cohort)
}
Expand All @@ -769,15 +791,39 @@ get_coding_ssm = function(limit_cohort,exclude_cohort,limit_pathology,limit_samp
all_meta = all_meta %>% dplyr::filter(sample_id %in% limit_samples)
}
sample_ids = pull(all_meta,sample_id)
muts = tbl(con,table_name) %>%
dplyr::filter(Variant_Classification %in% coding_class) %>% as.data.frame()

if(from_flatfile){
base_path = config::get("project_base")
#test if we have permissions for the full gambl + icgc merge
maf_partial_path = config::get("results_filatfiles")$ssm$all$cds
maf_path = paste0(base_path,maf_partial_path)
maf_permissions = file.access(maf_path,4)
if(maf_permissions == -1){
#currently this will only return non-ICGC results
maf_partial_path = config::get("results_filatfiles")$ssm$gambl$cds
base_path = config::get("project_base")
#default is non-ICGC
maf_path = paste0(base_path,maf_partial_path)
}
muts=fread_maf(maf_path) %>% dplyr::filter(Variant_Classification %in% coding_class) %>% as.data.frame()
mutated_samples = length(unique(muts$Tumor_Sample_Barcode))
message(paste("mutations from",mutated_samples,"samples"))
}else{
table_name = config::get("results_tables")$ssm
db=config::get("database_name")
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
muts = tbl(con,table_name) %>%
dplyr::filter(Variant_Classification %in% coding_class) %>% as.data.frame()
DBI::dbDisconnect(con)
}
muts = muts %>%
dplyr::filter(Tumor_Sample_Barcode %in% sample_ids)

mutated_samples = length(unique(muts$Tumor_Sample_Barcode))
message(paste("after linking with metadata, we have mutations from",mutated_samples,"samples"))
if(basic_columns){
muts = muts[,c(1:45)]
}
DBI::dbDisconnect(con)

return(muts)
}

Expand Down
69 changes: 42 additions & 27 deletions R/preprocessing_io.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,26 @@

#' Populate the database with the per-sample summarized results of various tools
#'
#' @param sample_table A data frame with sample_id as the first column
#' @param tool Name of the tool to get the results for
#' @param base_directory_gambl
#' @param base_directory_other
#'
#' @return Nothing
#' @export
#' @import tidyverse DBI
#'
#' @examples
populate_tool_results = function(sample_table){
populate_tool_results = function(){
#IMPORTANT TODO: This function should only ever work with samples that exist in the metadata
# Perhaps it should report any excluded outputs in case they need to be deleted from the main output directories
matched_analyses = unlist(config::get("analyses")$matched)

print(matched_analyses)
database_name = config::get("database_name")

genome_builds = unlist(strsplit(config::get("genome_builds"),","))
groups= unlist(strsplit(config::get("unix_groups"),","))
for(analysis_type in names(matched_analyses)){
tool_name = matched_analyses[analysis_type]
populate_each_tool_result(tool_name,genome_builds,groups)
message(paste("populating results for",tool_name))
populate_each_tool_result(tool=tool_name,genome_builds,groups)
}
}

Expand Down Expand Up @@ -81,9 +81,9 @@ populate_each_tool_result = function(tool,genome_builds,unix_groups){
if(tool == "sequenza"){
parse_sequenza = function(sequenza_files){
seq_data=sequenza_files %>%
map(read_tsv) %>% #read each file into a list of tibbles
map(head,1) %>% #just keep the first line
reduce(rbind) %>% #rbind the elements all back into one
purrr::map(read_tsv) %>% #read each file into a list of tibbles
purrr::map(head,1) %>% #just keep the first line
purrr::reduce(rbind) %>% #rbind the elements all back into one
dplyr::rename(sequenza_cellularity=cellularity,sequenza_ploidy=ploidy) #change the column names
return(seq_data)
}
Expand Down Expand Up @@ -123,10 +123,13 @@ populate_each_tool_result = function(tool,genome_builds,unix_groups){
}
if(tool == "manta"){
#fetch the output file names per group/build combination

message("processing results from manta")
message(paste(unix_groups,sep=","))
#separately process by unix group
for(ug in unix_groups){
file_df = find_files_extract_wildcards(tool_name="manta",genome_build=c("hg38","grch37"),search_pattern=".bed",unix_group=ug)
files_df = find_files_extract_wildcards(tool_name="manta",genome_build=genome_builds,search_pattern=".bed",unix_group=ug)
print(head(files_df))
message(paste("processing",ug))
manta_df = process_all_manta_bedpe(files_df) #need to add this to the database. Not currently automated
}
}
Expand Down Expand Up @@ -239,13 +242,14 @@ populate_each_tool_result = function(tool,genome_builds,unix_groups){
#' @examples
read_merge_manta_with_liftover = function(bedpe_paths=c(),pattern="--matched",out_dir){
to_merge = list()
print(head(bedpe_paths))
for(thispath in bedpe_paths){
sample_paths = dir(thispath,pattern=pattern) #DEBUGGING
print(sample_paths)
#sample_paths = head(sample_paths,15) #for debugging
for(sp in sample_paths){
full_path = paste0(thispath,sp,"/somaticSV.bedpe")
print(paste("working on ", full_path))
print(paste("working on HERE:", full_path))
if(grepl("hg38",full_path)){
print("using liftOver")
svbed = liftover_bedpe(full_path) #load and convert to grch37 coordinates
Expand Down Expand Up @@ -315,20 +319,20 @@ process_all_manta_bedpe = function(file_df,out_dir){
this_patient = colnames(svbed)[23]
this_normal = colnames(svbed)[22]
out_file = paste0(out_dir,"/",this_patient,"--",this_normal,"--hg38Togrch37_sv.tsv")

message("working on OVER HERE:",bedpe_file)
if(file.exists(out_file)){
if(!only_return_missing){
print(paste("LOADING",out_file))
svbed = read_tsv(out_file,col_types = "ccccccccccccnnnnccc",col_names = cnames)
return(svbed)
}
else{
svbed = dplyr::filter(is.na(tumour_sample_id))
svbed = dplyr::filter(svbed,is.na(tumour_sample_id))
return(svbed)
}
}
if(liftover_to_hg19){
svbed = liftover_bedpe(bedpe_file=bedpe_file)
svbed = liftover_bedpe(bedpe_df=svbed)
}

infos = pull(svbed,this_patient)
Expand All @@ -343,6 +347,7 @@ process_all_manta_bedpe = function(file_df,out_dir){

svbed$tumour_sample_id = this_patient
svbed$normal_sample_id = this_normal
message(paste("checking status:",bedpe_file))
if(grepl("--unmatched",bedpe_file)){
svbed$pair_status = "unmatched"
}else{
Expand All @@ -354,20 +359,25 @@ process_all_manta_bedpe = function(file_df,out_dir){
#remove chr prefix from both chromosome names
svbed = svbed %>% mutate(CHROM_A = gsub("chr","",CHROM_A)) %>% mutate(CHROM_B = gsub("chr","",CHROM_B))
#print(paste("writing output to",out_file))

#run liftover after formatting?

write_tsv(svbed,out_file,col_names=FALSE)
return(svbed)
}

#separately run the hg38 and other builds
not_hg38_files = dplyr::filter(file_df,genome_build != "hg38") %>% pull(file_path)
bed_data_not_lifted =not_hg38_files %>%
map(process_manta) %>%
reduce(rbind)


hg38_files = dplyr::filter(file_df,genome_build == "hg38") %>% pull(file_path)
bed_data_lifted = hg38_files %>%
map(process_manta,liftover_to_hg19=TRUE) %>%
reduce(rbind)
purrr::map(process_manta,liftover_to_hg19=TRUE) %>%
purrr::reduce(rbind)

not_hg38_files = dplyr::filter(file_df,genome_build != "hg38") %>% pull(file_path)
bed_data_not_lifted =not_hg38_files %>%
purrr::map(process_manta) %>%
purrr::reduce(rbind)

}

Expand Down Expand Up @@ -475,9 +485,9 @@ find_files_extract_wildcards = function(tool_results_path,search_pattern,genome_
grepl("--unmatched",filename) ~ "unmatched",
TRUE ~"matched"
)) %>%
mutate(sample_id = strsplit(filename,"--")) %>%
dplyr::mutate(sample_id = strsplit(filename,"--")) %>%
unnest_wider(sample_id,names_sep = "-") %>% dplyr::rename(tumour_sample_id=`sample_id-1`,normal_sample_id=`sample_id-2`) %>%
mutate(tool_name=tool,seq_type=seq_type,unix_group=unix_group) %>%
dplyr::mutate(tool_name=tool_name,seq_type=seq_type,unix_group=unix_group) %>%
dplyr::select(tumour_sample_id,unix_group,tool_name,seq_type,genome_build,file_path,pairing_status,normal_sample_id)
return(found_files)
}
Expand Down Expand Up @@ -617,30 +627,35 @@ assemble_file_details = function(file_details_df,file_paths,tool_name,unix_group
#'
#' @return Data frame containing original bedpe data with new coordinates
#' @export
#' @import tidyverse
#' @import tidyverse rtracklayer S4Vectors
#'
#' @examples
#' hg19_sv = get_manta_sv() %>% head(100)
#' hg38_sv = liftover_bedpe(bedpe_df=hg19_sv,target_build="hg38")
liftover_bedpe = function(bedpe_file,bedpe_df,target_build="grch37",col_names,col_types){

if(!missing(bedpe_file)){
if(missing(col_names)){
message("imposing column names")
original_bedpe = read_tsv(bedpe_file,comment = "##",col_types="cddcddccccccccccccccccc")
}else{
message(paste("using column names",col_names,sep=": "))
original_bedpe = read_tsv(bedpe_file,col_names=col_names,col_types=col_types)
}
}else{
original_bedpe = bedpe_df

}
colnames(original_bedpe)[1]="CHROM_A"
if(!missing(bedpe_df)){
original_bedpe = bedpe_df
}
original_bedpe = as.data.frame(original_bedpe)
original_bedpe=original_bedpe %>% mutate_if(is.numeric, as.integer)
#print(head(original_bedpe))
if(!grepl("chr",original_bedpe$CHROM_A)){
#add chr prefix
original_bedpe = original_bedpe %>% mutate(CHROM_A = paste0("chr",CHROM_A)) %>%
mutate(CHROM_B = paste0("chr",CHROM_B))
}
print(head(original_bedpe))
char_vec = original_bedpe %>% tidyr::unite(united,sep="\t") %>% dplyr::pull(united)
bedpe_obj <- rtracklayer::import(text=char_vec,format="bedpe")
if(length(colnames(original_bedpe))>22){
Expand Down
Loading