-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Rmorin dev #21
Rmorin dev #21
Changes from 2 commits
94082a7
003c597
7fa5ead
461ee61
83b82b9
923b5f1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,9 @@ | |
#' @param seq_type_filter Filtering criteria (default: all genomes) | ||
#' @param tissue_status_filter Filtering criteria (default: only tumour genomes) | ||
#' @param case_set optional short name for a pre-defined set of cases avoiding any | ||
#' @param remove_benchmarking By default the FFPE benchmarking duplicate samples will be dropped | ||
#' @param with_outcomes Optionally join to gambl outcome data | ||
#' @param from_flatfile New default is to use the metadata in the flatfiles from your clone of the repo. Can be over-ridden to use the database | ||
#' embargoed cases (current options: 'BLGSP-study', 'FL-DLBCL-study', 'DLBCL-unembargoed') | ||
#' | ||
#' @return A data frame with metadata for each biopsy in GAMBL | ||
|
@@ -20,18 +23,34 @@ | |
#' # override default filters and request metadata for samples other than tumour genomes, e.g. also get the normals | ||
#' only_normal_metadata = get_gambl_metadata(tissue_status_filter = c('tumour','normal')) | ||
get_gambl_metadata = function(seq_type_filter = "genome", | ||
tissue_status_filter=c("tumour"), case_set, remove_benchmarking = TRUE, with_outcomes=FALSE){ | ||
db=config::get("database_name") | ||
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db) | ||
sample_meta = dplyr::tbl(con,"sample_metadata") | ||
tissue_status_filter=c("tumour"), | ||
case_set, remove_benchmarking = TRUE, with_outcomes=FALSE,from_flatfile=TRUE){ | ||
outcome_table = get_gambl_outcomes(from_flatfile=from_flatfile) %>% dplyr::select(-sex) | ||
|
||
if(from_flatfile){ | ||
base = config::get("repo_base") | ||
sample_flatfile = paste0(base,config::get("table_flatfiles")$samples) | ||
sample_meta = read_tsv(sample_flatfile,guess_max=100000) | ||
biopsy_flatfile = paste0(base,config::get("table_flatfiles")$biopsies) | ||
biopsy_meta = read_tsv(biopsy_flatfile,guess_max=100000) | ||
|
||
}else{ | ||
db=config::get("database_name") | ||
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db) | ||
sample_meta = dplyr::tbl(con,"sample_metadata") %>% as.data.frame() | ||
biopsy_meta = dplyr::tbl(con,"biopsy_metadata") %>% as.data.frame() | ||
DBI::dbDisconnect(con) | ||
} | ||
sample_meta_normal_genomes = sample_meta %>% dplyr::filter(seq_type == "genome" & tissue_status=="normal") %>% | ||
dplyr::select(patient_id,sample_id) %>% as.data.frame() %>% dplyr::rename("normal_sample_id"="sample_id") | ||
dplyr::select(patient_id,sample_id) %>% as.data.frame() %>% dplyr::rename("normal_sample_id"="sample_id") | ||
|
||
sample_meta = sample_meta %>% dplyr::filter(seq_type == seq_type_filter & tissue_status %in% tissue_status_filter & bam_available == 1) | ||
sample_meta = sample_meta %>% dplyr::filter(seq_type == seq_type_filter & tissue_status %in% tissue_status_filter & bam_available %in% c(1,"TRUE")) | ||
|
||
#if we only care about genomes, we can drop/filter anything that isn't a tumour genome | ||
#The key for joining this table to the mutation information is to use sample_id. Think of this as equivalent to a library_id. It will differ depending on what assay was done to the sample. | ||
biopsy_meta = dplyr::tbl(con,"biopsy_metadata") %>% dplyr::select(-patient_id) %>% dplyr::select(-pathology) %>% dplyr::select(-time_point) %>% dplyr::select(-EBV_status_inf) #drop duplicated columns | ||
|
||
biopsy_meta = biopsy_meta %>% dplyr::select(-patient_id) %>% dplyr::select(-pathology) %>% dplyr::select(-time_point) %>% dplyr::select(-EBV_status_inf) #drop duplicated columns | ||
|
||
all_meta = dplyr::left_join(sample_meta,biopsy_meta,by="biopsy_id") %>% as.data.frame() | ||
all_meta = all_meta %>% mutate(bcl2_ba=ifelse(bcl2_ba=="POS_BCC","POS",bcl2_ba)) | ||
if(seq_type_filter == "genome" & length(tissue_status_filter) == 1 & tissue_status_filter[1] == "tumour"){ | ||
|
@@ -176,12 +195,12 @@ get_gambl_metadata = function(seq_type_filter = "genome", | |
TRUE ~ 50 | ||
)) | ||
if(with_outcomes){ | ||
outcome_table = get_gambl_outcomes() %>% dplyr::select(-sex) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. it looks like the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I moved this line to the top of the function. It's always called but not always joined. I changed the default to always join it because I see no reason why not to. Thoughts? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Found it now, see it - line 28. Missed it at first. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it makes sense to always join with outcomes by default, so all are returned at once 👍 |
||
|
||
all_meta = left_join(all_meta,outcome_table,by="patient_id") %>% | ||
mutate(age_group = case_when(cohort=="BL_Adult"~"Adult_BL",cohort=="BL_Pediatric" | cohort == "BL_ICGC" ~ "BL_Pediatric", TRUE ~ "Other")) | ||
|
||
} | ||
DBI::dbDisconnect(con) | ||
|
||
return(all_meta) | ||
} | ||
|
||
|
@@ -252,10 +271,17 @@ add_icgc_metadata = function(incoming_metadata){ | |
#' | ||
#' @examples | ||
#' outcome_df = get_gambl_outcomes() | ||
get_gambl_outcomes = function(patient_ids,time_unit="year",censor_cbioportal=FALSE,complete_missing=FALSE){ | ||
db=config::get("database_name") | ||
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db) | ||
all_outcome = dplyr::tbl(con,"outcome_metadata") %>% as.data.frame() | ||
get_gambl_outcomes = function(patient_ids,time_unit="year",censor_cbioportal=FALSE,complete_missing=FALSE,from_flatfile=FALSE){ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The default for from_flatfile here is set to FALSE, but in get_gambl_metadata the default is TRUE. Do you want to default it to TRUE here as well, so the default values are consistent between functions? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Doesn't really matter because the only call to get_gambl_outcomes in the code is part of get_gambl_metadata and it passes the desired value of this variable along with it. In the long run I'd actually like to move away from using the database for these tables so I'll make the default false. |
||
if(from_flatfile){ | ||
outcome_flatfile = paste0(config::get("repo_base"),config::get("table_flatfiles")$outcomes) | ||
all_outcome = read_tsv(outcome_flatfile) | ||
|
||
}else{ | ||
db=config::get("database_name") | ||
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db) | ||
all_outcome = dplyr::tbl(con,"outcome_metadata") %>% as.data.frame() | ||
DBI::dbDisconnect(con) | ||
} | ||
if(!missing(patient_ids)){ | ||
all_outcome = all_outcome %>% dplyr::filter(patient_id %in% patient_ids) | ||
if(complete_missing){ | ||
|
@@ -285,7 +311,7 @@ get_gambl_outcomes = function(patient_ids,time_unit="year",censor_cbioportal=FAL | |
all_outcome = all_outcome %>% mutate(all_outcome,DFS_MONTHS=PFS_MONTHS) | ||
} | ||
all_outcome = all_outcome %>% mutate(is_adult = ifelse(age < 20, "Pediatric","Adult")) | ||
DBI::dbDisconnect(con) | ||
|
||
return(all_outcome) | ||
} | ||
|
||
|
@@ -735,6 +761,7 @@ get_ssm_by_region = function(chromosome,qstart,qend, | |
#' @param exclude_cohort Supply this to exclude mutations from one or more cohorts in a list | ||
#' @param limit_pathology Supply this to restrict mutations to one pathology | ||
#' @param basic_columns Set to FALSE to override the default behaviour of returning only the first 45 columns of MAF data | ||
#' @param from_flatfile Set to TRUE to obtain mutations from a local flatfile instead of the database. This can be more efficient and is currently the only option for users who do not have ICGC data access. | ||
#' | ||
#' @return A data frame containing all the MAF data columns (one row per mutation) | ||
#' @export | ||
|
@@ -744,18 +771,13 @@ get_ssm_by_region = function(chromosome,qstart,qend, | |
#' #basic usage | ||
#' maf_data = get_coding_ssm(limit_cohort=c("BL_ICGC")) | ||
#' maf_data = get_coding_ssm(limit_samples=my_sample_ids) | ||
get_coding_ssm = function(limit_cohort,exclude_cohort,limit_pathology,limit_samples,basic_columns=TRUE){ | ||
table_name = config::get("results_tables")$ssm | ||
db=config::get("database_name") | ||
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db) | ||
get_coding_ssm = function(limit_cohort,exclude_cohort, | ||
limit_pathology,limit_samples,basic_columns=TRUE, | ||
from_flatfile=FALSE,groups=c("gambl","icgc_dart")){ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, I think it might be helpful to harmonize the defaults with other functions. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Strongly disagree. Easier to explain in a conversation. Keeping the metadata in the database has proven to be very problematic due to the ongoing change in the order and number of columns. I'd like to always use the database for all functions except for metadata so the defaults shouldn't be harmonized. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see, yes, the metadata is subject to constant updates. I understand now that the defaults for flat files should be different here. |
||
coding_class = c("Frame_Shift_Del","Frame_Shift_Ins","In_Frame_Del","In_Frame_Ins","Missense_Mutation","Nonsense_Mutation","Nonstop_Mutation","Silent","Splice_Region","Splice_Site","Targeted_Region","Translation_Start_Site") | ||
sample_meta = dplyr::tbl(con,"sample_metadata") %>% dplyr::filter(seq_type == "genome" & tissue_status == "tumour") | ||
biopsy_meta = dplyr::tbl(con,"biopsy_metadata") %>% dplyr::select(-patient_id) %>% | ||
dplyr::select(-pathology) %>% dplyr::select(-time_point) %>% dplyr::select(-EBV_status_inf) #drop duplicated columns | ||
all_meta = left_join(sample_meta,biopsy_meta,by="biopsy_id") %>% | ||
as.data.frame() | ||
|
||
all_meta= get_gambl_metadata(from_flatfile=from_flatfile) | ||
#do all remaining filtering on the metadata then add the remaining sample_id to the query | ||
all_meta = all_meta %>% dplyr::filter(unix_group %in% groups) | ||
if(!missing(limit_cohort)){ | ||
all_meta = all_meta %>% dplyr::filter(cohort %in% limit_cohort) | ||
} | ||
|
@@ -769,15 +791,39 @@ get_coding_ssm = function(limit_cohort,exclude_cohort,limit_pathology,limit_samp | |
all_meta = all_meta %>% dplyr::filter(sample_id %in% limit_samples) | ||
} | ||
sample_ids = pull(all_meta,sample_id) | ||
muts = tbl(con,table_name) %>% | ||
dplyr::filter(Variant_Classification %in% coding_class) %>% as.data.frame() | ||
|
||
if(from_flatfile){ | ||
base_path = config::get("project_base") | ||
#test if we have permissions for the full gambl + icgc merge | ||
maf_partial_path = config::get("results_filatfiles")$ssm$all$cds | ||
maf_path = paste0(base_path,maf_partial_path) | ||
maf_permissions = file.access(maf_path,4) | ||
if(maf_permissions == -1){ | ||
#currently this will only return non-ICGC results | ||
maf_partial_path = config::get("results_filatfiles")$ssm$gambl$cds | ||
base_path = config::get("project_base") | ||
#default is non-ICGC | ||
maf_path = paste0(base_path,maf_partial_path) | ||
} | ||
muts=fread_maf(maf_path) %>% dplyr::filter(Variant_Classification %in% coding_class) %>% as.data.frame() | ||
mutated_samples = length(unique(muts$Tumor_Sample_Barcode)) | ||
message(paste("mutations from",mutated_samples,"samples")) | ||
}else{ | ||
table_name = config::get("results_tables")$ssm | ||
db=config::get("database_name") | ||
con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db) | ||
muts = tbl(con,table_name) %>% | ||
dplyr::filter(Variant_Classification %in% coding_class) %>% as.data.frame() | ||
DBI::dbDisconnect(con) | ||
} | ||
muts = muts %>% | ||
dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) | ||
|
||
mutated_samples = length(unique(muts$Tumor_Sample_Barcode)) | ||
message(paste("after linking with metadata, we have mutations from",mutated_samples,"samples")) | ||
if(basic_columns){ | ||
muts = muts[,c(1:45)] | ||
} | ||
DBI::dbDisconnect(con) | ||
|
||
return(muts) | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it worth cleaning up all the FISH columns here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a good point but I think it should actually be done in GAMBL rather than GAMBLR so the metadata is clean before this package sees it. I suggest discussing how to tackle this with @lkhilton