morinlab · rdmorin · Jul 13, 2021 · Jul 5, 2021 · Jul 5, 2021 · Jul 6, 2021
diff --git a/NAMESPACE b/NAMESPACE
@@ -59,6 +59,7 @@ export(tidy_gene_expression)
 import(ComplexHeatmap)
 import(DBI)
 import(RMariaDB)
+import(S4Vectors)
 import(SRAdb)
 import(circlize)
 import(config)
@@ -68,5 +69,6 @@ import(dplyr)
 import(ggrepel)
 import(grid)
 import(metaviz)
+import(rtracklayer)
 import(stats)
 import(tidyverse)
diff --git a/R/database.R b/R/database.R
@@ -6,6 +6,9 @@
 #' @param seq_type_filter Filtering criteria (default: all genomes)
 #' @param tissue_status_filter Filtering criteria (default: only tumour genomes)
 #' @param case_set optional short name for a pre-defined set of cases avoiding any
+#' @param remove_benchmarking By default the FFPE benchmarking duplicate samples will be dropped
+#' @param with_outcomes Optionally join to gambl outcome data
+#' @param from_flatfile New default is to use the metadata in the flatfiles from your clone of the repo. Can be over-ridden to use the database
 #' embargoed cases (current options: 'BLGSP-study', 'FL-DLBCL-study', 'DLBCL-unembargoed)
 #'
 #' @return A data frame with metadata for each biopsy in GAMBL
@@ -20,18 +23,34 @@
 #' # override default filters and request metadata for samples other than tumour genomes, e.g. also get the normals
 #' only_normal_metadata = get_gambl_metadata(tissue_status_filter = c('tumour','normal'))
 get_gambl_metadata = function(seq_type_filter = "genome",
-                              tissue_status_filter=c("tumour"), case_set, remove_benchmarking = TRUE, with_outcomes=FALSE){
-  db=config::get("database_name")
-  con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
-  sample_meta = dplyr::tbl(con,"sample_metadata")
+                              tissue_status_filter=c("tumour"),
+                              case_set, remove_benchmarking = TRUE, with_outcomes=FALSE,from_flatfile=TRUE){
+  outcome_table = get_gambl_outcomes(from_flatfile=from_flatfile) %>% dplyr::select(-sex)
+
+  if(from_flatfile){
+    base = config::get("repo_base")
+    sample_flatfile = paste0(base,config::get("table_flatfiles")$samples)
+    sample_meta = read_tsv(sample_flatfile,guess_max=100000)
+    biopsy_flatfile = paste0(base,config::get("table_flatfiles")$biopsies)
+    biopsy_meta = read_tsv(biopsy_flatfile,guess_max=100000)
+
+  }else{
+    db=config::get("database_name")
+    con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
+    sample_meta = dplyr::tbl(con,"sample_metadata") %>% as.data.frame()
+    biopsy_meta = dplyr::tbl(con,"biopsy_metadata") %>% as.data.frame()
+    DBI::dbDisconnect(con)
+  }
   sample_meta_normal_genomes =  sample_meta %>% dplyr::filter(seq_type == "genome" & tissue_status=="normal") %>%
-    dplyr::select(patient_id,sample_id) %>% as.data.frame() %>% dplyr::rename("normal_sample_id"="sample_id")
+      dplyr::select(patient_id,sample_id) %>% as.data.frame() %>% dplyr::rename("normal_sample_id"="sample_id")
 
-  sample_meta = sample_meta %>% dplyr::filter(seq_type == seq_type_filter & tissue_status %in% tissue_status_filter & bam_available == 1)
+  sample_meta = sample_meta %>% dplyr::filter(seq_type == seq_type_filter & tissue_status %in% tissue_status_filter & bam_available %in% c(1,"TRUE"))
 
   #if we only care about genomes, we can drop/filter anything that isn't a tumour genome
   #The key for joining this table to the mutation information is to use sample_id. Think of this as equivalent to a library_id. It will differ depending on what assay was done to the sample.
-  biopsy_meta = dplyr::tbl(con,"biopsy_metadata") %>% dplyr::select(-patient_id) %>% dplyr::select(-pathology) %>% dplyr::select(-time_point) %>% dplyr::select(-EBV_status_inf) #drop duplicated columns
+
+  biopsy_meta = biopsy_meta %>% dplyr::select(-patient_id) %>% dplyr::select(-pathology) %>% dplyr::select(-time_point) %>% dplyr::select(-EBV_status_inf) #drop duplicated columns
+
   all_meta = dplyr::left_join(sample_meta,biopsy_meta,by="biopsy_id") %>% as.data.frame()
   all_meta = all_meta %>% mutate(bcl2_ba=ifelse(bcl2_ba=="POS_BCC","POS",bcl2_ba))
   if(seq_type_filter == "genome" & length(tissue_status_filter) == 1 & tissue_status_filter[1] == "tumour"){
@@ -176,12 +195,12 @@ get_gambl_metadata = function(seq_type_filter = "genome",
     TRUE ~ 50
   ))
   if(with_outcomes){
-    outcome_table = get_gambl_outcomes() %>% dplyr::select(-sex)
+
     all_meta = left_join(all_meta,outcome_table,by="patient_id") %>%
       mutate(age_group = case_when(cohort=="BL_Adult"~"Adult_BL",cohort=="BL_Pediatric" | cohort == "BL_ICGC" ~ "BL_Pediatric", TRUE ~ "Other"))
 
   }
-  DBI::dbDisconnect(con)
+
   return(all_meta)
 }
 
@@ -252,10 +271,17 @@ add_icgc_metadata = function(incoming_metadata){
 #'
 #' @examples
 #' outcome_df = get_gambl_outcomes()
-get_gambl_outcomes = function(patient_ids,time_unit="year",censor_cbioportal=FALSE,complete_missing=FALSE){
-  db=config::get("database_name")
-  con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
-  all_outcome = dplyr::tbl(con,"outcome_metadata") %>% as.data.frame()
+get_gambl_outcomes = function(patient_ids,time_unit="year",censor_cbioportal=FALSE,complete_missing=FALSE,from_flatfile=FALSE){
+  if(from_flatfile){
+    outcome_flatfile = paste0(config::get("repo_base"),config::get("table_flatfiles")$outcomes)
+    all_outcome = read_tsv(outcome_flatfile)
+
+  }else{
+    db=config::get("database_name")
+    con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
+    all_outcome = dplyr::tbl(con,"outcome_metadata") %>% as.data.frame()
+    DBI::dbDisconnect(con)
+  }
   if(!missing(patient_ids)){
     all_outcome = all_outcome %>% dplyr::filter(patient_id %in% patient_ids)
     if(complete_missing){
@@ -285,7 +311,7 @@ get_gambl_outcomes = function(patient_ids,time_unit="year",censor_cbioportal=FAL
     all_outcome = all_outcome %>% mutate(all_outcome,DFS_MONTHS=PFS_MONTHS)
   }
   all_outcome = all_outcome %>% mutate(is_adult = ifelse(age < 20, "Pediatric","Adult"))
-  DBI::dbDisconnect(con)
+
   return(all_outcome)
 }
 
@@ -735,6 +761,7 @@ get_ssm_by_region = function(chromosome,qstart,qend,
 #' @param exclude_cohort  Supply this to exclude mutations from one or more cohorts in a list
 #' @param limit_pathology Supply this to restrict mutations to one pathology
 #' @param basic_columns Set to TRUE to override the default behaviour of returning only the first 45 columns of MAF data
+#' @param from_flatfile Set to TRUE to obtain mutations from a local flatfile instead of the database. This can be more efficient and is currently the only option for users who do not have ICGC data access.
 #'
 #' @return A data frame containing all the MAF data columns (one row per mutation)
 #' @export
@@ -744,18 +771,13 @@ get_ssm_by_region = function(chromosome,qstart,qend,
 #' #basic usage
 #' maf_data = get_coding_ssm(limit_cohort=c("BL_ICGC"))
 #' maf_data = get_coding_ssm(limit_samples=my_sample_ids)
-get_coding_ssm = function(limit_cohort,exclude_cohort,limit_pathology,limit_samples,basic_columns=TRUE){
-  table_name = config::get("results_tables")$ssm
-  db=config::get("database_name")
-  con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
+get_coding_ssm = function(limit_cohort,exclude_cohort,
+                          limit_pathology,limit_samples,basic_columns=TRUE,
+                          from_flatfile=FALSE,groups=c("gambl","icgc_dart")){
   coding_class = c("Frame_Shift_Del","Frame_Shift_Ins","In_Frame_Del","In_Frame_Ins","Missense_Mutation","Nonsense_Mutation","Nonstop_Mutation","Silent","Splice_Region","Splice_Site","Targeted_Region","Translation_Start_Site")
-  sample_meta = dplyr::tbl(con,"sample_metadata") %>% dplyr::filter(seq_type == "genome" & tissue_status == "tumour")
-  biopsy_meta = dplyr::tbl(con,"biopsy_metadata") %>% dplyr::select(-patient_id) %>%
-    dplyr::select(-pathology) %>% dplyr::select(-time_point) %>% dplyr::select(-EBV_status_inf) #drop duplicated columns
-    all_meta = left_join(sample_meta,biopsy_meta,by="biopsy_id") %>%
-    as.data.frame()
-
+  all_meta= get_gambl_metadata(from_flatfile=from_flatfile)
   #do all remaining filtering on the metadata then add the remaining sample_id to the query
+  all_meta = all_meta %>% dplyr::filter(unix_group %in% groups)
   if(!missing(limit_cohort)){
     all_meta = all_meta %>% dplyr::filter(cohort %in% limit_cohort)
   }
@@ -769,15 +791,39 @@ get_coding_ssm = function(limit_cohort,exclude_cohort,limit_pathology,limit_samp
     all_meta = all_meta %>% dplyr::filter(sample_id %in% limit_samples)
   }
   sample_ids = pull(all_meta,sample_id)
-  muts = tbl(con,table_name) %>%
-    dplyr::filter(Variant_Classification %in% coding_class) %>% as.data.frame()
+
+  if(from_flatfile){
+    base_path = config::get("project_base")
+    #test if we have permissions for the full gambl + icgc merge
+    maf_partial_path = config::get("results_filatfiles")$ssm$all$cds
+    maf_path = paste0(base_path,maf_partial_path)
+    maf_permissions = file.access(maf_path,4)
+    if(maf_permissions == -1){
+      #currently this will only return non-ICGC results
+      maf_partial_path = config::get("results_filatfiles")$ssm$gambl$cds
+      base_path = config::get("project_base")
+      #default is non-ICGC
+      maf_path = paste0(base_path,maf_partial_path)
+    }
+    muts=fread_maf(maf_path) %>% dplyr::filter(Variant_Classification %in% coding_class) %>% as.data.frame()
+    mutated_samples = length(unique(muts$Tumor_Sample_Barcode))
+    message(paste("mutations from",mutated_samples,"samples"))
+  }else{
+    table_name = config::get("results_tables")$ssm
+    db=config::get("database_name")
+    con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = db)
+    muts = tbl(con,table_name) %>%
+      dplyr::filter(Variant_Classification %in% coding_class) %>% as.data.frame()
+    DBI::dbDisconnect(con)
+  }
   muts = muts %>%
     dplyr::filter(Tumor_Sample_Barcode %in% sample_ids)
-
+  mutated_samples = length(unique(muts$Tumor_Sample_Barcode))
+  message(paste("after linking with metadata, we have mutations from",mutated_samples,"samples"))
   if(basic_columns){
     muts = muts[,c(1:45)]
   }
-  DBI::dbDisconnect(con)
+
   return(muts)
 }
 

diff --git a/R/preprocessing_io.R b/R/preprocessing_io.R
@@ -3,26 +3,26 @@
 
 #' Populate the database with the per-sample summarized results of various tools
 #'
-#' @param sample_table A data frame with sample_id as the first column
 #' @param tool Name of the tool to get the results for
-#' @param base_directory_gambl
-#' @param base_directory_other
 #'
 #' @return Nothing
 #' @export
 #' @import tidyverse DBI
 #'
 #' @examples
-populate_tool_results = function(sample_table){
+populate_tool_results = function(){
+  #IMPORTANT TODO: This function should only ever work with samples that exist in the metadata
+  # Perhaps it should report any excluded outputs in case they need to be deleted from the main output directories
   matched_analyses = unlist(config::get("analyses")$matched)
-
+  print(matched_analyses)
   database_name = config::get("database_name")
 
   genome_builds = unlist(strsplit(config::get("genome_builds"),","))
   groups= unlist(strsplit(config::get("unix_groups"),","))
   for(analysis_type in names(matched_analyses)){
     tool_name = matched_analyses[analysis_type]
-    populate_each_tool_result(tool_name,genome_builds,groups)
+    message(paste("populating results for",tool_name))
+    populate_each_tool_result(tool=tool_name,genome_builds,groups)
   }
 }
 
@@ -81,9 +81,9 @@ populate_each_tool_result = function(tool,genome_builds,unix_groups){
   if(tool == "sequenza"){
     parse_sequenza = function(sequenza_files){
       seq_data=sequenza_files %>%
-        map(read_tsv) %>% #read each file into a list of tibbles
-        map(head,1) %>% #just keep the first line
-        reduce(rbind) %>% #rbind the elements all back into one
+        purrr::map(read_tsv) %>% #read each file into a list of tibbles
+        purrr::map(head,1) %>% #just keep the first line
+        purrr::reduce(rbind) %>% #rbind the elements all back into one
         dplyr::rename(sequenza_cellularity=cellularity,sequenza_ploidy=ploidy) #change the column names
       return(seq_data)
     }
@@ -123,10 +123,13 @@ populate_each_tool_result = function(tool,genome_builds,unix_groups){
   }
   if(tool == "manta"){
     #fetch the output file names per group/build combination
-
+    message("processing results from manta")
+    message(paste(unix_groups,sep=","))
     #separately process by unix group
     for(ug in unix_groups){
-      file_df = find_files_extract_wildcards(tool_name="manta",genome_build=c("hg38","grch37"),search_pattern=".bed",unix_group=ug)
+      files_df = find_files_extract_wildcards(tool_name="manta",genome_build=genome_builds,search_pattern=".bed",unix_group=ug)
+      print(head(files_df))
+      message(paste("processing",ug))
       manta_df = process_all_manta_bedpe(files_df) #need to add this to the database. Not currently automated
     }
   }
@@ -239,13 +242,14 @@ populate_each_tool_result = function(tool,genome_builds,unix_groups){
 #' @examples
 read_merge_manta_with_liftover = function(bedpe_paths=c(),pattern="--matched",out_dir){
   to_merge = list()
+  print(head(bedpe_paths))
   for(thispath in bedpe_paths){
     sample_paths = dir(thispath,pattern=pattern) #DEBUGGING
     print(sample_paths)
     #sample_paths = head(sample_paths,15) #for debugging
     for(sp in sample_paths){
       full_path = paste0(thispath,sp,"/somaticSV.bedpe")
-      print(paste("working on ", full_path))
+      print(paste("working on HERE:", full_path))
       if(grepl("hg38",full_path)){
         print("using liftOver")
         svbed = liftover_bedpe(full_path) #load and convert to grch37 coordinates
@@ -315,20 +319,20 @@ process_all_manta_bedpe = function(file_df,out_dir){
     this_patient = colnames(svbed)[23]
     this_normal = colnames(svbed)[22]
     out_file = paste0(out_dir,"/",this_patient,"--",this_normal,"--hg38Togrch37_sv.tsv")
-
+    message("working on OVER HERE:",bedpe_file)
     if(file.exists(out_file)){
       if(!only_return_missing){
         print(paste("LOADING",out_file))
         svbed = read_tsv(out_file,col_types = "ccccccccccccnnnnccc",col_names = cnames)
         return(svbed)
       }
       else{
-        svbed = dplyr::filter(is.na(tumour_sample_id))
+        svbed = dplyr::filter(svbed,is.na(tumour_sample_id))
         return(svbed)
       }
     }
     if(liftover_to_hg19){
-      svbed = liftover_bedpe(bedpe_file=bedpe_file)
+      svbed = liftover_bedpe(bedpe_df=svbed)
     }
 
     infos = pull(svbed,this_patient)
@@ -343,6 +347,7 @@ process_all_manta_bedpe = function(file_df,out_dir){
 
     svbed$tumour_sample_id = this_patient
     svbed$normal_sample_id = this_normal
+    message(paste("checking status:",bedpe_file))
     if(grepl("--unmatched",bedpe_file)){
       svbed$pair_status = "unmatched"
     }else{
@@ -354,20 +359,25 @@ process_all_manta_bedpe = function(file_df,out_dir){
     #remove chr prefix from both chromosome names
     svbed = svbed %>% mutate(CHROM_A = gsub("chr","",CHROM_A)) %>% mutate(CHROM_B = gsub("chr","",CHROM_B))
     #print(paste("writing output to",out_file))
+
+    #run liftover after formatting?
+
     write_tsv(svbed,out_file,col_names=FALSE)
     return(svbed)
   }
 
   #separately run the hg38 and other builds
-  not_hg38_files = dplyr::filter(file_df,genome_build != "hg38") %>% pull(file_path)
-  bed_data_not_lifted =not_hg38_files %>%
-    map(process_manta) %>%
-    reduce(rbind)
+
 
   hg38_files = dplyr::filter(file_df,genome_build == "hg38") %>% pull(file_path)
   bed_data_lifted = hg38_files %>%
-    map(process_manta,liftover_to_hg19=TRUE) %>%
-    reduce(rbind)
+    purrr::map(process_manta,liftover_to_hg19=TRUE) %>%
+    purrr::reduce(rbind)
+
+  not_hg38_files = dplyr::filter(file_df,genome_build != "hg38") %>% pull(file_path)
+  bed_data_not_lifted =not_hg38_files %>%
+    purrr::map(process_manta) %>%
+    purrr::reduce(rbind)
 
 }
 
@@ -475,9 +485,9 @@ find_files_extract_wildcards = function(tool_results_path,search_pattern,genome_
       grepl("--unmatched",filename) ~ "unmatched",
       TRUE ~"matched"
     )) %>%
-    mutate(sample_id = strsplit(filename,"--")) %>%
+    dplyr::mutate(sample_id = strsplit(filename,"--")) %>%
     unnest_wider(sample_id,names_sep = "-") %>% dplyr::rename(tumour_sample_id=`sample_id-1`,normal_sample_id=`sample_id-2`) %>%
-    mutate(tool_name=tool,seq_type=seq_type,unix_group=unix_group) %>%
+    dplyr::mutate(tool_name=tool_name,seq_type=seq_type,unix_group=unix_group) %>%
     dplyr::select(tumour_sample_id,unix_group,tool_name,seq_type,genome_build,file_path,pairing_status,normal_sample_id)
   return(found_files)
 }
@@ -617,30 +627,35 @@ assemble_file_details = function(file_details_df,file_paths,tool_name,unix_group
 #'
 #' @return Data frame containing original bedpe data with new coordinates
 #' @export
-#' @import tidyverse
+#' @import tidyverse rtracklayer S4Vectors
 #'
 #' @examples
 #' hg19_sv = get_manta_sv() %>% head(100)
 #' hg38_sv = liftover_bedpe(bedpe_df=hg19_sv,target_build="hg38")
 liftover_bedpe = function(bedpe_file,bedpe_df,target_build="grch37",col_names,col_types){
+
   if(!missing(bedpe_file)){
     if(missing(col_names)){
+      message("imposing column names")
       original_bedpe = read_tsv(bedpe_file,comment = "##",col_types="cddcddccccccccccccccccc")
     }else{
+      message(paste("using column names",col_names,sep=": "))
       original_bedpe = read_tsv(bedpe_file,col_names=col_names,col_types=col_types)
     }
   }else{
     original_bedpe = bedpe_df
+
   }
   colnames(original_bedpe)[1]="CHROM_A"
-  if(!missing(bedpe_df)){
-    original_bedpe = bedpe_df
-  }
+  original_bedpe = as.data.frame(original_bedpe)
+  original_bedpe=original_bedpe %>% mutate_if(is.numeric, as.integer)
+  #print(head(original_bedpe))
   if(!grepl("chr",original_bedpe$CHROM_A)){
     #add chr prefix
     original_bedpe = original_bedpe %>% mutate(CHROM_A = paste0("chr",CHROM_A)) %>%
       mutate(CHROM_B = paste0("chr",CHROM_B))
   }
+  print(head(original_bedpe))
   char_vec = original_bedpe %>% tidyr::unite(united,sep="\t") %>% dplyr::pull(united)
   bedpe_obj <- rtracklayer::import(text=char_vec,format="bedpe")
   if(length(colnames(original_bedpe))>22){