Merge pull request #30 from taylor-lab/master

Merge taylor-lab/facets-suite:master into mskcc/facets-suite:Rpackagev2
mskcc · Feb 11, 2020 · 1c1fdf6 · 1c1fdf6
2 parents 95333a7 + 4cbde6d
commit 1c1fdf6
Show file tree

Hide file tree

Showing 13 changed files with 301 additions and 107 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,12 +1,12 @@
 Package: facetsSuite
 Type: Package
 Title: Functions to run and parse output from FACETS
-Version: 1.0
+Version: 2.0.2
 Authors@R: person(given = "Philip",
              family = "Jonsson",
              role = c("aut", "cre"),
              email = "philip.jonsson@gmail.com")
-URL: https://github.com/mskcc/facetsSuite
+URL: https://github.com/mskcc/facets-suite
 Description: This package provides functions that wrap the FACETS package for allele-specific copy-number analysis as well as functions to perform post-hoc analyses thereof.
 License: MIT + file LICENSE
 RoxygenNote: 6.1.1
@@ -15,8 +15,6 @@ Encoding: UTF-8
 Depends:
     R (>= 2.10)
 Imports:
-    facets (>= 0.5.2),
-    pctGCdata (>= 0.3.0),
     data.table (>= 1.11.8),
     dplyr (>= 0.8.3),
     plyr (>= 1.8.4),
@@ -28,9 +26,6 @@ Imports:
     scales (>= 1.0.0),
     argparse (>= 1.1.1),
     tibble (>= 2.1.3)
-Remotes:
-    mskcc/facets,
-    veseshan/pctGCdata
 NeedsCompilation: no
 Packaged: 2019-02-07 16:38:38 UTC; jonssonp
 Suggests: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,7 @@ export(calculate_loh)
 export(calculate_lst)
 export(calculate_ntai)
 export(ccf_annotate_maf)
+export(ccf_annotate_maf_legacy)
 export(cf_plot)
 export(check_fit)
 export(closeup_plot)
@@ -17,11 +18,11 @@ export(icn_plot)
 export(plot_segmentation)
 export(read_snp_matrix)
 export(run_facets)
+export(create_legacy_output)
+export(load_facets_output)
 export(valor_plot)
 import(data.table)
-import(facets)
 import(ggplot2)
-import(pctGCdata)
 importFrom(data.table,":=")
 importFrom(data.table,foverlaps)
 importFrom(data.table,setDT)

diff --git a/R/ccf-annotate-maf.R b/R/ccf-annotate-maf.R
@@ -32,20 +32,22 @@ ccf_annotate_maf = function(maf,
     # Find segments overlapping mutation loci
     data.table::setDT(maf, key = c('Chromosome', 'Start_Position', 'End_Position'))
     original_cols = names(maf)
+    maf <- maf[, Chromosome:=as.character(Chromosome)]
 
     segs$chrom[segs$chrom == 23] = 'X'
     if (algorithm == 'em') {
         segs$tcn = segs$tcn.em
         segs$lcn = segs$lcn.em
+        segs$cf = segs$cf.em
     }
-    segs = segs[, c('chrom', 'start', 'end', 'tcn', 'lcn')]
+    segs = segs[, c('chrom', 'start', 'end', 'tcn', 'lcn', 'cf')]
     data.table::setDT(segs, key = c('chrom', 'start', 'end'))
 
     maf = data.table::foverlaps(maf, segs,
                                 by.x = c('Chromosome', 'Start_Position', 'End_Position'),
                                 by.y = c('chrom', 'start', 'end'),
                                 type = 'within', mult = 'first', nomatch = NA)
-    maf = maf[, c(original_cols, 'tcn', 'lcn'), with = FALSE]
+    maf = maf[, c(original_cols, 'tcn', 'lcn', 'cf'), with = FALSE]
 
     # Calculate CCFs
     maf[, `:=` (
@@ -59,13 +61,46 @@ ccf_annotate_maf = function(maf,
     maf[, c('ccf_Mcopies', 'ccf_Mcopies_lower', 'ccf_Mcopies_upper', 'ccf_Mcopies_prob95', 'ccf_Mcopies_prob90') :=
             estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))]
     maf[, c('ccf_1copy', 'ccf_1copy_lower', 'ccf_1copy_upper', 'ccf_1copy_prob95', 'ccf_1copy_prob90') :=
-            estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))]
+            estimate_ccf(purity, tcn, 1, t_alt_count, t_depth), by = seq_len(nrow(maf))]
     maf[, c('ccf_expected_copies', 'ccf_expected_copies_lower', 'ccf_expected_copies_upper',
             'ccf_expected_copies_prob95', 'ccf_expected_copies_prob90') :=
-            estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))]
+            estimate_ccf(purity, tcn, expected_alt_copies, t_alt_count, t_depth), by = seq_len(nrow(maf))]
     as.data.frame(maf)
 }
 
+#' Estimate CCFs of somatic mutations from cncf.txt file. 
+#'
+#' Based on FACETS data, infer cancer-cell fraction (CCF) for somatic mutations in a sample.
+#'
+#' @param maf Input MAF file.
+#' @param cncf_txt_file .cncf.txt file created with legacy output of facets-suite.
+#' @param purity Sample purity estimate.
+#' @param algorithm Choice between FACETS \code{em} and \code{cncf} algorithm.
+#' 
+#' @importFrom data.table setDT foverlaps :=
+#' @importFrom stats dbinom
+#'
+#' @return MAF file annotated with clonality estimates for each mutation, where the following column prefixes are used:
+#' \itemize{
+#'   \item{\code{ccf_Mcopies*}:} {Inferred CCF if mutation is on the major allele.}
+#'   \item{\code{ccf_1copy*}:} {Inferred CCF if mutation exists in one copy.}
+#'   \item{\code{ccf_expected_copies*}:} {Inferred CCF if mutation exists in number of copies expected from observed VAF and local ploidy.}
+#' }
+#' 
+#' @export
+ccf_annotate_maf_legacy <- function(maf, 
+                                    cncf_txt_file,
+                                    purity,
+                                    algorithm = c('em', 'cncf')) {
+    segs <-
+        fread(cncf_txt_file) %>%
+        select(chrom, seg, num.mark, nhet, cnlr.median,
+               mafR, segclust, cnlr.median.clust, mafR.clust, 
+               start = loc.start, end = loc.end, cf.em, tcn.em, lcn.em, cf, tcn, lcn)
+
+    ccf_annotate_maf(maf, segs, purity, algorithm)    
+}
+
 # Estimate most likely CCF given observed VAF, purity and local ploidy
 # Based on PMID 25877892
 estimate_ccf = function(purity,

diff --git a/R/check-fit.R b/R/check-fit.R
@@ -9,8 +9,8 @@
 #'
 #' @return A list object with the following items:
 #' \itemize{
-#'       \item{\code{diplogr_flag}:} {Boolean indicating extreme dipLogR value.}
-#'       \item{\code{n_alternative_diplogr}:} {Number of alternative dipLogR values.}
+#'       \item{\code{dipLogR_flag}:} {Boolean indicating extreme dipLogR value.}
+#'       \item{\code{n_alternative_dipLogR}:} {Number of alternative dipLogR values.}
 #'       \item{\code{n_dip_bal_segs}, \code{frac_dip_bal_segs}:} {Number of balanced segments at dipLogR and the fraction of genome they represent.}
 #'       \item{\code{n_dip_imbal_segs}, \code{frac_dip_imbal_segs}:} {Number of imbalanced segments at dipLogR and the fraction of genome they represent.}
 #'       \item{\code{n_amp}:} {Number of segments at total copy number >= 10.}
@@ -51,7 +51,7 @@ check_fit = function(facets_output,
     # Set variables
     segs = as.data.table(facets_output$segs)
     snps = facets_output$snps
-    diplogr = facets_output$diplogr
+    dipLogR = facets_output$dipLogR
     alballogr = as.numeric(facets_output$alballogr[, 1])
     purity = facets_output$purity
     mafr_thresh = facets_output$mafr_thresh
@@ -74,15 +74,15 @@ check_fit = function(facets_output,
     auto_segs = segs[chrom < 23]
 
     # Check for extrema dipLogR values
-    diplogr_flag = abs(facets_output$diplogr) > 1
+    dipLogR_flag = abs(facets_output$dipLogR) > 1
 
     # Check for alternative dipLogR values
-    n_alt_diplogr = length(setdiff(alballogr, diplogr))
+    n_alt_dipLogR = length(setdiff(alballogr, dipLogR))
 
     # Check for balance/imbalance of copy-number at segments at dipLogR
     # cnlr.median.clust == dipLogR for purity runs, for others, find the closest one
     cnlr_clusts = unique(segs$cnlr.median.clust)
-    cnlr_clust_value = cnlr_clusts[which.min(abs(cnlr_clusts - diplogr))]
+    cnlr_clust_value = cnlr_clusts[which.min(abs(cnlr_clusts - dipLogR))]
 
     dip_bal_segs = auto_segs[cnlr.median.clust == cnlr_clust_value & mcn == lcn, ]
     n_dip_bal_segs = nrow(dip_bal_segs)
@@ -95,9 +95,13 @@ check_fit = function(facets_output,
     # Number of high-level amplifications and homozygous deletions
     # Clonal homdels, how much of the genome do they represent
     n_amps = nrow(segs[tcn >= 10])
-    n_homdels = nrow(segs[tcn == 0])
+    homdels = auto_segs[tcn == 0]
+    n_homdels = nrow(homdels)
+
     clonal_homdels = auto_segs[tcn == 0 & clonal == TRUE]
     n_homdels_clonal = nrow(clonal_homdels)
+
+    frac_homdels = sum(homdels$length)/sum(auto_segs$length)
     frac_homdels_clonal = sum(clonal_homdels$length)/sum(auto_segs$length)
 
     # Count number of unique copy-number states and total number of segments
@@ -167,14 +171,16 @@ check_fit = function(facets_output,
     # n: denotes number
     # frac: denotes fraction of assesses genome
     output = list(
-        diplogr_flag = diplogr_flag,
-        n_alternative_diplogr = n_alt_diplogr,
+        dipLogR_flag = dipLogR_flag,
+        n_alternative_dipLogR = n_alt_dipLogR,
+        wgd = wgd,
         n_dip_bal_segs = n_dip_bal_segs,
         frac_dip_bal_segs = frac_dip_bal_segs,
         n_dip_imbal_segs = n_dip_imbal_segs,
         frac_dip_imbal_segs = frac_dip_imbal_segs,
         n_amps = n_amps,
         n_homdels = n_homdels,
+        frac_homdels = frac_homdels,
         n_homdels_clonal = n_homdels_clonal,
         frac_homdels_clonal = frac_homdels_clonal,
         n_cn_states = n_cn_states,

diff --git a/R/copy-number-scores.R b/R/copy-number-scores.R
@@ -152,6 +152,23 @@ calculate_ntai = function(segs,
         chrom_seg
     })
 
+    if (nrow(segs) == 0) {
+        return(
+            list(
+                ntelomeric_ai = NA, # telomeric AI
+                telomeric_ai_mean_size = NA,
+                ninterstitial_ai = NA, # interstitial AI
+                interstitial_ai_mean_size = NA,
+                ncentromeric_ai = NA, # chromosomal AI
+                ntelomeric_loh = NA, # telomeric LOH
+                telomeric_loh_mean_size = NA,
+                ninterstitial_loh = NA, # interstitial LOH
+                interstitial_loh_mean_size = NA,
+                ncentromeric_loh = NA # chromosomal LOH
+            ) 
+        )
+    }
+
     # Add a column to call AI
     # Codes for telomeric/interstitial/whole chromosome:
     # 0 = no AI

diff --git a/R/format-igv-seg.R b/R/format-igv-seg.R
@@ -13,8 +13,8 @@
 #' @export
 format_igv_seg = function(facets_output, sample_id, normalize = TRUE) {
 
-    if (!all(c('snps', 'segs', 'diplogr') %in% names(facets_output))) {
-        stop(paste0('Input is missing segs, snps or diplogr ojbect.'), call. = FALSE)
+    if (!all(c('snps', 'segs', 'dipLogR') %in% names(facets_output))) {
+        stop(paste0('Input is missing segs, snps or dipLogR ojbect.'), call. = FALSE)
     }
 
     seg = group_by(facets_output$snps, chrom, seg) %>% 
@@ -26,6 +26,6 @@ format_igv_seg = function(facets_output, sample_id, normalize = TRUE) {
         mutate(ID = sample_id) %>% 
         select(ID, chrom, loc.start, loc.end, num.mark, seg.mean)
 
-    if (normalize) { seg = mutate(seg, seg.mean = seg.mean - facets_output$diplogr) }
+    if (normalize) { seg = mutate(seg, seg.mean = seg.mean - facets_output$dipLogR) }
     data.frame(seg)
 }
diff --git a/R/gene-level-changes.R b/R/gene-level-changes.R
@@ -37,8 +37,9 @@ gene_level_changes = function(facets_output,
     # Set variables
     segs = facets_output$segs
     snps = as.data.table(facets_output$snps)
-    diplogr = facets_output$diplogr
+    dipLogR = facets_output$dipLogR
     purity = facets_output$purity
+    ploidy = facets_output$ploidy
 
     # Get WGD status
     fcna_output = calculate_fraction_cna(segs, ploidy, genome, algorithm)
@@ -87,8 +88,8 @@ gene_level_changes = function(facets_output,
     genes_all[, cn_state := ifelse(!cn_state %in% copy_number_states$call, 'INDETERMINATE', cn_state)]
 
     # Test on cnlr against baseline
-    # cn0_diplogr = unique(segs$cnlr.median.clust)[which.min(abs(unique(segs$cnlr.median.clust)-diplogr))]
-    # cn0_segs = segs[cnlr.median.clust == cn0_diplogr]
+    # cn0_dipLogR = unique(segs$cnlr.median.clust)[which.min(abs(unique(segs$cnlr.median.clust)-dipLogR))]
+    # cn0_segs = segs[cnlr.median.clust == cn0_dipLogR]
     # cn0_snps = snps[segclust %in% cn0_segs$segclust]
     # cn0_snps = cn0_snps[between(cnlr, quantile(cn0_snps$cnlr, .25), quantile(cn0_snps$cnlr, .75))] # remove noise
 

diff --git a/R/plot-facets.R b/R/plot-facets.R
@@ -13,7 +13,7 @@
 #' @param plotX If \code{TRUE}, includes chromosome X.
 #' @param genome Genome build.
 #' @param highlight_gene Highlight gene(s), provide gene symbol or mapped positions (internally).
-#' @param adjust_diplogr Normalize by sample dipLogR.
+#' @param adjust_dipLogR Normalize by sample dipLogR.
 #' @param method When available, choose between plotting solution from \code{em} or \code{cncf} algorithm.
 #' @param subset_snps Subset the SNP profile to reduce weight of plotting, supply a factor by which to reduce or \code{TRUE} for default.
 #' @param plot_chroms Chromosomes to plot when using \code{closeup_plot}.
@@ -37,15 +37,15 @@ cnlr_plot = function(facets_data,
                      plotX = FALSE,
                      genome = c('hg19', 'hg18', 'hg38'),
                      highlight_gene = NULL,
-                     adjust_diplogr = TRUE,
+                     adjust_dipLogR = TRUE,
                      subset_snps = NULL,
                      return_object = FALSE) {
 
     genome = match.arg(genome, c('hg19', 'hg18', 'hg38'), several.ok = FALSE)
 
     snps = facets_data$snps
     segs = facets_data$segs
-    diplogr = facets_data$diplogr
+    dipLogR = facets_data$dipLogR
     if (!plotX) {
         snps = subset(snps, chrom < 23)
         segs = subset(segs, chrom < 23)
@@ -68,11 +68,11 @@ cnlr_plot = function(facets_data,
     if (ymin > -3) ymin = -3
     if (ymax < 3) ymax = 3
 
-    if (adjust_diplogr) {
-        snps$cnlr = snps$cnlr - diplogr
-        my_starts$cnlr_median = my_starts$cnlr_median - diplogr
-        my_ends$cnlr_median = my_ends$cnlr_median - diplogr
-        diplogr = Inf
+    if (adjust_dipLogR) {
+        snps$cnlr = snps$cnlr - dipLogR
+        my_starts$cnlr_median = my_starts$cnlr_median - dipLogR
+        my_ends$cnlr_median = my_ends$cnlr_median - dipLogR
+        dipLogR = Inf
     }
 
     if (!is.null(subset_snps)) {
@@ -138,7 +138,7 @@ valor_plot = function(facets_data,
 
     snps = facets_data$snps
     segs = facets_data$segs
-    diplogr = facets_data$diplogr
+    dipLogR = facets_data$dipLogR
     if (!plotX) {
         snps = subset(snps, chrom < 23)
         segs = subset(segs, chrom < 23)