-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathcalculations--differential_abundance.R
161 lines (143 loc) · 7.12 KB
/
calculations--differential_abundance.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#' Differential abundance with DESeq2
#'
#' EXPERIMENTAL: This function is still being tested and developed; use with caution. Uses the
#' \code{\link[DESeq2]{DESeq2-package}} package to conduct differential abundance analysis of count data. Counts can
#' be of OTUs/ASVs or taxa. The plotting function \code{\link{heat_tree_matrix}} is useful for
#' visualizing these results. See details section below for considerations on preparing data for
#' this analysis.
#'
#' Data should be raw read counts, not rarefied, converted to proportions, or modified with any
#' other technique designed to correct for sample size since \code{\link[DESeq2]{DESeq2-package}} is designed to be
#' used with count data and takes into account unequal sample size when determining differential
#' abundance. Warnings will be given if the data is not integers or all sample sizes are equal.
#'
#' @param obj A \code{\link[metacoder]{taxmap}} object
#' @param data The name of a table in \code{obj} that contains data for each sample in columns.
#' @param cols The names/indexes of columns in \code{data} to use. By default, all numeric columns
#' are used. Takes one of the following inputs: \describe{ \item{TRUE/FALSE:}{All/No columns will
#' used.} \item{Character vector:}{The names of columns to use} \item{Numeric vector:}{The indexes
#' of columns to use} \item{Vector of TRUE/FALSE of length equal to the number of columns:}{Use
#' the columns corresponding to \code{TRUE} values.} }
#' @param groups A vector defining how samples are grouped into "treatments". Must be the same order
#' and length as \code{cols}.
#' @param other_cols If \code{TRUE}, preserve all columns not in \code{cols} in the output. If
#' \code{FALSE}, dont keep other columns. If a column names or indexes are supplied, only preserve
#' those columns.
#' @param lfc_shrinkage What technique to use to adjust the log fold change results for low counts.
#' Useful for ranking and visualizing log fold changes. Must be one of the following:
#' \describe{
#' \item{'none'}{No log fold change adjustments.}
#' \item{'normal'}{The original DESeq2 shrinkage estimator}
#' \item{'ashr'}{Adaptive shrinkage estimator from the \code{ashr} package, using a fitted mixture of normals prior.}
#' }
#' @param ... Passed to \code{\link[DESeq2]{results}} if the \code{lfc_shrinkage} option is "none"
#' and to \code{\link[DESeq2]{lfcShrink}} otherwise.
#'
#' @return A tibble with at least the taxon ID of the thing tested, the groups compared, and the
#' DESeq2 results. The \code{log2FoldChange} values will be positive if \code{treatment_1} is more
#' abundant and \code{treatment_2}.
#'
#' @family calculations
#'
#' @examples
#' \dontrun{
#' # Parse data for plotting
#' x = parse_tax_data(hmp_otus, class_cols = "lineage", class_sep = ";",
#' class_key = c(tax_rank = "taxon_rank", tax_name = "taxon_name"),
#' class_regex = "^(.+)__(.+)$")
#'
#' # Get per-taxon counts
#' x$data$tax_table <- calc_taxon_abund(x, data = "tax_data", cols = hmp_samples$sample_id)
#'
#' # Calculate difference between groups
#' x$data$diff_table <- calc_diff_abund_deseq2(x, data = "tax_table",
#' cols = hmp_samples$sample_id,
#' groups = hmp_samples$body_site)
#'
#' # Plot results (might take a few minutes)
#' heat_tree_matrix(x,
#' data = "diff_table",
#' node_size = n_obs,
#' node_label = taxon_names,
#' node_color = ifelse(is.na(padj) | padj > 0.05, 0, log2FoldChange),
#' node_color_range = diverging_palette(),
#' node_color_trans = "linear",
#' node_color_interval = c(-3, 3),
#' edge_color_interval = c(-3, 3),
#' node_size_axis_label = "Number of OTUs",
#' node_color_axis_label = "Log2 fold change")
#'
#' }
#'
#' @export
calc_diff_abund_deseq2 <- function(obj, data, cols, groups, other_cols = FALSE,
lfc_shrinkage = c('none', 'normal', 'ashr'), ...) {
# Check that DESeq2 is intalled
if (! requireNamespace("DESeq2", quietly = TRUE)) {
stop('The "DESeq2" package needs to be installed for this function to work.',
call. = FALSE)
}
# Parse options
lfc_shrinkage <- match.arg(lfc_shrinkage)
# Get abundance by sample data
abund_data <- get_taxmap_table(obj, data)
# Parse columns to use
cols <- get_numeric_cols(obj, data, cols)
# Check that columns contain integers only
col_is_int <- vapply(cols, FUN.VALUE = logical(1), function(c) {
all(abund_data[[c]] %% 1 == 0)
})
non_int_cols <- cols[! col_is_int]
if (length(non_int_cols) > 0) {
stop(call. = FALSE,
'All data given to DESeq2 should be untransformed counts. The following columns contain non-integers:\n',
limited_print(type = 'silent', non_int_cols, prefix = ' '))
}
# Check for equal sample sizes
if (length(unique(colSums(abund_data[cols]))) == 1) {
warning(call. = FALSE,
'All columns have equal counts, suggesting counts were normalized (e.g. rarefied) to correct for sample size variation.',
' DESeq2 is designed to be used with unnormalized data.',
' Use untransformed counts if available.')
}
# Check groups option
groups <- check_option_groups(groups, cols)
# Find other columns
# These might be added back to the output later
other_cols <- get_taxmap_other_cols(obj, data, cols, other_cols)
# Get every combination of groups to compare
combinations <- t(utils::combn(unique(as.character(groups)), 2))
combinations <- lapply(seq_len(nrow(combinations)), function(i) combinations[i, ])
# Format data for DESeq2
metadata <- data.frame(group = groups)
deseq_data <- DESeq2::DESeqDataSetFromMatrix(countData = abund_data[cols],
colData = metadata,
design = ~ group)
# Run DESeq2
deseq_result <- DESeq2::DESeq(deseq_data)
# Make function to compare one pair of groups
one_comparison <- function(treat_1, treat_2) {
# Extract results for a single pair
if (lfc_shrinkage == 'none') {
pair_result <- DESeq2::results(deseq_result, contrast = c('group', treat_1, treat_2), ...)
} else {
pair_result <- DESeq2::lfcShrink(deseq_result, contrast = c('group', treat_1, treat_2),
type = lfc_shrinkage, ...)
}
# Add treatments compared
output <- cbind(data.frame(stringsAsFactors = FALSE,
treatment_1 = treat_1,
treatment_2 = treat_2),
as.data.frame(pair_result))
# Add in other columns if specified
output <- cbind(abund_data[, other_cols], output)
return(output)
}
# Compare all pairs of treatments and combine
output <- lapply(seq_len(length(combinations)), function(i) {
one_comparison(combinations[[i]][1], combinations[[i]][2])
})
output <- do.call(rbind, output)
# Convert to tibble and return
dplyr::as_tibble(output)
}