forked from GreenleafLab/ArchR
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBulkProjection.R
220 lines (190 loc) · 8.56 KB
/
BulkProjection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#' Project Bulk ATAC-seq data into single cell subspace
#'
#' This function will Project Bulk ATAC-seq data into single cell subspace.
#'
#' @param ArchRProj An `ArchRProject` object containing the dimensionality reduction matrix passed by `reducedDims`.
#' @param seATAC Bulk ATAC Summarized Experiment.
#' @param reducedDims A string specifying the reducedDims.
#' @param embedding A string specifying embedding.
#' @param n An integer specifying the number of subsampled "pseudo single cells" per bulk sample.
#' @param verbose A boolean value indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output.
#' @param threads The number of threads used for parallel execution
#' @param logFile The path to a file to be used for logging ArchR output.
#' @export
#'
projectBulkATAC <- function(
ArchRProj = NULL,
seATAC = NULL,
reducedDims = "IterativeLSI",
embedding = "UMAP",
n = 250,
verbose = TRUE,
threads = getArchRThreads(),
logFile = createLogFile("projectBulkATAC")
){
.validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj"))
.validInput(input = seATAC, name = "seATAC", valid = c("SummarizedExperiment"))
.validInput(input = reducedDims, name = "reducedDims", valid = c("character"))
.validInput(input = embedding, name = "embedding", valid = c("character", "null"))
.validInput(input = n, name = "n", valid = c("integer"))
.validInput(input = verbose, name = "verbose", valid = c("boolean"))
.validInput(input = threads, name = "threads", valid = c("integer"))
.validInput(input = logFile, name = "logFile", valid = c("character"))
tstart <- Sys.time()
.startLogging(logFile = logFile)
.logThis(mget(names(formals()),sys.frame(sys.nframe())), "projectBulkATAC Input-Parameters", logFile = logFile)
##################################################
# Reduced Dimensions
##################################################
rD <- getReducedDims(ArchRProj, reducedDims = reducedDims, returnMatrix = FALSE)
.logThis(names(rD), "reducedDimsNames", logFile = logFile)
.logThis(rD[[1]], "reducedDimsMat", logFile = logFile)
rDFeatures <- rD[[grep("Features", names(rD))]]
if("end" %in% colnames(rDFeatures)){
rDGR <- GRanges(seqnames=rDFeatures$seqnames,IRanges(start=rDFeatures$start, end=rDFeatures$end))
}else{
rDGR <- GRanges(seqnames=rDFeatures$seqnames,IRanges(start=rDFeatures$start, width = (rDFeatures$start) / (rDFeatures$idx - 1)))
}
.logThis(rDGR, "reducedDimsGRanges", logFile = logFile)
subATAC <- subsetByOverlaps(seATAC, rDGR, ignore.strand = TRUE)
subATAC <- subATAC[order(rowSums(as.matrix(.getAssay(subATAC, "counts"))), decreasing = TRUE), ]
o <- DataFrame(findOverlaps(subATAC, rDGR, ignore.strand = TRUE))
sumOverlap <- length(unique(o[,2]))
.logThis(o, "overlapATAC", logFile = logFile)
if(sumOverlap == 0){
.logStop(paste0("No overlaps between bulk ATAC data and reduce dimensions feature found.",
"\nEither recreate counts matrix or most likely these data sets are incompatible!"), logFile = logFile)
}
if( (sumOverlap / length(rDGR)) < 0.25 ){
if(force){
.logMessage("Less than 25% of the features are present in this bulk ATAC data set! Continuing since force = TRUE!", verbose = TRUE, logFile = logFile)
}else{
.logStop("Less than 25% of the features are present in this bulk ATAC data set! Set force = TRUE to continue!", logFile = logFile)
}
}
.logMessage("Overlap Ratio of Reduced Dims Features = ", (sumOverlap / length(rDGR)), verbose = TRUE, logFile = logFile)
o <- o[!duplicated(o$subjectHits),]
subATAC <- subATAC[o$queryHits, ]
rownames(subATAC) <- paste0("f", o$subjectHits)
.logThis(subATAC, "subsettedATAC", logFile = logFile)
##################################################
# Create Bulk Matrix
##################################################
bulkMat <- .safeSubset(
mat = .getAssay(subATAC, "counts"),
subsetRows = paste0("f", seq_along(rDGR))
)
.logThis(bulkMat, "bulkATACMat", logFile = logFile)
##################################################
# Simulate and Project
##################################################
depthN <- round(sum(rD$rowSm / rD$nCol))
nRep <- 5
n2 <- ceiling(n / nRep)
ratios <- c(2, 1.5, 1, 0.5, 0.25) #range of ratios of number of fragments
simRD <- .safelapply(seq_len(ncol(bulkMat)), function(x){
.logDiffTime(sprintf("Projecting Sample (%s of %s)", x, ncol(bulkMat)), t1 = tstart, verbose = verbose, logFile = logFile)
counts <- bulkMat[, x]
counts <- rep(seq_along(counts), counts)
simMat <- lapply(seq_len(nRep), function(y){
ratio <- ratios[y]
simMat <- matrix(sample(x = counts, size = ceiling(ratio * depthN) * n2, replace = TRUE), ncol = n2)
simMat <- Matrix::summary(as(simMat, "dgCMatrix"))[,-1,drop=FALSE]
simMat[,1] <- simMat[,1] + (y - 1) * n2
simMat
}) %>% Reduce("rbind", .)
simMat <- Matrix::sparseMatrix(i = simMat[,2], j = simMat[,1], x = rep(1, nrow(simMat)), dims = c(length(rDGR), n2 * nRep))
simRD <- as.matrix(.projectLSI(simMat, LSI = rD, verbose = FALSE))
rownames(simRD) <- paste0(colnames(bulkMat)[x], "#", seq_len(nrow(simRD)))
simRD
}, threads = threads) %>% Reduce("rbind", .)
if(is.null(embedding)){
if(rD$scaleDims){
simRD <- .scaleDims(simRD)
}
out <- SimpleList(
simulatedReducedDims = simRD
)
return(out)
}
.logThis(simRD, "simulatedReducedDims", logFile = logFile)
##################################################
# Prep Reduced Dims
##################################################
embedding <- getEmbedding(ArchRProj = ArchRProj, embedding = embedding, returnDF = FALSE)
corCutOff <- embedding$params$corCutOff
dimsToUse <- embedding$params$dimsToUse
scaleDims <- embedding$params$scaleDims
if(is.null(scaleDims)){
scaleDims <- rD$scaleDims
}
simRD <- .scaleDims(simRD)
if(embedding$params$nc != ncol(simRD)){
if(is.null(dimsToUse)){
dimsToUse <- seq_len(ncol(rD[[1]]))
}
if(!is.null(corCutOff)){
if(scaleDims){
corToDepth <- rD$corToDepth$scaled
dimsToUse <- dimsToUse[corToDepth < corCutOff]
}else{
corToDepth <- rD$corToDepth$none
dimsToUse <- dimsToUse[corToDepth < corCutOff]
}
}
if(embedding$params$nc != ncol(simRD)){
.logMessage("Error incosistency found with matching LSI dimensions to those used in addEmbedding",
"\nReturning with simulated reduced dimension coordinates...", verbose = TRUE, logFile = logFile)
out <- SimpleList(
simulatedReducedDims = simRD
)
return(out)
}
simRD <- simRD[, dimsToUse, drop = FALSE]
}
##################################################
# Get Previous UMAP Model
##################################################
umapModel <- .loadUWOT(embedding$params$uwotModel, embedding$params$nc)
idx <- sort(sample(seq_len(nrow(rD[[1]])), min(nrow(rD[[1]]), 5000))) #Try to use 5000 or total cells to check validity
rD2 <- getReducedDims(
ArchRProj = ArchRProj,
reducedDims = reducedDims,
dimsToUse = embedding$params$dimsToUse,
scaleDims = embedding$params$scaleDims,
corCutOff = embedding$params$corCutOff
)[idx,,drop=FALSE]
##################################################
# Project UMAP
##################################################
set.seed(1)
threads2 <- max(floor(threads/2), 1)
simUMAP <- uwot::umap_transform(
X = rbind(rD2, simRD),
model = umapModel,
verbose = TRUE,
n_threads = threads2
)
rownames(simUMAP) <- c(rownames(rD2), rownames(simRD))
.logThis(simUMAP, "simulatedUMAP", logFile = logFile)
#Check if the projection matches using previous data
c1 <- cor(simUMAP[rownames(rD2), 1], embedding[[1]][rownames(rD2),1])
c2 <- cor(simUMAP[rownames(rD2), 2], embedding[[1]][rownames(rD2),2])
if(min(c1, c2) < 0.8){
.logMessage("Warning projection correlation is less than 0.8 (R = ", round(min(c1,c2), 4),").\nThese results may not be accurate because of the lack of heterogeneity in the single cell data.", verbose = TRUE, logFile = logFile)
}
dfUMAP <- embedding[[1]]
colnames(dfUMAP) <- c("UMAP1", "UMAP2")
colnames(simUMAP) <- c("UMAP1", "UMAP2")
dfUMAP <- DataFrame(dfUMAP)
dfUMAP$Type <- Rle("scATAC", lengths = nrow(dfUMAP))
simUMAP <- DataFrame(simUMAP[rownames(simRD),,drop=FALSE])
simUMAP$Type <- Rle(stringr::str_split(rownames(simUMAP), pattern = "#", simplify = TRUE)[,1])
out <- SimpleList(
simulatedBulkUMAP = simUMAP,
singleCellUMAP = dfUMAP,
simulatedReducedDims = simRD
)
.endLogging(logFile = logFile)
return(out)
}