-
Notifications
You must be signed in to change notification settings - Fork 3
/
summarize.R
219 lines (152 loc) · 8.17 KB
/
summarize.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/env Rscript
# R commands to summarize and plot the data
####################
# load relevant libraries
library(ggplot2, quietly = TRUE)
library(cowplot, quietly = TRUE)
library(GGally, quietly = TRUE)
library(chromPlot, quietly = TRUE)
####################
message(" ===== importing files ===== ")
# find all summary files
summary_files = list.files(path = ".", pattern = "summary.*.5000.txt")
summary_files
# import and merge all summary files (619,150 rows for 5kb bins)
imported_files = lapply(summary_files, read.delim, check.names = FALSE, stringsAsFactors = FALSE)
combined_summary = Reduce(function(x, y) merge(x, y, all = FALSE, by = "#BIN"), imported_files, accumulate = FALSE)
dim(combined_summary)
head(combined_summary)
####################
# plot distributions of values
message(" ===== plot distributions ===== ")
y_limits = c(0, 600000)
y_breaks = seq(0, 600000, 100000)
plot_50mer = ggplot(combined_summary, aes(Mapability50mer)) +
geom_histogram(binwidth = 0.01, fill = "dodgerblue4", na.rm = TRUE) +
scale_x_continuous() +
scale_y_sqrt(labels = scales::comma, limits = y_limits, breaks = y_breaks) +
labs(x = "50 bp Mappability", y = "Frequency")
plot_100mer = ggplot(combined_summary, aes(Mapability100mer)) +
geom_histogram(binwidth = 0.01, fill = "dodgerblue4", na.rm = TRUE) +
scale_x_continuous() +
scale_y_sqrt(labels = scales::comma, limits = y_limits, breaks = y_breaks) +
labs(x = "100 bp Mappability", y = "Frequency")
events_x_limits = c(-1, 80)
plot_giab = ggplot(combined_summary, aes(events_GIAB)) +
geom_histogram(binwidth = 1, fill = "dodgerblue4", na.rm = TRUE) +
scale_x_continuous(limits = events_x_limits) +
scale_y_sqrt(labels = scales::comma, limits = y_limits, breaks = y_breaks) +
labs(x = "Genome in a Bottle Breakpoints", y = "Frequency")
plot_1kg = ggplot(combined_summary, aes(events_1KG)) +
geom_histogram(binwidth = 1, fill = "dodgerblue4", na.rm = TRUE) +
scale_x_continuous(limits = events_x_limits) +
scale_y_sqrt(labels = scales::comma, limits = y_limits, breaks = y_breaks) +
labs(x = "1000 Genomes Project Breakpoints", y = "Frequency")
# plot the four distribution plots in one image
plot_dist = plot_grid(plot_50mer, plot_100mer, plot_giab, plot_1kg)
ggsave(filename = "distributions.all.png", plot = plot_dist, width = 12, height = 8, units = "in")
####################
# a matrix of correlation plots with regression lines
message(" ===== plot correlations ===== ")
# calculate correlation for a random subset of entries to save time (skip bin name column)
correlation_columns = c("events_1KG", "events_GIAB", "Mapability100mer", "Mapability50mer")
correlation_subset = combined_summary[sample(nrow(combined_summary), 50000), ]
correlation_subset = correlation_subset[, correlation_columns]
dim(correlation_subset)
head(correlation_subset)
# regression lines
smoothed_mean = function(data, mapping, ...) {
p = ggplot(data = data, mapping = mapping) +
geom_point() +
geom_smooth(method="loess", fill="red", color="red", ...) +
geom_smooth(method="gam", fill="blue", color="blue", ...)
p
}
# plot correlations
png("correlations.png", res = 100, width = 15, height = 10, units = "in")
ggpairs(correlation_subset, lower = list(continuous = smoothed_mean))
dev.off()
####################
# calculate scores
message(" ===== calculate scores ===== ")
scores = combined_summary
scores[,"chr"] = sub("(.*):.*-.*", "\\1", scores[,"#BIN"])
scores[,"start_pos"] = as.integer(sub(".*:(.*)-.*", "\\1", scores[,"#BIN"]))
scores[,"end_pos"] = as.integer(sub(".*:.*-(.*)", "\\1", scores[,"#BIN"]))
# calculate event percentiles after filtering out bins with 0 events
percentiles_giab = quantile(subset(combined_summary, events_GIAB > 0)[,"events_GIAB"], c(0.95, 0.99, 0.999, 0.9999))
score_cap_giab = percentiles_giab["99%"]
percentiles_1kg = quantile(subset(combined_summary, events_1KG > 0)[,"events_1KG"], c(0.95, 0.99, 0.999, 0.9999))
score_cap_1kg = percentiles_1kg["99%"]
# capped break point events
scores[, "events_GIAB_capped"] = scores[, "events_GIAB"]
scores[scores[, "events_GIAB"] > score_cap_giab,][, "events_GIAB_capped"] = score_cap_giab
scores[, "events_1KG_capped"] = scores[, "events_1KG"]
scores[scores[, "events_1KG"] > score_cap_1kg,][, "events_1KG_capped"] = score_cap_1kg
# break points score
breaks_scores_giab = scores[, "events_GIAB_capped"]
breaks_scores_giab = breaks_scores_giab / max(breaks_scores_giab)
breaks_scores_1kg = scores[, "events_1KG_capped"]
breaks_scores_1kg = breaks_scores_1kg / max(breaks_scores_1kg)
scores[, "breaks_score"] = (breaks_scores_giab + breaks_scores_1kg) / 2
# mapability score
scores[, "map_score"] = scores[, "Mapability50mer"] + scores[, "Mapability100mer"]
scores[, "map_score"] = scores[, "map_score"] / 2
scores[, "map_score"] = 1 - scores[, "map_score"]
# DangerTrack score
scores[, "danger_score"] = scores[, "map_score"] + scores[, "breaks_score"]
scores[, "danger_score"] = scores[, "danger_score"] / 2
# export BED file
scores_bed = scores[, c("chr", "start_pos", "end_pos", "#BIN", "danger_score")]
scores_bed[,"danger_score"] = round(scores_bed[,"danger_score"] * 1000, digits = 0)
write.table(scores_bed, file = "dangertrack.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
# export bedGraph file
scores_bedgraph = scores_bed[, c("chr", "start_pos", "end_pos", "danger_score")]
write.table(scores_bedgraph, file = "dangertrack.bedgraph", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
####################
# DangerTrack score and curated regions boxplots
# divide issues into bins
scores[,"GRC_issues_binned"] = "5-95%"
scores[scores[,"GRC_issues"] > .95,][, "GRC_issues_binned"] = "95-100%"
scores[scores[,"GRC_issues"] < .05,][, "GRC_issues_binned"] = "0-5%"
scores[,"ENCODE_DAC_blacklisted_binned"] = "5-95%"
scores[scores[,"ENCODE_DAC_blacklisted"] > .95,][, "ENCODE_DAC_blacklisted_binned"] = "95-100%"
scores[scores[,"ENCODE_DAC_blacklisted"] < .05,][, "ENCODE_DAC_blacklisted_binned"] = "0-5%"
# t-test for DangerTrack score of high and low issue bins
scores_grc_pval = t.test(scores[scores[,"GRC_issues"] < .05,][, "danger_score"],
scores[scores[,"GRC_issues"] > .95,][, "danger_score"])
scores_dac_pval = t.test(scores[scores[,"ENCODE_DAC_blacklisted"] < .05,][, "danger_score"],
scores[scores[,"ENCODE_DAC_blacklisted"] > .95,][, "danger_score"])
# p-value comparison braket
plot_p_bracket = data.frame(a = c(1, 1:3,3), b = c(1.02, 1.05, 1.05, 1.05, 1.02))
# boxplots
boxplot_grc = ggplot(scores, aes(GRC_issues_binned, danger_score)) +
geom_boxplot(fill = "steelblue3", outlier.color = "gray") +
geom_line(data = plot_p_bracket, aes(x = a, y = b)) +
annotate("text", x = 2, y = 1.1, label = "p < 2.2e-16") +
scale_y_continuous(breaks = seq(0, 1, 0.2)) +
labs(x = "GRC Issues Regions Overlap", y = "DangerTrack Score")
ggsave(filename = "score_GRC_issues.png", plot = boxplot_grc, width = 8, height = 5, units = "in")
boxplot_encode = ggplot(scores, aes(ENCODE_DAC_blacklisted_binned, danger_score)) +
geom_boxplot(fill = "steelblue3", outlier.color = "gray") +
geom_line(data = plot_p_bracket, aes(x = a, y = b)) +
annotate("text", x = 2, y = 1.1, label = "p < 2.2e-16") +
scale_y_continuous(breaks = seq(0, 1, 0.2)) +
labs(x = "ENCODE Blacklisted Regions Overlap", y = "DangerTrack Score")
ggsave(filename = "score_ENCODE_DAC_blacklisted.png", plot = boxplot_encode, width = 8, height = 5, units = "in")
####################
# chromosome plots
chromplot_score = data.frame(Chrom = scores[, "chr"], Start = scores[, "start_pos"], End = scores[, "end_pos"],
score = scores[, "danger_score"])
chromplot_issues = read.table(file = "issues.bed", col.names = c("Chrom", "Start", "End"),
check.names = FALSE, stringsAsFactors = FALSE)
chromplot_issues$Group = "Issues"
chromplot_issues$Colors = "dodgerblue4"
png("chrom.png", res = 200, width = 10, height = 8, units = "in")
chromPlot(chr = c(1:9), figCols = 9, yAxis = FALSE, segLwd = 5, noHist = TRUE, maxSegs = 99000, gaps = hg_gap,
stat = chromplot_score,
statCol = "score", colStat="firebrick", statName = "score", statTyp = "l", statSumm = "mean",
segment = chromplot_issues)
dev.off()
####################
# end