-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGSE103465.r
395 lines (312 loc) · 14.6 KB
/
GSE103465.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
################################################################################
## guo_GSE103465.r
## 2021-02-15
## Francisco Martínez Picó - francisco9896@gmail.com
################################################################################
# Dataset info avaliable in:
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE103465
Sys.info()[c('nodename', 'user')]
rm(list = ls())
R.version.string # 'R version 4.0.3 (2020-10-10)'
# LOAD PACKAGES -----------------------------------------------------------
library(Biobase)
library(GEOquery)
library(tseries)
library(affy)
library(affyPLM)
library(ggplot2)
library(RColorBrewer)
library(AnnotationDbi)
library(limma)
library(dplyr)
library(sva)
library(arrayQualityMetrics)
functions_path = '/Users/francisco/Desktop/TFM/functions'
dataset_path = '/Users/francisco/Desktop/TFM/datasets/GSE103465_guo'
data_path = paste0(dataset_path, '/data')
raw_plots_path = paste0(dataset_path, '/raw_plots')
normal_plots_path = paste0(data_path, '/normal_plots')
results_path = '/Users/francisco/Desktop/TFM/datasets/results/results_sva_2' # modify
source(paste0(functions_path, '/function_arbol.r'))
source(paste0(functions_path, '/function_plot_PCAscores.r'))
getwd()
setwd(dataset_path)
# DOWNLOAD RAW DATA FOR GUO (GSE103465) -------------------------------
getGEOSuppFiles('GSE103465') # Raw dataset provided as supplementary file
system('tar xvf GSE103465/GSE103465_RAW.tar')
gse103465raw = ReadAffy()
system('rm -fr GSE103465')
system('rm *.gz')
# SAVE RAW DATA -----------------------------------------------------------
save(gse103465raw, file = paste0(data_path, '/gse103465raw.rda'), version = 2)
load(paste0(data_path, '/gse103465raw.rda'), verbose = T) # raw data
# PRE-NORMALIZATION ANALYSIS ----------------------------------------------
# GSE = getGEO('GSE103465') # series_matrix
# dim(exprs(gse103465raw)) # 535.824 probes y 6 samples
# annotation(gse103465raw) # chip 'primeview'
head(exprs(gse103465raw))
# colnames(gse103465raw) = gsub(x = colnames(gse103465raw),
# pattern = '_.*.CEL.gz', replacement = '')
#### MDplot ####
# Change names to get better plots:
old_rownames_raw = rownames(pData(gse103465raw))
fancy_rownames_raw = gsub(x = colnames(gse103465raw),
pattern = '_.*.CEL.gz', replacement = '')
rownames(pData(gse103465raw)) = fancy_rownames_raw
affy::MAplot(gse103465raw, type = 'pm', plot.method = 'smoothScatter')
#### Density estimator ####
affy::hist(gse103465raw)
legend(x = 'topright', # Position
legend = rownames(pData(gse103465raw)), # Legend
lty = c(1:6), # Line types
col = c(1:6), # Line colors
lwd = 2) # line width
#### Boxplot ####
affy::boxplot(gse103465raw)
#### Clustering ####
datos_raw = exprs(gse103465raw)
colnames(datos_raw) = rownames(pData(gse103465raw))
# Make a data.frame to colour our groups
group.raw = c(rep('control', 3), rep('RIF', 3))
sample.name = colnames(datos_raw)
sinfo_raw = as.data.frame(cbind(sample.name, group.raw), as.is = T)
char.group = as.character(sinfo_raw[, 'group.raw'])
tan = unique(char.group)
micolor = rainbow(length(tan))
names(micolor) = tan
sinfo_raw$color.group = micolor[char.group]
sinfo_raw
## Clustering by correlation distance
correlacion_raw = cor(datos_raw)
distancia_raw = as.dist((1 - correlacion_raw) / 2)
hc_raw = hclust(distancia_raw)
table(hc_raw$labels == colnames(datos_raw)) # same layout?
hc_raw$clase = sinfo_raw$group # Colour
## Plot using 'function_arbol.r' ##
arbol(cluster = hc_raw, main = 'Clustering by Correlation Distance')
# RAW PCA -----------------------------------------------------------------
pca_raw = prcomp(t(log2(exprs(gse103465raw)) + 1))
# t cause we need samples in rows
# We can check PCA results
# see:
# https://towardsdatascience.com/principal-component-analysis-pca-101-using-r
# consider 'center = TRUE, scale = TRUE'
# screeplot(pca, type = 'l', npcs = 6, main = 'Screeplot of the first 6 PCs')
# abline(h = 1, col = 'red', lty = 5)
# legend('topright', legend = c('Eigenvalue = 1'),
# col = c('red'), lty = 5, cex = 0.6)
var_raw = round(summary(pca_raw)$importance[2, c(1,2)] * 100, 1) # rounds 1st d
toplot_raw = data.frame(pca_raw$x[, c(1,2)], stringsAsFactors = F)
lim_raw = max(abs(c(min(toplot_raw), max(toplot_raw)))) # axis values limit
axis_limits_raw = c(-lim_raw, lim_raw)
# toplot$color = c(paste0('Control-', 1:3), paste0('RIF-', 1:3))
toplot_raw$color = c(rep('control', 3), rep('RIF', 3))
# png(filename = paste0(raw_plots_path, '/PCAplot_group.png'), # File name
# width = 600, height = 600, # Width and height in pixels
# unit = 'px', # Width and height units, pixels
# bg = 'white') # Background color
ggplot(data = toplot_raw,
aes_string(x = colnames(toplot_raw)[1], y = colnames(toplot_raw)[2])) +
geom_point(aes(color = color), size = 3) +
# scale_color_manual(name = 'RIF', values = brewer.pal(n = 3, 'Dark2')) +
scale_color_manual(name = 'RIF', values = c('#D95F02', '#1B9E77')) +
geom_text_repel(label = rownames(pData(gse103465raw)), size = 3) +
xlab(paste0('PC1', ': ', var_raw[1], '%')) +
ylab(paste0('PC2', ': ', var_raw[2], '%')) +
ggtitle('PCA: Guo (GSE103465)') +
xlim(axis_limits_raw) + ylim(axis_limits_raw) +
theme_light() +
theme(legend.position = 'bottom',
axis.title = element_text(size = 18),
axis.text = element_text(size = 15),
plot.title = element_text(size = 22, hjust = 0.5),
legend.title = element_blank(),
legend.text = element_text(size = 13))
# dev.off()
# OUTLIER DETECTION (ARRAY QUALITY METRICS) -------------------------------
outdir = paste0(dataset_path, '/arrayQuality_report')
arrayQualityMetrics(expressionset = gse103465raw,
outdir = outdir,
force = TRUE,
do.logtransform = TRUE) # Since data is not processed yet.
# Check index.html file in outdir for results.
# EXCLUDING GSM2771797 (CONTROL-3) ----------------------------------------
rownames(pData(gse103465raw)) = old_rownames_raw # needed for rma work properly
# According to all previous analysis we decide to exclude control-3 (GSM2771797)
gse103465raw_no3 = gse103465raw[, c(1, 2, 4, 5, 6)]
# NORMALIZATION -----------------------------------------------------------
gse103465 = affy::rma(gse103465raw_no3)
# POST-NORMALIZATION ANALYSIS ---------------------------------------------
#### MDplot ####
old_rownames_norm = rownames(pData(gse103465))
fancy_rownames_norm = gsub(x = colnames(gse103465),
pattern = '_.*.CEL.gz', replacement = '')
rownames(pData(gse103465)) = fancy_rownames_norm
affy::MAplot(gse103465, plot.method = 'smoothScatter')
#### Density estimator ####
affy::hist(gse103465)
legend(x = 'topright', # Position
legend = rownames(pData(gse103465)), # Legend texts
lty = c(1:6), # Line types
col = c(1:6), # Line colors
lwd = 2) # line width
#### Boxplot ####
affy::boxplot(gse103465)
#### Clustering ####
datos_norm = exprs(gse103465)
colnames(datos_norm) = rownames(pData(gse103465))
# Make a data.frame to colour our groups
group.norm = c(rep('control', 2), rep('RIF', 3))
sample.name = colnames(datos_norm)
sinfo_norm = as.data.frame(cbind(sample.name, group.norm), as.is = T)
char.group = as.character(sinfo_norm[, 'group.norm'])
tan = unique(char.group)
micolor = rainbow(length(tan))
names(micolor) = tan
sinfo_norm$color.group = micolor[char.group]
sinfo_norm
# Clustering by correlation distance
correlacion_norm = cor(datos_norm)
distancia_norm = as.dist((1 - correlacion_norm) / 2)
hc_norm = hclust(distancia_norm)
table(hc_norm$labels == colnames(datos_norm)) # same layout?
hc_norm$clase = sinfo_norm$group # Colour
# Plot using 'function_arbol.r'
arbol(cluster = hc_norm, main = 'Clustering by Correlation Distance')
# NORMALIZED PCA ----------------------------------------------------------
pca_norm = prcomp(t(exprs(gse103465)))
# screeplot(pca, type = 'l', npcs = 6, main = 'Screeplot of the first 6 PCs')
# abline(h = 1, col = 'red', lty = 5)
# legend('topright', legend = c('Eigenvalue = 1'),
# col = c('red'), lty = 5, cex = 0.6)
var_norm = round(summary(pca_norm)$importance[2, c(1,2)] * 100, 1)
toplot_norm = data.frame(pca_norm$x[, c(1,2)], stringsAsFactors = F)
lim_norm = max(abs(c(min(toplot_norm), max(toplot_norm))))
axis_limits_norm = c(-lim_norm, lim_norm)
toplot_norm$color = c(rep('control', 2), rep('RIF', 3))
# png(filename = paste0(normal_plots_path, '/PCAplot_group.png'), # File name
# width = 650, height = 650, # Width and height in pixels
# unit = 'px', # Width and height units, pixels
# bg = 'white') # Background color
ggplot(data = toplot_norm,
aes_string(x = colnames(toplot_norm)[1], y = colnames(toplot_norm)[2])) +
geom_point(aes(color = color), size = 3) +
scale_color_manual(name = 'RIF', values = c('#D95F02', '#1B9E77')) +
# geom_text_repel(label = rownames(pData(gse103465)), size = 3) +
xlab(paste0('PC1', ': ', var_norm[1], '%')) +
ylab(paste0('PC2', ': ', var_norm[2], '%')) +
ggtitle('PCA: Guo (GSE103465)') +
xlim(axis_limits_norm) + ylim(axis_limits_norm) +
theme_light() +
theme(legend.position = 'bottom',
axis.title = element_text(size = 18),
axis.text = element_text(size = 15),
plot.title = element_text(size = 22, hjust = 0.5),
legend.title = element_blank(),
legend.text = element_text(size = 13))
# dev.off()
# ANNOTATION --------------------------------------------------------------
#### pData ####
factores = c(0, 0, 1, 1, 1)
condition = factor(factores, levels = 0:1, labels = c('control','RIF'))
infor_fenotip = data.frame(pData(gse103465), condition)
pData(gse103465) = infor_fenotip
rownames(pData(gse103465)) = fancy_rownames_norm
pData(gse103465)$sample = fancy_rownames_norm
pData(gse103465)
#### fData ####
annotation(gse103465) # primeview
# We need to create primeview.db since this annotation info does not exist.
# More info in:
# /Estudios_InSilico/Trabajo/RIF_gse103465.Rmd
# https://support.bioconductor.org/p/130727/
# require(primeview.db)
# Or alternatively, we read GPL (chip/platform) info:
gpl = read.delim(file = paste0(data_path, '/GPL16043-14507.txt'), header = T,
comment.char = '#', stringsAsFactors = F, sep = '\t')
# Expression info:
exprinfo = exprs(gse103465)
# head(gpl)
dim(gpl) # 49.495 rows
dim(exprinfo)# 49.495 rows
# Are rows ordered?
gpl$ID[(!gpl$ID == rownames(exprs(gse103465)))] # No order, show not-ordered
# When multiple gene symbols with '///', we keep the first:
gpl$Gene.Symbol = unlist(lapply(gpl$Gene.Symbol, function(my_symbol){
unlist(strsplit(x = my_symbol, split = ' /// '))[1]
}))
# Which positions in gpl are not in order?:
which(gpl$ID%in%gpl$ID[(!gpl$ID == rownames(exprs(gse103465)))])
# [1] 49301 49302 49303 49304 49305 49306 49307 49308 49309 49310 49311 49312
# 49313 49314 49315 49316 49317 49318 49319 49320 49321 49322
# [23] 49323 49324 49325 49326 49327 49328 49329 49330 49331 49332 49333 49334
# 49335 49336 49337 49338 49339 49340 49341 49342 49343 49344
# [45] 49345 49346 49347 49348 49349 49350 49351 49352 49353 49354 49355 49356
# 49357 49358 49359 49360 49361 49362 49363 49364 49365 49366
# [67] 49367 49368 49369 49370 49371 49372 49373 49374 49375 49376 49377 49378
# 49379 49380 49381 49382 49383 49384 49385 49386 49387 49388
# [89] 49389 49390 49391 49392 49393 49394 49395
gpl_filtered = gpl[-c(49301:49395), ] # Ordered rows in gpl.
not_ordered_rows_gpl = gpl[c(49301:49395), ]
dim(gpl_filtered) # [1] 49400 24
exprinfo_filtered = exprinfo[-c(49301:49395), ]
not_ordered_rows_exprinfo = as.data.frame(exprinfo[c(49301:49395),])
dim(exprinfo_filtered) # [1] 49400 5
# Ordered positions are the same:
all(gpl_filtered$ID == rownames(exprinfo_filtered))
# Add a column with probe id to order the disordered rows:
not_ordered_rows_exprinfo$ID = rownames(not_ordered_rows_exprinfo)
rows_to_unit = full_join(x = not_ordered_rows_gpl,
y = not_ordered_rows_exprinfo,
by = 'ID')
rownames(rows_to_unit) = rows_to_unit$ID
dim(rows_to_unit) # 95 rows to add in order
gpl_filtered = rbind(gpl_filtered, rows_to_unit[, 1:24])
dim(gpl_filtered) # [1] 49495 24
exprinfo_filtered = rbind(exprinfo_filtered, rows_to_unit[, 25:29])
dim(exprinfo_filtered) # [1] 49495 5
table(gpl_filtered$ID == rownames(exprinfo_filtered))
#### experimentData ####
infoData = new('MIAME',
name = 'Guo et al.',
lab = 'Ruijin Hospital, Shanghai Jiaotong University',
contact ='Mingjie Wang <huzai920621@126.com>',
title = ' Characterization of gene expression profile during the
implantation phase in IVF cycles in endometrium of women with recurrent
implantation failure',
abstract = 'Our study was designed with an aim to compare the gene
expression profile of RIF women with that of healthy proven fertile
controls to explore the influence factors of endometrial receptivity and
the mechanisms of RIF.',
url = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE103465')
experimentData(gse103465) = infoData
experimentData((gse103465))
# SAVE NORMALIZED AND ANNOTATED DATA --------------------------------------
save(gse103465, file = paste0(data_path, '/gse103465.rda'), version = 2)
load(paste0(data_path, '/gse103465.rda'), verbose = T)
# CREATE ED AND DAT -------------------------------------------------------
#### Removing NAs from annotation ####
gpl_filtered_notNA = gpl_filtered[!is.na(gpl_filtered$Gene.Symbol), ]
exprinfo_filtered_notNA = exprinfo_filtered[!is.na(gpl_filtered$Gene.Symbol), ]
dim(gpl_filtered_notNA) # [1] 49372 24
dim(exprinfo_filtered_notNA) # [1] 49372 5
#### Group expression info by gene symbol ####
dat = avereps(exprinfo_filtered_notNA, ID = gpl_filtered_notNA$Gene.Symbol)
# Correct sample name from dat:
colnames(dat) = gsub(pattern = '_.*', replacement = '', x = colnames(dat))
head(dat)
#### We create the ed ####
pData(gse103465)
ed = pData(gse103465) # 'ed' after experimental dessign
# rownames(ed) = colnames(dat)
# SAVE .RDA ---------------------------------------------------------------
save(dat, ed, file = paste0(results_path, '/guo.rda'), version = 2)
# CHECK DAT AND ED ----------------------------------------------------------
load(paste0(results_path, '/guo.rda'), verbose = T)
head(dat)
head(ed)
plot_PCAscores(dat = dat, ed = ed,
condition1 = 'condition',
components = c(1,2),
colors = c('#D95F02', '#1B9E77'),
title = 'PCA: Guo (GSE103465)')