-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFigure_5.R
114 lines (74 loc) · 2.74 KB
/
Figure_5.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
setwd("/Users/datn/github/SNP_array_comparison")
setwd("data/real_data")
require(scales)
require(ggplot2)
require(data.table)
# p1: imputation
cutoffs = list("(0-0.01]" = c(0, 0.01), "(0.01-0.05]" = c(0.01, 0.05), "(0.01-0.5]" = c(0.01, 0.5), "(0.05-0.5]" = c(0.05,0.5))
get_mean_imp_acc <- function(df, cutoffs, flag = "flag"){
res = list()
for( i in 1: length(cutoffs)){
cut = cutoffs[[i]]
cut_name = names(cutoffs)[i]
df2 = df[df$AF > cut[1] & df$AF <= cut[2],]
m = mean(df2$r_2, na.rm =T)
t = paste(cut_name, flag, sep = "_")
r = c(m,t, flag)
res[[t]] = r
}
rdf = do.call("rbind", res)
return(rdf)
}
get_imp_acc_array <- function(array_path, flag){
res = list()
cutoffs = list("(0-0.01]" = c(0, 0.01), "(0.01-0.05]" = c(0.01, 0.05), "(0.01-0.5]" = c(0.01, 0.5), "(0.05-0.5]" = c(0.05,0.5))
for(i in 1:22){
x = fread( paste0(array_path, "/chr", i , "_imputed.correlation.txt.gz"))
res[[i]] = get_mean_imp_acc(x, cutoffs, flag)
}
df = do.call("rbind", res)
df = as.data.frame(df)
names(df) = c("imp_acc", "Bin", "Type")
df$imp_acc = as.numeric(df$imp_acc)
return(df)
}
# res = list()
# for(i in 1:22){
# # real
# x = fread( paste0("95_shared_sample_pmra/chr", i , "_imputed.correlation.txt.gz"))
# res[[i]] = get_mean_imp_acc(x, cutoffs, "Real_PMRA")
# }
# df1 = do.call("rbind", res)
# res = list()
# for(i in 1:22){
# # simulated
# x = fread( paste0("95_shared_sample_simulated/chr", i , "_imputed.correlation.txt.gz"))
# res[[i]] = get_mean_imp_acc(x, cutoffs, "Simulated_PMRA")
# }
# df2 = do.call("rbind", res)
df1 = get_imp_acc_array("pmra_real_array", "PMRA_Real")
df2 = get_imp_acc_array("pmra_simulated_array", "PMRA_Simulated")
df3 = get_imp_acc_array("gsa_real_array", "GSA_Real")
df4 = get_imp_acc_array("gsa_simulated_array", "GSA_Simulated")
df = rbind(df1, df2, df3, df4)
# names(df) = c("imp_acc", "Bin", "Type")
# df$imp_acc = as.numeric(df$imp_acc)
p1 <- ggplot(df, aes( y = imp_acc, x = Bin, color= Type)) + geom_boxplot() + theme_light() + ylab("Mean imputation r2") + xlab("") + guides(x = guide_axis(angle = 90))
pdf(file= "../../output_paper/Figure_5.pdf", width=8, height=6)
p1
dev.off()
bin = unique(df$Bin)
df$imp_acc = as.numeric(df$imp_acc)
mean_imp_acc = c()
sd_imp_acc = c()
for(b in bin){
t1 = mean(df$imp_acc[df$Bin == b])
t2 = sd(df$imp_acc[df$Bin == b])
mean_imp_acc = c(mean_imp_acc, t1)
sd_imp_acc = c(sd_imp_acc, t2)
}
mean_imp_acc = round(mean_imp_acc,4)
sd_imp_acc = round(sd_imp_acc,4)
mean_sd = paste(mean_imp_acc, sd_imp_acc, sep = "±")
stat_df = data.frame(Bin = bin, Imputation_accuracy = mean_sd)
write.csv(stat_df, file = "../../output_paper/tables/Imputation_acc_real_simulated_pmra_gsa.csv", quote = F, row.names = F)