-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcalculate_stats_gz.sh
100 lines (94 loc) · 3.46 KB
/
calculate_stats_gz.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#
# calculate_stats_gz.sh
# Bash script for calculating per variant statistics of gz.vcf files
#
# Copyright (C) 2021 Anestis Gkanogiannis <anestis@gkanogiannis.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
#$1 input
#$2 output
zcat $1 |\
awk -F"\t" '{line=$0} BEGIN { \
print "CHR\tPOS\tID\tA\ta\tAA\tAa\taa\taa_e\tAa_e\taa_e\tMissing\tMissPercentage\tAF_A\tAF_a\tMAF\tHo\tHe\tPIC\tF\tchi_sq_2df" \
} !/^#/ { AA=gsub(/\t(0\/0|0\|0)/,"") ;
Aa=gsub(/\t(0\/1|1\/0|0\|1|1\|0)/,"") ;
aa=gsub(/\t(1\/1|1\|1)/,"") ;
mis=gsub(/\t(.\/.|.\|.)/,"") ;
if (AA+Aa+aa==0) {
AA_e=0 ;
Aa_e=0 ;
aa_e=0 ;
hetperc="NA" ;
misperc=1 ;
AF_A="NA" ;
AF_a="NA" ;
MAF="NA" ;
Ho="NA" ;
He="NA" ;
PIC="NA" ;
Fst="NA" ;
chi_sq_1df="NA" ;
chi_sq_2df="NA" ;
}
else {
misperc=mis/(AA+Aa+aa+mis) ;
AF_A=(AA+0.5*Aa)/(AA+Aa+aa) ;
AF_a=(aa+0.5*Aa)/(AA+Aa+aa) ;
AA_e=(AA+Aa+aa)*(AF_A^2) ;
aa_e=(AA+Aa+aa)*(AF_a^2) ;
Aa_e=(AA+Aa+aa)*2.0*AF_A*AF_a ;
MAF=AF_a ;
if (AF_a>AF_A) MAF=AF_A ;
Ho=Aa/(AA+Aa+aa) ;
He=1.0-MAF^2-(1.0-MAF)^2 ;
PIC=He-2.0*(MAF^2)*((1.0-MAF)^2) ;
if (He==0) {Fst="NA" ;}
else {Fst=(He-Ho)/He ;}
if (AA_e!=0 && Aa_e!=0 && aa_e!=0){
chi_sq_2df=(((AA_e-AA)^2)/AA_e)+(((Aa_e-Aa)^2)/Aa_e)+(((aa_e-aa)^2)/aa_e);
}
else{
chi_sq_2df="NA";
}
}
print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t" \
AA "\t" \
Aa "\t" \
aa "\t" \
AA_e "\t" \
Aa_e "\t" \
aa_e "\t" \
mis "\t" \
misperc "\t" \
AF_A "\t" \
AF_a "\t" \
MAF "\t" \
Ho "\t" \
He "\t" \
PIC "\t" \
Fst "\t" \
chi_sq_2df \
}' > $1.stats.tmp ;
unset R_HOME ;
Rscript --vanilla <(echo -e "\
data<-read.table(file='$1.stats.tmp',\
colClasses=c('NULL','NULL','NULL','NULL','NULL','NULL','NULL','NULL','NULL','NULL',\
'NULL','NULL','NULL','NULL','NULL','NULL','NULL','NULL','NULL','NULL',NA),header=T,fill=T)
res.p2<-pchisq(data\$chi_sq_2df, df=2, lower.tail=FALSE)
write.table(res.p2,file='$1.stats.tmp2',sep='\t',col.names=F,row.names=F)") ;
sed -i '1iP2_value' $1.stats.tmp2 ;
paste $1.stats.tmp $1.stats.tmp2 > $2 ;
rm -f $1.stats.tmp $1.stats.tmp2 ;