-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathrunall.sh
143 lines (106 loc) · 4.53 KB
/
runall.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# instalar utilizando o brew install (gitpod)
brew install sratoolkit
# parallel fastq-dump
# https://github.com/rvalieris/parallel-fastq-dump
pip install parallel-fastq-dump
# rodar validate
# aperte a tecla X para sair
# vdb-config
echo "Aexyo" | vdb-config -i
# fastq-dump: comando que faz o download do arquivo utilizando o SRR ID da amostra
# -I: --readids da amostra
# --split-files: ele vai separar os arquivos fastq em 1 e 2 (paired)
time parallel-fastq-dump --sra-id SRR8856724 --threads 4 --outdir ./ --split-files --gzip
# AS Referências do Genoma hg38 (FASTA, VCFs)
# Os arquivos de Referência: Panel of Normal (PoN), Gnomad AF e Exac common:
# https://console.cloud.google.com/storage/browser/gatk-best-practices/somatic-hg38?project=broad-dsde-outreach
wget -c https://storage.googleapis.com/gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz
wget -c https://storage.googleapis.com/gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz.tbi
wget -c https://storage.googleapis.com/gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz
wget -c https://storage.googleapis.com/gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz.tbi
# Arquivo no formato FASTA do genoma humano hg38
# Diretório Download UCSC hg38: https://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/
# chr9.fa.gz: https://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr9.fa.gz
wget -c https://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr9.fa.gz
# BWA para mapeamento dos arquivos FASTQ (gitpod)
# Instalando o BWA no gitpod
brew install bwa
# descompacta o arquivo gz
gunzip chr9.fa.gz
# bwa index para indexar o arquivo .fa (~5min)
bwa index chr9.fa
# Samtools faidx
brew install samtools
samtools faidx chr9.fa
# BWA-mem para fazer o alinhamento (FASTQ -> BAM)
NOME=WP312; Biblioteca=Nextera; Plataforma=illumina;
bwa mem -t 13 -M -R "@RG\tID:$NOME\tSM:$NOME\tLB:$Biblioteca\tPL:$Plataforma" \
chr9.fa \
SRR8856724_1.fastq.gz \
SRR8856724_2.fastq.gz > WP312.sam
# delete dados intermediarios
rm -f SRR8856724_1.fastq.gz SRR8856724_2.fastq.gz
# samtools: fixmate, sort e index (~min)
# -@ numero de cores utilizados
time samtools fixmate -@10 WP312.sam WP312.bam
# delete dados intermediarios
rm -f WP312.sam
# ordenando o arquivo fixmate
time samtools sort -O bam -@6 -m2G -o WP312_sorted.bam WP312.bam
# delete dados intermediarios
rm -f WP312.bam
# indexando o arquivo BAM ordenado (sort)
time samtools index WP312_sorted.bam
# abordagem de target sequencing utilizamos o rmdup para remover duplicata de PCR
time samtools rmdup WP312_sorted.bam WP312_sorted_rmdup.bam
# delete dados intermediarios
rm -f WP312_sorted.bam
# indexando o arquivo BAM rmdup
time samtools index WP312_sorted_rmdup.bam
# Cobertura - make BED files
# Instalação do bedtools
brew install bedtools
# Gerando BED do arquivo BAM
bedtools bamtobed -i WP312_sorted_rmdup.bam > WP312_sorted_rmdup.bed
bedtools merge -i WP312_sorted_rmdup.bed > WP312_sorted_rmdup_merged.bed
bedtools sort -i WP312_sorted_rmdup_merged.bed > WP312_sorted_rmdup_merged_sorted.bed
# Cobertura Média
bedtools coverage -a WP312_sorted_rmdup_merged_sorted.bed \
-b WP312_sorted_rmdup.bam -mean > WP312_coverageBed.bed
# Filtro por total de reads >=20
cat WP312_coverageBed.bed | awk -F "\t" '{if($4>20){print}}' > WP312_coverageBed20x.bed
# GATK4 instalação (MuTect2 com PoN)
# Download
wget -c https://github.com/broadinstitute/gatk/releases/download/4.2.2.0/gatk-4.2.2.0.zip
# Descompactar
unzip gatk-4.2.2.0.zip
# Gerar arquivo .dict
./gatk-4.2.2.0/gatk CreateSequenceDictionary -R chr9.fa -O chr9.dict
# Gerar interval_list do chr9
./gatk-4.2.2.0/gatk ScatterIntervalsByNs -R chr9.fa -O chr9.interval_list -OT ACGT
# Converter Bed para Interval_list
./gatk-4.2.2.0/gatk BedToIntervalList -I WP312_coverageBed20x.bed \
-O WP312_coverageBed20x.interval_list -SD chr9.dict
# GATK4 - CalculateContamination
./gatk-4.2.2.0/gatk GetPileupSummaries \
-I WP312_sorted_rmdup.bam \
-V af-only-gnomad.hg38.vcf.gz \
-L chr9.interval_list \
-O WP312.table
./gatk-4.2.2.0/gatk CalculateContamination \
-I WP312.table \
-O WP312.contamination.table
# GATK4 - MuTect2 Call
./gatk-4.2.2.0/gatk Mutect2 \
-R chr9.fa \
-I WP312_sorted_rmdup.bam \
--germline-resource af-only-gnomad.hg38.vcf.gz \
--panel-of-normals 1000g_pon.hg38.vcf.gz \
-L WP312_coverageBed20x.interval_list \
-O WP312.somatic.pon.vcf.gz
# GATK4 - MuTect2 FilterMutectCalls
./gatk-4.2.2.0/gatk FilterMutectCalls \
-R chr9.fa \
-V WP312.somatic.pon.vcf.gz \
--contamination-table WP312.contamination.table \
-O WP312.filtered.pon.vcf.gz