-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenotypeHLA.py
executable file
·296 lines (272 loc) · 12.4 KB
/
genotypeHLA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#!/usr/bin/env python
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
import click
import sys
import ssw
import operator
import os # for basename
# it is a global variable - once it is refactored to be OO, it will disappear
# the whole point is to map HLA00005 to HLA*02:01:01:01
gHLAtypes = {}
def validate_locus(ctx,param,value):
try:
assert value in ["HLA-A","HLA-B", "HLA-C", "HLA-DPA1", "HLA-DPB1", "HLA-DQA1", "HLA-DQB1", "HLA-DRA", "HLA-DRB1", "HLA-DRB3", "HLA-DRB4", "HLA-DRB5", "HLA-DRB6", "HLA-DRB7", "HLA-DRB8", "HLA-DRB9"]
except:
raise click.BadParameter('Please define locus as HLA-A, HLA-B, HLA-DRB1 ... as you can find in awk -F[\*\ ] \'/^DE/ && /HLA/ {print $4}\' hla.dat|sort -u')
return value
@click.command(context_settings = dict( help_option_names = ['-h', '--help'] ))
@click.option('--cons','-c', type=str, help='FASTA file containing consensuses (or a single consensus)')
@click.option('--dat','-d', type=str, help='the IMGT/HLA reference hla.dat file from ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/hla.dat')
@click.option('--locus','-l', type=str, help='the locus [either HLA-A, HLA-B, HLA-DRB1 ...]',default="HLA-A",callback=validate_locus)
def doGenotype(cons, dat, locus):
fixedFile = fixIMGTfile(dat)
( primaryExons, secondaryExons, genomicRefs) = getCompartmenstForAlleles(fixedFile,locus)
# for each consensus
# preselect types considering only the important exons
# refine the preselected list by checking mismatch in the secondary exons
# final touches by looking at the introns/UTRs
for seq_record in SeqIO.parse(cons,"fasta"):
print " Start Processing Consensus: " + seq_record.id
print "################################################"
# select genotypes considering only the important exons
genotypes = preSelectTypes(primaryExons,seq_record,locus)
print "___________________PRESELECTED_____________________"
printGenotypes(genotypes)
success = True # we will change it for failure
if len(genotypes) == 0: # we have failed for some reason
success = False
print "Could not find a proper type for consensus"
elif len(genotypes) == 1: # there is a single genotype only: no need to shrink the candidate set
print "Final HLA type(s) for consensus: "
print gHLAtypes[genotypes[0]]
else: # there are more than one type candidates, go for exons
try:
print "Selecting best alleles using the whole available CDS"
genotypes = selectGenotypesConsideringCommonExons(genotypes, secondaryExons,seq_record)
print "___________________PRESELECTED_____________________"
printGenotypes(genotypes)
except KeyError:
success = False
print "***************************** FAILURE **********************************"
print "* Could not find a common set of exons for typing. Current candidates: *"
print "************************************************************************"
printGenotypes(genotypes)
print " Fitting genomic sequences for the consensus: "
# this part of code is executed all the time:
if len(genotypes) > 1:
print "Selecting best alleles using genomic sequences"
genotypes = selectGenotypesConsideringIntronsAndUTRs(genotypes, genomicRefs, seq_record)
print "############## SUCCESS #########################"
print "Final HLA types for consensus:"
printGenotypes(genotypes)
print "################################################"
def getCompartmenstForAlleles(fixedIMGT,locus):
"""
Go through each entry in the IMGT/HLA EMBL file, and store out exons in a dictionary.
For Class-I we are storing exons 2 & 3, for Class-II only exon 2.
The data is is like
primary{}
ex2='ACTGATCGATCGATACG'
ex3='CCAGGCCTGGATCGCATTAGC'
primary['HLA000101']=[ex2,ex3]
{'HLA000101': ['ACTGATCGATCGATACG', 'CCAGGCCTGGATCGCATTAGC']}
"""
print "Processing reference IMGT file"
primary = {}
secondary = {}
genomicRefs = {}
for seq_record in SeqIO.parse(fixedIMGT,"imgt"):
gHLAtypes[seq_record.id] = seq_record.description
# if it is the correct locus and there is a sequence record (not a deleted one)
if seq_record.description.startswith(locus) and len(seq_record.seq) > 1:
primary[seq_record.id] = getPrimaryExons(seq_record, locus)
secondary[seq_record.id] = getSecondaryExons(seq_record, locus)
genomicRefs[seq_record.id] = str(seq_record.seq)
print "ready"
print "################################################"
return (primary,secondary,genomicRefs)
def getPrimaryExons(sr,locus):
"""
Primary exons are exon 2 and 3 for HLA-A,B,C and exon 2 for all the rest.
TODO: Note, this is actually wrong. There are quite a few other loci where the important polymorphic
exons is not exon 2 only, but in the moment we are ignoring this fact.
"""
exonList = []
for f in sr.features:
if f.type == "exon":
if f.qualifiers['number'] == ['2'] :
exonList.append( sr.seq[f.location.start:f.location.end] )
if locus in ["HLA-A","HLA-B", "HLA-C"] and f.qualifiers['number'] == ['3']:
exonList.append( sr.seq[f.location.start:f.location.end] )
return exonList
def getSecondaryExons(sr,locus):
"""
We are adding all the other exons as secondary.
Instead of a list, we have to add qualifiers as well, since for some alleles there are 5 exons defined, and only 3 for the other, etc.
The data looks like:
{
'allele1': { 'ex1': seq, 'ex4': seq, 'ex5': seq},
'allele2': { 'ex1': seq, 'ex4': seq, 'ex5': seq},
'allele3': { 'ex1': seq, 'ex4': seq},
...
}
For a set like that only ex1 and ex4 are in the smallest common set.
So, we are returning with the { 'ex1': seq, 'ex4': seq, 'ex5': seq,...} part
"""
exonDict = {}
for f in sr.features:
if f.type == "exon": # consider only exons
# treat Class-I and Class-II separately: it can be faster, but it is more readable this way
# Class-I first
if locus in ["HLA-A","HLA-B", "HLA-C"] and f.qualifiers['number'][0] not in ['2','3']:
exonDict['ex'+f.qualifiers['number'][0] ] = str( sr.seq[f.location.start:f.location.end] )
# Class-II and other (all non-Class-I entries)
elif locus not in ["HLA-A","HLA-B", "HLA-C"] and f.qualifiers['number'] != ['2']:
exonDict['ex'+f.qualifiers['number'][0] ] = str( sr.seq[f.location.start:f.location.end] )
return exonDict
def getBestScoringAlleles( sortedTupleList ):
"""
We are expecting a sorted list of tuples, containing allele names and alignment scores like
[('HLA10395.1', 545), ('HLA06895.1', 545), ('HLA14515.1', 540), ('HLA12165.1', 532), ('HLA03670.1', 532)]
We are returning with a list of best matches:
['HLA10395.1','HLA06895.1']
"""
(firstAllele,bestScore) = sortedTupleList[0]
bestAlleles = [firstAllele]
for (allele,score) in sortedTupleList[1:]:
if bestScore > score:
break
else:
bestAlleles.append(allele)
return bestAlleles
def preSelectTypes(primary,consensus, locus):
"""
For each primary exon (or exon pair)
make an alignment for exon 2, and put the result into a dictionary as alignmentEx2['allele'] = #score
# we are getting [score, matches, mismatches, inserts, deletions] if we want to
if there is exon 3 also,
do an alignmentEx3['allele'] = #mismatches
merge the two in a way that sort both, keep the best for both, and make an intersect
else
sort, and keep the best alignments only
"""
print "Selecting best alleles using primary exons"
# we are going to use https://github.com/vishnubob/ssw that is using
# https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library for SW and SAM output
# unfortunatelly it can not made multiprocess yet, but it is fast enough
isClassI = locus in ["HLA-A","HLA-B", "HLA-C"]
alignmentEx2 = {}
alignmentEx3 = {}
sw = ssw.Aligner()
for allele,exons in primary.items():
alignment = sw.align(str(consensus.seq),exons[0])
alignmentEx2[allele] = alignment.score
# ditto for exon 3 if it is not a Class-I
if isClassI:
alignment = sw.align(str(consensus.seq),exons[1])
alignmentEx3[allele] = alignment.score
# print allele + " scores: exon 2: " + str(alignmentEx2[allele]) + " exon 3 " + str(alignmentEx3[allele])
# sort the dict by values and reverse the result
bestEx2 = getBestScoringAlleles( sorted(alignmentEx2.items(),key=operator.itemgetter(1),reverse=True) )
if isClassI:
bestEx3 = getBestScoringAlleles( sorted(alignmentEx3.items(),key=operator.itemgetter(1),reverse=True) )
# return with the intersect of the two sets, leaving only entries that are in the besty matching set for both exon 2 and exon 3
# or exon 2 only for other than Class-I alleles
print "done"
return list(set(bestEx2) & set(bestEx3)) if isClassI else bestEx2
def getCommonExons(genotypes,exons):
"""
genotypes is a list of strings like
['HLA10395.1','HLA06895.1']
exons is a dict having dicts as values like
{
'HLA10395.1': { 'ex1': seq, 'ex4': seq, 'ex5': seq},
'HLA10495.1': { 'ex1': seq, 'ex4': seq, 'ex5': seq},
'HLA10355.1': { 'ex1': seq, 'ex4': seq},
...
}
"""
# as the initial value of the common set, get exons of the first allele
commonExons = set(exons[genotypes[0]].keys())
# print "starting from "
# print genotypes[0], commonExons
for gt in genotypes[0:]:
commonExons = commonExons & set( exons[gt].keys() )
#print gt,exons[gt].keys(),commonExons
# print "Common exons: "
# print commonExons
return commonExons
def printGenotypes(genotypes):
for gt in genotypes:
print gHLAtypes[gt]
def selectGenotypesConsideringCommonExons(genotypes,secondary,consensus):
# select a set of common exons
print "________________ITERATING ON COMMON EXONS____________________"
numOfCandidates = len(genotypes)
commonExons = getCommonExons(genotypes,secondary)
print commonExons
# select the set of best matching alleles for the very first exon in the list
# we have to do it for all exons, but initialize for the first
printGenotypes(genotypes)
newGenotypes = set(genotypes) & getBestScoringAllelesForExon(genotypes,commonExons.pop(),secondary,consensus)
while commonExons:
newGenotypes = set(newGenotypes) & getBestScoringAllelesForExon(newGenotypes,commonExons.pop(),secondary,consensus)
#print " ---------------- new genotypes: ----------------------"
#printGenotypes(list(newGenotypes))
if numOfCandidates == len(newGenotypes):
# we were not able to decrease the number of candidates
return list(newGenotypes)
else:
return selectGenotypesConsideringCommonExons(list(newGenotypes),secondary,consensus)
def getBestScoringAllelesForExon(genotypes,commonExon,secondary,consensus):
exAlign = {}
sw = ssw.Aligner()
for allele in genotypes:
exonSeq = secondary[allele][commonExon]
alignment = sw.align(str(consensus.seq),exonSeq)
exAlign[allele] = alignment.score
return set(getBestScoringAlleles( sorted(exAlign.items(),key=operator.itemgetter(1),reverse=True)) )
def selectGenotypesConsideringIntronsAndUTRs(genotypes,genomicRefs,consensus):
"""
Now we are doing something similar as in getBestScoringAllelesForExon(), but for whole sequences
"""
alleleAlign = {}
sw = ssw.Aligner()
for allele in genotypes:
alleleSeq = genomicRefs[allele]
alignment = sw.align(str(consensus.seq),alleleSeq)
alleleAlign[allele] = alignment.score
return set(getBestScoringAlleles( sorted(alleleAlign.items(),key=operator.itemgetter(1),reverse=True)) )
def fixIMGTfile(hladat):
"""
For some reason IMGT is not following the standard EMBL format, so we have to add entries to the ID line
So, we will add an extra "IMGT;" as:
ID HLA00001; SV 1; standard; DNA; HUM; 3503 BP.
going to be ->
ID HLA00001; SV 1; standard; DNA; IMGT; HUM; 3503 BP.
"""
newFileName = "fixed" + os.path.basename(hladat)
fixedfh = open(newFileName,"w")
with open(hladat,"r") as fh:
for line in fh:
if line.startswith("ID"):
parts = line.split()
line = parts[0] + " " + \
parts[1] + " " +\
parts[2] + " " +\
parts[3] + " " +\
parts[4] + " " +\
parts[5] + " " +\
"IMGT; " +\
parts[6] + " " +\
parts[7] + " " +\
parts[8] + "\n"
fixedfh.write(line)
return newFileName
if __name__ == "__main__":
doGenotype()
# import pdb
# pdb.set_trace()