-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepareNCBItaxFiles.py
executable file
·124 lines (111 loc) · 5.44 KB
/
prepareNCBItaxFiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
'''
Created on Aug 29, 2013
@author: ranko
'''
''' FUNCTION(s) THAT CREATE D.gtxt files from the NCBI taxonomy.
.gtxt files are used to generate .gv (graphviz) files
and are used with taxmapper to merge results with the NCBI taxonomy.
'''
import parseNCBItaxonomy as pNT
#taxIDNames = {}
#taxMap = {}
class TaxIdDataDirectedT():
    '''One directed taxonomy link: startTax -> targetTax.

    Attributes:
        startTax:  label of the node the link starts from
        targetTax: label of the node the link points to
        rank:      empty string on construction; filled in by the caller
    '''
    def __init__(self, _startTax, _targetTax):
        self.startTax, self.targetTax = _startTax, _targetTax
        self.rank = ''
def prepareNCBITaxForDrawing(namesFile, nodesFile, drawRanks='SGFOCPKD', doTrimSName='Y', doDummies=True, verbose=False):
    '''Dump the NCBI taxonomy as a .gtxt node/edge list for the given ranks.

    Initialises parseNCBItaxonomy (pNT) from the two dump files, visits every
    tax ID in pNT.taxIDNames and, for each taxon whose 'species' rank name is
    not 'N/D', records the (start, target) pairs of its rank list.  The result
    is written to 'taxOut_<drawRanks>.gtxt' ('taxOutD_<drawRanks>.gtxt' when
    doDummies is true) in the current working directory: first one quoted
    line per start node, then one '"start" -> "target"' line per node.

    Arguments:
        namesFile:   names file handed to pNT.initTax (names.dmp)
        nodesFile:   nodes file handed to pNT.initTax (nodes.dmp)
        drawRanks:   rank string forwarded to getTax9E / getTax9EwDummy and
                     embedded in the output file name
        doTrimSName: forwarded unchanged to getTax9E / getTax9EwDummy
        doDummies:   True  -> use getTax9EwDummy and the 'taxOutD' prefix,
                     False -> use getTax9E and the 'taxOut' prefix
        verbose:     True  -> progress every 10000 taxa (plus 'grabbed' lines),
                     False -> progress every 50000 parsed taxa only

    Returns nothing; progress goes to stdout, results go to the .gtxt file.

    All print calls take a single pre-formatted string so the output is
    identical under Python 2 and Python 3.
    '''
    print(' ---> LOADING TAXBASE ')
    pNT.initTax(namesFile, nodesFile)
    print(' ---> DONE ')
    print(' ---> GRABBIN ALL --- ')

    allTaxes = {}      # start label -> TaxIdDataDirectedT (later pairs overwrite earlier ones)
    parsed = 0         # every tax ID visited
    grabbed = 0        # tax IDs that have a species-level record
    # verbose reports every 10000 parsed taxa, otherwise every 50000
    reportStep = 10000 if verbose else 50000

    print('total taxa:  %d' % len(pNT.taxIDNames.keys()))
    for taxID in pNT.taxIDNames.keys():
        tax = pNT.getTaxFromTaxID(taxID)
        parsed += 1
        if not tax.getRank('species').name == 'N/D':
            grabbed += 1
            if doDummies:
                rankList = tax.getTax9EwDummy(drawRanks, doTrimSName)
            else:
                rankList = tax.getTax9E(drawRanks, doTrimSName)
            # the i-th tuple is paired with the i-th record of the rank list
            for lvl, pair in enumerate(rankList.toTupleList()):
                link = TaxIdDataDirectedT(pair[0], pair[1])
                link.rank = rankList.records[lvl].rank
                allTaxes[pair[0]] = link
            # NOTE(review): nesting was reconstructed from a source without
            # indentation; this report is assumed to belong to the species
            # branch so it fires once per 10000 grabbed taxa - confirm.
            if verbose and grabbed % 10000 == 0:
                print('grabbed %d taxa ; now have:  %d taxa' % (grabbed, len(allTaxes)))
        if parsed % reportStep == 0:
            print('parsed %d taxa' % parsed)
    print('----> DONE ! <----')

    outName = 'taxOutD' if doDummies else 'taxOut'
    with open(outName + '_' + drawRanks + '.gtxt', 'w') as out:
        # node declarations first ...
        for link in allTaxes.values():
            out.write('"' + link.startTax + '"' + '\n')
        # ... then the directed edges
        for link in allTaxes.values():
            out.write('"' + link.startTax + '"' + ' -> ' + '"' + link.targetTax + '"' + '\n')
# ------------------------ MAIN ----------------------------
import sys

# Rank sets rendered with doDummies=False, in run order.
# NOTE(review): 'DKPC' and 'DKP' appear twice; the repeated runs write to the
# same output file as the first ones. Kept as-is to preserve the original run
# sequence - drop the repeats once confirmed redundant.
#
# Previously used (now disabled) rank sets for doDummies=True:
#   DK, DKP, DKPC, DKPCO, DKPCOF, DKPCOFG, DKPCOFGS, DKPCF, DKPF, DKF, DKOF
_RANK_SETS = (
    'DK',
    'DKP',
    'DKPC',
    'DKPCO',
    'DKPCOG',
    'DKPCOGF',
    'DKPCOGFS',
    'DKPC',
    'DKPCG',
    'DKPCGF',
    'DKPCGFS',
    'DKP',
    'DKPG',
    'DKPGF',
    'DKPGFS',
)


def main(argv=None):
    '''Command-line entry point: build one .gtxt file per rank set.

    Arguments:
        argv: argument vector in sys.argv layout (program name first);
              defaults to sys.argv when omitted.

    Returns:
        -1 after printing the usage text when the argument count is not
        exactly <namefile> <nodefile>, otherwise 0 once every rank set in
        _RANK_SETS has been processed.
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        print('------------------- USAGE -----------------------')
        print('> python prepareNCBItaxFiles <namefile> <nodefile> ')
        print('> where: ')
        print('> namefile = names.dmp of NCBI taxonomy database')
        print('> nodefile = nodes.dmp of NCBI taxonomy database')
        print('-------------------------------------------------')
        return -1
    nameF, nodeF = argv[1], argv[2]
    for ranks in _RANK_SETS:
        prepareNCBITaxForDrawing(nameF, nodeF, drawRanks=ranks, doTrimSName='Y', doDummies=False)
    return 0


# Guarded so that importing this module no longer runs the pipeline
# (or exits the interpreter on a wrong argument count).
if __name__ == '__main__':
    sys.exit(main())