-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path20_hgnc_name_resolution_table.py
executable file
·113 lines (94 loc) · 3.5 KB
/
20_hgnc_name_resolution_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#! /usr/bin/python3
#
# This source code is part of icgc, an ICGC processing pipeline.
#
# Icgc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Icgc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Contact: ivana.mihalek@gmail.com
#
# from HUGO gene nomenclature committee
# https://www.genenames.org/download/custom/
# header names:
# head -n1 /data/hgnc/hgnc_name_res.tsv \
# | sed 's/\t/\n/g' | awk 'ct +=1 {printf "%d ", ct; print}'
# locus group:
# | protein-coding gene |
# | non-coding RNA |
# | pseudogene |
# | other |
# | phenotype |
from config import Config
from icgc_utils.mysql import *
#########################################
def make_hgnc_table(cursor, db_name, hgnc_table, header_names):
    """Build and submit the CREATE TABLE statement for the HGNC table.

    Does nothing if check_table_exists() reports that hgnc_table is already
    present in db_name. Otherwise switches to db_name, assembles a table
    definition with an integer primary key `id` followed by one VARCHAR
    column per entry of header_names, hands the statement to search_db(),
    and prints both the statement and whatever search_db() returned.

    :param cursor: database cursor handed through to the icgc_utils helpers
    :param db_name: name of the database that should hold the table
    :param hgnc_table: name of the table to create
    :param header_names: column names, in the order they should appear
    :return: None
    """
    if check_table_exists(cursor, db_name, hgnc_table):
        return

    switch_to_db(cursor, db_name)

    def varchar_width(column):
        # free-text columns need more room than the short identifier columns
        if column in ['approved_name', 'synonyms', 'refseq_ids']:
            return 150
        if column == 'uniprot_id_by_uniprot':
            return 300
        # chrom can have annotation such as "not on reference assembly"
        if column in ['approved_symbol', 'chromosome']:
            return 30
        return 20

    fragments = [" CREATE TABLE %s (" % hgnc_table, " id INT NOT NULL, "]
    for column in header_names:
        fragments.append(" %s VARCHAR(%d)," % (column, varchar_width(column)))
    fragments.append(" PRIMARY KEY (id) ")
    fragments.append(") ENGINE=MyISAM")
    create_stmt = "".join(fragments)

    outcome = search_db(cursor, create_stmt)
    print(create_stmt)
    print(outcome)
    return
#########################################
def strip_arm_annotation(chrom_address):
    """Reduce a cytogenetic address such as "1p36.33" to the chromosome name.

    The text in front of the first arm letter ("p" or "q") is returned, but
    only when that text is a plain chromosome name (all digits, "X" or "Y").
    Anything else is returned unchanged, so free-text annotations that merely
    contain the letter "p" or "q" (e.g. "unplaced") are not truncated.

    :param chrom_address: chromosome field as found in the HGNC table
    :return: the bare chromosome name, or chrom_address itself when it does
             not start with a chromosome name followed by an arm letter
    """
    # position of each arm letter; find() gives -1 when absent, and 0 would
    # mean there is no chromosome name in front of it - neither is usable
    arm_positions = [pos for pos in (chrom_address.find("p"), chrom_address.find("q")) if pos > 0]
    if not arm_positions:
        return chrom_address
    chromosome = chrom_address[:min(arm_positions)]
    if chromosome.isdigit() or chromosome in ("X", "Y"):
        return chromosome
    return chrom_address
#########################################
#########################################
def _write_numbered_tsv(hgncfile, tmp_outfile):
    """Copy the data rows of hgncfile to tmp_outfile, prepending a running id.

    The header line is read but not copied. In every data row the
    'Chromosome' column is passed through strip_arm_annotation().

    :return: the list of header names from the first line of hgncfile
    :raises ValueError: if the header has no 'Chromosome' column
    """
    # both files are closed on exit, whether or not a row fails to process
    with open(hgncfile, "r") as inf, open(tmp_outfile, "w") as outf:
        headers = inf.readline().rstrip("\n").split("\t")
        chromosome_column = headers.index('Chromosome')
        for ct, line in enumerate(inf, start=1):
            fields = line.rstrip("\n").split("\t")
            fields[chromosome_column] = strip_arm_annotation(fields[chromosome_column])
            outf.write("\t".join([str(ct)] + fields) + "\n")
    return headers


def _load_hgnc_table(tmp_outfile, headers):
    """Create the hgnc table in identifier_maps and load tmp_outfile into it."""
    db_name = "identifier_maps"
    hgnc_table = "hgnc"
    # turn header text into column names: lowercase, underscores for spaces,
    # e.g. "UniProt ID(supplied by UniProt)" -> "uniprot_id_by_uniprot"
    new_headers = ["_".join(h.lower().replace("(supplied", "").replace(")", "").split(" ")) for h in headers]

    db = connect_to_mysql(Config.mysql_conf_file)
    try:
        cursor = db.cursor()
        try:
            error_intolerant_search(cursor, "SET GLOBAL local_infile=1")
            make_hgnc_table(cursor, db_name, hgnc_table, new_headers)
            # reuse the same file and table names that were used above
            qry = "load data local infile '%s' into table %s" % (tmp_outfile, hgnc_table)
            search_db(cursor, qry, verbose=True)
        finally:
            cursor.close()
    finally:
        db.close()


def main():
    """Load the HGNC name resolution table into the identifier_maps database.

    Writes an id-numbered copy of the HGNC tsv to a temporary file in the
    current directory, bulk-loads it into the `hgnc` table, and removes the
    temporary file again - also when one of the steps fails.
    """
    # this file has no explicit top-level `import os`; importing here keeps the
    # cleanup independent of what `from icgc_utils.mysql import *` re-exports
    import os

    hgncfile = "/storage/databases/hgnc/hgnc_name_res.tsv"
    tmp_outfile = "hgnctmp.tsv"
    try:
        headers = _write_numbered_tsv(hgncfile, tmp_outfile)
        _load_hgnc_table(tmp_outfile, headers)
    finally:
        # the input may have failed to open, in which case nothing was created
        if os.path.exists(tmp_outfile):
            os.remove(tmp_outfile)
#########################################
# run the import only when executed as a script, not when imported
if __name__ == "__main__":
    main()