-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrnaseq_count_matrix.py
executable file
·173 lines (148 loc) · 5.57 KB
/
rnaseq_count_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/opt/bin/python
import sys
import re
import os
import fnmatch
#####################################
## Find all count files matching pattern entered by user (there should be one count file
## per sample) and create a matrix with one row for each gene and one column for each sample
#####################################
#####################################
## Usage: /opt/bin/python rnaseq_count_matrix.py rootDir suffixToSearch outputFileName
## Example: /opt/bin/python rnaseq_count_matrix.py /ifs/res/liang/RNASeq/Proj2983_MassagueJ .htseq_count Proj2983_MassagueJ_htseq.count_allSamples.txt
#####################################
def usage():
print "/opt/bin/python rnaseq_count_matrix.py rootDir patternToSearch outputFileName [idConversionFile]"
return
## create a list of full paths to files that match pattern entered
def findFiles(rootDir,pattern):
"""
create and return a list of full paths to files that match pattern entered
"""
filepaths = []
pattern = '*'+pattern
#for path, dirs, files in os.walk(os.path.abspath(rootDir)):
files = os.listdir(os.path.abspath(rootDir))
if fnmatch.filter(files, pattern):
for file in fnmatch.filter(files, pattern):
filepaths.append(os.path.join(rootDir,file))
else:
if "Proj" in rootDir.split("/")[-1] and "-" in rootDir.split("/")[-1]:
print>>sys.stderr, "WARNING: No files matching pattern %s found in %s" %(pattern, path)
return filepaths
def getSampleID(filepath,pattern):
"""
Extract and return sample ID from file path
"""
if filepath[-1] == "/":
filepath = filepath[:-1]
file_name = os.path.basename(filepath)
#pattern = pattern.replace("*","")
samp = file_name.replace(pattern,"")
return samp
def printMatrix(matrix,allSamps,id_idx,outFile):
"""
Print matrix of counts, with one row for each gene and one column
for each sample
"""
if id_idx:
header = "GeneID\tGeneSymbol\t" + "\t".join(allSamps)
else:
header = "GeneID\t" + "\t".join(allSamps)
with open(outFile,'w') as out:
print>>out,header
ids = sorted(matrix.keys())
for id in ids:
row = id
for samp in allSamps:
if matrix[id].has_key(samp):
row += "\t" + matrix[id][samp]
else:
row += "\t" + "0"
if not (len(row.split("\t")) == len(allSamps)+1 or len(row.split("\t")) == len(allSamps)+2):
print>>sys.stderr, "ERROR: data missing for record with ID ",id
else:
print>>out,row
return
def indexIdConversions(conversionFile):
id_idx = {}
with open(conversionFile,'r') as conv:
for line in conv:
id,gn = line.strip().split("\t")
if not id in id_idx:
id_idx[id] = gn
else:
print "WARNING: multiple gene symbols found for ID %s" %id
return id_idx
def getGeneSymbol(id, id_idx):
final_id = []
part = ''
if ":" in id:
part = id[id.index(":"):]
id = id.split(":")[0]
ids = id.split("+")
for id in ids:
if id in id_idx:
final_id.append(id_idx[id])
else:
print>>sys.stderr, "ERROR: NO GENE SYMBOL FOUND FOR ID %s" %id
gen_sym = "+".join(final_id) + part
return gen_sym
def makeCountMatrix(args):
"""
Find files to parse, create one matrix of all counts and print
matrix to file
"""
id_idx = {}
if len(args) == 4:
rootDir,filePattern,outFile,conversionFile = args
id_idx = indexIdConversions(conversionFile)
elif len(args) == 3:
rootDir,filePattern,outFile = args
else:
usage()
sys.exit(1)
## store all counts in a nested dictionary (indexed by gene then by sampleID)
## in order to account for genes that might be included in one count file
## but not another
matrix = {}
## store a list of all samples
allSamps = []
## find all files matching pattern entered
files = findFiles(rootDir,filePattern)
fwg = None
if files:
print>>sys.stderr, "\nCombining the following files:\n"
for file in files:
if id_idx:
file_w_gns = file + ".geneSymbols.txt"
fwg = open(file_w_gns,'w')
print>>sys.stderr, file
if os.stat(file)[6]==0: ## file is empty
print>>sys.stderr, "WARNING: This file is empty!"
else:
samp = getSampleID(file,filePattern)
allSamps.append(samp)
with open(file,'r') as fl:
for line in fl:
line = line.strip()
id,count = line.split()
gen_sym = ''
if id_idx:
gen_sym = getGeneSymbol(id, id_idx)
id = "\t".join([id,gen_sym])
print>>fwg, "\t".join([id,count])
if not matrix.has_key(id):
matrix[id] = {}
if matrix[id].has_key(samp):
print>>sys.stderr, "ERROR: gene",id,"already has sample",samp
else:
matrix[id][samp] = count
if id_idx:
fwg.close()
printMatrix(matrix,allSamps,id_idx,outFile)
else:
print>>sys.stderr, "\nNo files found matching pattern entered. Exiting.\n"
sys.exit(-1)
if __name__ == '__main__':
makeCountMatrix(sys.argv[1:])