-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
245 lines (219 loc) · 7.65 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import re
import os
from os.path import join as filejoin, isdir, isfile
import sys
import csv
import shutil
HAR_ID_RE = re.compile(r'(?:2x)?HAR\.?(\d+)')
ACTIVITY_DOMAIN_RE = re.compile(r'(.+)\s+\((\d+)\)')
def parse_activity_domains(s):
if not s or s == 'none':
return []
pieces = s.split(';')
domains = []
for piece in pieces:
piece = piece.strip()
match = ACTIVITY_DOMAIN_RE.match(piece)
if not match: continue
name = match.group(1)
num_imgs = int(match.group(2))
domains.append((name, num_imgs))
return domains
def parse_har_id(s):
match = HAR_ID_RE.match(s)
if not match:
return None
else:
return int(match.group(1))
GENE_AND_LOCATION_RE = re.compile(r'^(\S+)\s*\((\-?\d+)\)$')
def parse_genes(s):
if not s or s == 'None': return None
pieces = s.split(',')
genes = []
for piece in pieces:
piece = piece.strip()
match = GENE_AND_LOCATION_RE.match(piece)
if not match:
continue
gene = match.group(1)
location = int(match.group(2))
genes.append((gene, location))
return genes
def parse_bracketed_genes(s, har_name):
genes = parse_genes(s)
if not genes: return None
upstreams = filter(lambda g: g[1] < 0, genes)
if not upstreams:
print 'Warning: no upstream genes found for', har_name
return None
downstreams = filter(lambda g: g[1] > 0, genes)
if not downstreams:
print 'Warning: no downstream genes found for', har_name
return None
nearest_upstream = max(upstreams, key = lambda g: g[1])
nearest_downstream = min(downstreams, key = lambda g: g[1])
return [nearest_upstream[0], nearest_downstream[0]]
def parse_aliases(s):
if not s:
return []
pieces = s.split(';')
return map(lambda s: parse_har_id(s.strip()), pieces)
def parse_har_csv(csv_path):
hars = {}
input_file = open(csv_path, 'rU')
input = csv.DictReader(input_file)
for row in input:
if not row['ID']:
continue
har_id = parse_har_id(row['ID'])
if not har_id:
print "Warning: skipping unrecognized HAR ID in '%s': %s" % (csv_path, row['ID'])
continue
if not har_id in hars:
hars[har_id] = {
'name' : row['ID'],
'species' : {},
'aliases' : parse_aliases(row['Aliases']),
'species-difference' : row['Human-Chimp Difference'],
}
if not hars[har_id].get('bracketed-genes'):
hars[har_id]['bracketed-genes'] = parse_bracketed_genes(row['Genes within 1 Mb (distance to TSS)'], row['ID'])
hars[har_id]['species'][row['Species'].lower()] = {
'genome-coords' : row['Coordinates (hg19 or panTro4)'],
'consistent-activity-domains' : parse_activity_domains(row['Consistent Activity Domains (# pos)']),
'suggestive-activity-domains' : parse_activity_domains(row['Suggestive Activity Domains (# pos)']),
'expression' : row['Expression'],
'stage' : row['Stage'],
'imgs' : {},
}
input_file.close()
return hars
ALLOWED_IMG_RE = re.compile(r'(?P<har>\d+)_(?P<species>[^\W\d_]{2})\d+_(?P<num>\d+)L\.tif$')
IMG_SPECIES_TO_CSV = {
'hg': 'human',
'pt': 'chimp'
}
def find_imgs(hars, img_dir_path, skipped):
for dirname, _, filenames in os.walk(img_dir_path):
for filename in filenames:
filepath = filejoin(dirname, filename)
match = ALLOWED_IMG_RE.match(filename)
if not match:
continue
har = int(match.group('har'))
if not har in hars:
if skipped: print "Skipping -- unknown HAR '%d': %s" % (har, filepath)
continue
img_species = match.group('species')
species = IMG_SPECIES_TO_CSV.get(img_species)
if not species:
if skipped: print "Skipping -- unknown species '%s': %s" % (species, filepath)
continue
img_num = int(match.group('num'))
hars[har]['species'][species]['imgs'][img_num] = filepath
OUTPUT_DIR_PREFIX = 'website'
def make_output_dir_name():
dir_name = OUTPUT_DIR_PREFIX
n = 1
while isdir(dir_name):
n += 1
dir_name = OUTPUT_DIR_PREFIX + ' ' + str(n)
return dir_name
THUMBNAIL_WIDTH = 280
def calc_thumbnail_size(img_size):
w, h = img_size
tw = THUMBNAIL_WIDTH
th = tw * h / w
return (tw, th)
def convert_img(original_path, target_prefix):
import Image # Python caches import statements
try:
img = Image.open(original_path)
except Exception, e:
print "Error: could not open image because: %s\n %s" % (str(e), original_path)
return
img.save(target_prefix + '.tif', format = 'TIFF') # copy full tiff image
thumbnail_img = img.resize(calc_thumbnail_size(img.size), Image.ANTIALIAS)
thumbnail_img.save(target_prefix + '-small.png', format = 'PNG')
def convert_imgs(hars, output_dir):
img_dir = filejoin(output_dir, 'imgs')
os.mkdir(img_dir)
for har in hars:
har_dir = filejoin(img_dir, str(har))
if not isdir(har_dir): os.mkdir(har_dir)
for species in hars[har]['species']:
species_dir = filejoin(har_dir, species)
if not isdir(species_dir): os.mkdir(species_dir)
for img_num, original_path in hars[har]['species'][species]['imgs'].iteritems():
target_path = filejoin(species_dir, str(img_num))
convert_img(original_path, target_path)
GENOME_URL_PREFIXES = {
'human': 'http://genome.ucsc.edu/cgi-bin/hgTracks?org=Human&db=hg19&position=',
'chimp': 'http://genome.ucsc.edu/cgi-bin/hgTracks?org=Chimp&db=panTro4&position='
}
def setup_web_files(hars, output_dir):
from jinja2 import Environment, FileSystemLoader
env = Environment(loader = FileSystemLoader('.'))
template_input = env.get_template('index-template.html')
output_file = open(filejoin(output_dir, 'index.html'), 'w')
template_input.stream(hars = hars, genome_url_prefixes = GENOME_URL_PREFIXES).dump(output_file)
output_file.close()
shutil.copyfile('style.css', filejoin(output_dir, 'style.css'))
def print_summary(hars):
for har in hars:
for species in hars[har]['species']:
print har, species
img_dict = hars[har]['species'][species]['imgs']
for img in sorted(img_dict.iterkeys()):
print ' %2d: %s' % (img, img_dict[img])
def parse_args():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('har_csv', help='Path to CSV file containing HAR IDs')
parser.add_argument('img_dir', help='Path to top-level directory containing embryo images')
parser.add_argument('--summary', action='store_true', help='Show list of images to be processed; will not produce a web page')
parser.add_argument('--skipped', action='store_true', help='Show names of images that will not be processed')
return parser.parse_args()
def main(args):
csv_path = args.har_csv
if not isfile(csv_path):
print "Not a valid file: %s" % csv_path
return
hars = parse_har_csv(csv_path)
img_dir_path = args.img_dir
if not isdir(img_dir_path):
print "Not a valid directory: %s" % img_dir_path
return
find_imgs(hars, img_dir_path, args.skipped)
if args.summary:
print_summary(hars)
return
output_dir = make_output_dir_name()
os.mkdir(output_dir)
setup_web_files(hars, output_dir)
convert_imgs(hars, output_dir)
def ensure_pil():
try:
import Image
except:
print 'Could not import the Python Imaging library.'
print 'To install it with pip: pip install PIL'
sys.exit()
def ensure_jinja():
try:
import jinja2
except:
print 'Could not import the jinja2 library.'
print 'To install it with pip: pip install jinja2'
sys.exit()
def ensure_argparse():
try:
import argparse
except:
print 'Could not import the argparse library.'
print 'This script requires Python 2.7 or greater'
sys.exit()
ensure_argparse()
ensure_pil()
ensure_jinja()
main(parse_args())