-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcache_data_child.py
73 lines (59 loc) · 2.23 KB
/
cache_data_child.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from nlp_services.syntax import AllNounPhrasesService, AllVerbPhrasesService, HeadsService
from nlp_services.discourse.entities import CoreferenceCountsService, EntityCountsService
from nlp_services.discourse.sentiment import DocumentSentimentService, DocumentEntitySentimentService, WpDocumentEntitySentimentService
from nlp_services.caching import use_caching
from multiprocessing import Pool
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from boto.exception import S3ResponseError
import traceback
import boto
import sys
import re
import json
import time
import random
# Module-level setup, executed once at import/start-up.

# Handle on the S3 bucket that holds the event files read by call_services().
BUCKET = boto.connect_s3().get_bucket('nlp-data')

# Optional second command-line argument: path to the JSON services config.
service_file = sys.argv[2] if len(sys.argv) > 2 else 'services-config.json'

# Names of the service classes to run for each document. Read the config
# inside a context manager so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open(service_file) as service_config:
    SERVICES = json.load(service_config)['services']

# Register a per-service cache option for every configured service's ``get``
# method; each service gets its own fresh options dict.
use_caching(per_service_cache=dict(
    (service + '.get', {'write_only': True}) for service in SERVICES
))
def process_file(filename):
    """Run every configured service against the document named by ``filename``.

    ``filename`` is one line of an event file. The first ``<digits>/<digits>``
    occurrence in it is turned into a document id of the form
    ``<first>_<second>``, and ``get(doc_id)`` is called on a fresh instance of
    each service class listed in ``SERVICES``. The return values of those
    calls are discarded.

    Blank lines are skipped silently; lines without a ``<digits>/<digits>``
    pair are reported on stdout and skipped. A failure in one service is
    printed with its traceback and does not stop the remaining services.
    A ``KeyboardInterrupt`` terminates the process via ``sys.exit()``.

    :param filename: a single line (string) from an event file
    :return: always ``None``
    """
    if not filename.strip():
        return  # blank line, e.g. the newline at end of file

    match = re.search(r'([0-9]+)/([0-9]+)', filename)
    if match is None:
        print("No match for %s" % filename)
        return

    doc_id = '%s_%s' % (match.group(1), match.group(2))

    for service in SERVICES:
        try:
            # The service classes are imported into this module's namespace,
            # so resolve the configured name against the module itself.
            service_class = getattr(sys.modules[__name__], service)
            service_class().get(doc_id)
        except KeyboardInterrupt:
            sys.exit()
        except Exception:
            # Best effort: report the failure and carry on with the next
            # service rather than aborting the whole event file.
            print('Could not call %s on %s!' % (service, doc_id))
            print(traceback.format_exc())
def call_services(keyname):
    """Claim the event file stored under ``keyname`` and process each line.

    Steps:

    1. Look up ``keyname`` in ``BUCKET``; return immediately if it is absent.
    2. Copy the key to a new ``data_processing/<hostname>_<time>_<random>``
       key in the ``nlp-data`` bucket and delete the original. If S3 rejects
       either operation the error is printed and the function returns
       without processing anything.
    3. Read the copied event file, pass each ``'\\n'``-separated line to
       ``process_file``, then delete the copy.

    A ``KeyboardInterrupt`` during the copy/delete step terminates the
    process via ``sys.exit()``.

    :param keyname: S3 key name of the event file to process
    :return: always ``None``
    """
    key = BUCKET.get_key(keyname)
    if key is None:
        return

    # Build the destination name from the instance hostname, the current
    # time and a small random number. ``%s`` already stringifies the numeric
    # parts, so no explicit str()/int() conversions are needed.
    eventfile = "data_processing/%s_%s_%s" % (
        boto.utils.get_instance_metadata()['local-hostname'],
        time.time(),
        random.randint(0, 100),
    )

    try:
        key.copy('nlp-data', eventfile)
        key.delete()
    except S3ResponseError as e:
        print(e)
        print('EVENT FILE %s NOT FOUND!' % eventfile)
        return
    except KeyboardInterrupt:
        sys.exit()

    print('STARTING EVENT FILE %s' % eventfile)
    k = Key(BUCKET)
    k.key = eventfile
    print(k.key)

    # Explicit loop instead of map(): the calls are made purely for their
    # side effects, so there is no result list to build, and the work does
    # not depend on map() being eager.
    for line in k.get_contents_as_string().split('\n'):
        process_file(line)

    print('EVENT FILE %s COMPLETE' % eventfile)
    k.delete()
# Script entry: the first command-line argument is the S3 key name of the
# event file this process should handle.
call_services(keyname=sys.argv[1])