This repository has been archived by the owner on Nov 11, 2023. It is now read-only.
forked from openeventdata/phoenix_pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper_connection.py
173 lines (131 loc) · 6.1 KB
/
scraper_connection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
from __future__ import print_function
from __future__ import unicode_literals
import utilities
import datetime
import logging
import codecs
def query_all(collection, lt_date, gt_date, sources, write_file=False):
    """
    Function to query the MongoDB instance and obtain results for the desired
    date range. The query constructed is:
    gt_date < date_added <= lt_date, restricted to the given sources.

    The query is run once; the same result list is returned to the caller
    and, when `write_file` is True, used to build the text output, so the
    entry numbers in the text line up with positions in the returned list.

    Parameters
    ----------

    collection: pymongo.collection.Collection.
                Collection within MongoDB that holds the scraped news stories.

    lt_date: Datetime object.
             Upper bound (inclusive) on `date_added`. For example, if the
             date running is the 25th, and the desired date is the 24th,
             then the `lt_date` is the 25th.

    gt_date: Datetime object.
             Lower bound (exclusive) on `date_added`; results are newer than
             this date. For example, if the date running is the 25th, and
             the desired date is the 24th, then the `gt_date` is the 23rd.

    sources: List.
             Sources to pull from the MongoDB instance.

    write_file: Boolean.
                Option indicating whether to build the text for the
                intermediate scraper file. Defaults to false.

    Returns
    -------

    posts: List.
           List of dictionaries of results from the MongoDB query.

    final_out: String.
               If `write_file` is True, this contains a string representation
               of the query results. Otherwise, contains an empty string.
    """
    logger = logging.getLogger('pipeline_log')

    query = {"$and": [{"date_added": {"$lte": lt_date}},
                      {"date_added": {"$gt": gt_date}},
                      {"source": {"$in": sources}}]}
    # Materialise once: avoids a second round trip for the text output and
    # lets len() replace Cursor.count(), which newer PyMongo no longer has.
    posts = list(collection.find(query))

    final_out = ''
    if write_file:
        # Boilerplate browser warning that shows up in scraped Al Jazeera
        # stories; removed before the story is segmented.
        aljazeera_notice = (
            "Caution iconAttention The browser or device you are using is out "
            "of date. It has known security flaws and a limited feature set. "
            "You will not see all the features of some websites. Please update "
            "your browser."
        )
        output = []
        for num, post in enumerate(posts):
            # Best effort per entry: a story that cannot be formatted is
            # reported and skipped rather than aborting the whole run.
            try:
                content = post['content']
                if post['source'] == 'aljazeera':
                    # Strip the notice while the content is still text, so
                    # text and bytes are never mixed in the replace call.
                    content = content.replace(aljazeera_notice, '')
                # NOTE(review): the segmenter is handed UTF-8 bytes, as
                # before -- confirm utilities.sentence_segmenter expects
                # bytes rather than text.
                content = content.encode('utf-8')
                header = ' '.join(utilities.sentence_segmenter(content)[:4])
                string = '{}\t{}\t{}\n{}\n'.format(num, post['date'],
                                                   post['url'], header)
                output.append(string)
            except Exception as e:
                print('Error on entry {}: {}.'.format(num, e))
        final_out = '\n'.join(output)

    message = 'Total number of stories: {}'.format(len(posts))
    print(message)
    logger.info(message)

    return posts, final_out
def _get_sources(filepath):
"""
Function to create a list of sources that will be used in the query to the
MongoDB instance.
Parameters
----------
filepath: String.
Path to file containing the source keys. File should have a
single key on each line.
Returns
-------
sources: List.
List of sources, basically.
"""
with open(filepath, 'r') as f:
sources = [key.replace('\n', '') for key in f.readlines()]
return sources
def main(current_date, file_details, write_file=False, file_stem=None):
    """
    Function to create a connection to a MongoDB instance, query for a given
    day's results, optionally write the results to a file, and return the
    results.

    Parameters
    ----------

    current_date: datetime object.
                   Date for which records are pulled. Normally this is
                   $date_running - 1. For example, if the script is running on
                   the 25th, the current_date will be the 24th.

    file_details: Named tuple.
                  Tuple containing config information. The attributes
                  `auth_db`, `auth_user`, `auth_pass` and `db_host` are read
                  here and passed to `utilities.make_conn`.

    write_file: Boolean.
                Option indicating whether to write the results from the web
                scraper to an intermediate file. Defaults to false.

    file_stem: String. Optional.
               Optional string defining the file stem for the intermediate
               file for the scraper results. The file is named
               `<file_stem><YYYY><MM><DD>.txt`.

    Returns
    -------

    posts: List.
           List of dictionaries of results from the MongoDB query.

    filename: String.
              If `write_file` is True and a file was written, contains the
              filename to which the scraper results are written. Otherwise
              is an empty string.
    """
    # Source keys are read from a file in the current working directory.
    sources = _get_sources('source_keys.txt')
    conn = utilities.make_conn(file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass, file_details.db_host)

    # Query window: from midnight of the day before `current_date` up to
    # midnight of the day after it.
    day_start = datetime.datetime(current_date.year, current_date.month,
                                  current_date.day)
    greater_than = day_start - datetime.timedelta(days=1)
    less_than = day_start + datetime.timedelta(days=1)

    results, text = query_all(conn, less_than, greater_than, sources,
                              write_file=write_file)

    filename = ''
    if text:
        # Only byte strings need decoding; calling .decode() on text fails
        # (AttributeError on Python 3, implicit ASCII encode on Python 2).
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        if file_stem:
            filename = '{}{:02d}{:02d}{:02d}.txt'.format(file_stem,
                                                         current_date.year,
                                                         current_date.month,
                                                         current_date.day)
            with codecs.open(filename, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            print('Need filestem to write results to file.')

    return results, filename