Skip to content

Commit

Permalink
Don't consider snippets to be fixed if the page hasn't changed.
Browse files Browse the repository at this point in the history
This is a low-hanging fruit for #70.
  • Loading branch information
eggpi committed Oct 6, 2017
1 parent 5b68a5f commit 8914e9b
Showing 1 changed file with 26 additions and 5 deletions.
31 changes: 26 additions & 5 deletions scripts/compute_fixed_snippets.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import time
import urlparse
import datetime
import dateutil.parser
import dateutil.tz

import config
import chdb
Expand All @@ -31,6 +33,23 @@

log = Logger()

def get_page_contents_and_timestamp(wiki, title):
    """Fetch the wikitext of `title` and the timestamp of that revision.

    Runs a 'revisions' query through `wiki.query`, asking for both the
    content and the timestamp. The content of every page found in the
    responses is concatenated; the timestamp kept is the one from the last
    page seen.

    Returns a `(contents, timestamp)` tuple, where `timestamp` is a naive
    datetime expressed in the local timezone.
    """
    query_params = {
        'prop': 'revisions',
        'rvprop': 'content|timestamp',
        'titles': title,
    }

    chunks = []
    for response in wiki.query(query_params):
        for page in response['query']['pages'].values():
            revision = page['revisions'][0]
            timestamp = revision['timestamp']
            chunks.append(revision['*'])

    # Timestamps read back from the database are naive, so shift the parsed
    # value into local time and then strip its tzinfo to keep the two
    # comparable.
    aware_ts = dateutil.parser.parse(timestamp)
    local_ts = aware_ts.astimezone(dateutil.tz.tzlocal())
    return ''.join(chunks), local_ts.replace(tzinfo=None)

def load_pages_and_snippets_to_process(cursor, lang_code, start_date, end_date):
cursor.execute('''
SELECT ts, snippet_id, url FROM requests
Expand Down Expand Up @@ -83,18 +102,20 @@ def compute_fixed_snippets(cfg):
parser = snippet_parser.create_snippet_parser(wiki, cfg)

for page_title, snippet_to_ts in page_title_to_snippets.items():
snippets = parser.extract(wiki.get_page_contents(title = page_title))
contents, page_ts = get_page_contents_and_timestamp(wiki, page_title)
snippets = parser.extract(contents)
# FIXME Duplicated logic with parse_live.py :(
for sec, snips in snippets:
for sni in snips:
id = mkid(d(page_title) + sni)
snippet_to_ts.pop(id, None)

for snippet_id, clicked_ts in snippet_to_ts.items():
log.info(snippet_id)
stats_db.execute_with_retry_s(
'INSERT IGNORE INTO fixed VALUES (%s, %s, %s)',
clicked_ts, snippet_id, cfg.lang_code)
if clicked_ts < page_ts:
log.info(snippet_id)
stats_db.execute_with_retry_s(
'INSERT IGNORE INTO fixed VALUES (%s, %s, %s)',
clicked_ts, snippet_id, cfg.lang_code)

live_db.close()
stats_db.close()
Expand Down

0 comments on commit 8914e9b

Please sign in to comment.