identify_wiki_subjects.py
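
"""Guess the subject of a wiki by scoring candidate noun phrases.

Candidates are drawn from the wiki's Solr document (hostname, domains,
top articles, top categories), from parsed wiki fields (sitename,
headline, description), from the wiki's main-page noun phrases, and
from its HTML <title> tag. Each candidate is scored per field and the
top-scoring terms are returned as a comma-separated string.
"""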
from __future__ import division

import json
import logging

import requests

from nlp_client.wiki_parses import main_page_nps, phrases_for_wiki_field
from id_subject import BinaryField, TermFreqField, to_list
from id_subject import build_dict_with_original_values
from id_subject import get_subdomain, guess_from_title_tag

# Log errors to a file; log everything at INFO and above to the console.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
fh = logging.FileHandler('identify_wiki_subjects.log')
fh.setLevel(logging.ERROR)
log.addHandler(fh)
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
log.addHandler(sh)
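
# Cross-wiki Solr core: one document per wiki, keyed by wiki ID.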
SOLR = 'http://search-s10:8983/solr/xwiki/select'

def identify_subject(wid, terms_only=False):
    """Identify the top-scoring subject terms for a given wiki ID.

    Returns a comma-separated string: just the terms when terms_only is
    True, otherwise the wiki ID and hostname followed by the terms.
    """
    # Request the wiki's document from Solr
    params = {'q': 'id:%s' % wid,
              'fl': ('url,hostname_s,domains_txt,top_articles_txt,'
                     'top_categories_txt'),
              'wt': 'json'}
    r = requests.get(SOLR, params=params)
    j = json.loads(r.content)
    docs = j['response']['docs']
    # Handle an empty response
    if not docs:
        if terms_only:
            return ''
        return wid
    response = docs[0]
    # Get lists of NPs or raw data, depending on the field
    hostname = [get_subdomain(url)
                for url in to_list(response.get('hostname_s'))]
    domains = [get_subdomain(url)
               for url in to_list(response.get('domains_txt'))]
    sitename = to_list(phrases_for_wiki_field(wid, 'sitename_txt'))
    headline = to_list(phrases_for_wiki_field(wid, 'headline_txt'))
    description = to_list(phrases_for_wiki_field(wid, 'description_txt'))
    top_titles = to_list(response.get('top_articles_txt'))
    top_categories = to_list(response.get('top_categories_txt'))
    try:
        title_tag = to_list(guess_from_title_tag(wid))
    except Exception:
        log.error('Cannot retrieve title HTML tag data for wid %s' % wid)
        title_tag = []
    fields = [hostname, domains, sitename, headline, description,
              top_titles, top_categories, title_tag]
    log.debug(fields)
    # Build a dict mapping preprocessed candidate keys to original term values
    candidates = main_page_nps(wid)
    for field in fields:
        candidates.extend(field)
    candidates = list(set(candidates))
    candidates = build_dict_with_original_values(candidates)
    # Instantiate each scoring field once, outside the per-candidate loop,
    # so their term dictionaries are not rebuilt on every call
    hostname_field = BinaryField(hostname)
    domains_field = TermFreqField(domains)
    sitename_field = BinaryField(sitename)
    headline_field = BinaryField(headline)
    description_field = TermFreqField(description)
    top_titles_field = TermFreqField(top_titles)
    top_categories_field = TermFreqField(top_categories)
    title_tag_field = BinaryField(title_tag)
    def score_candidate(candidate):
        """Return a candidate's total score summed across all fields.

        Hostname matches are weighted 2x and title-tag matches 4x.
        """
        hostname_score = hostname_field.score(candidate) * 2
        domains_score = domains_field.score(candidate)
        sitename_score = sitename_field.score(candidate)
        headline_score = headline_field.score(candidate)
        description_score = description_field.score(candidate)
        top_titles_score = top_titles_field.score(candidate)
        top_categories_score = top_categories_field.score(candidate)
        title_tag_score = title_tag_field.score(candidate) * 4
        return (hostname_score + domains_score + sitename_score +
                headline_score + description_score + top_titles_score +
                top_categories_score + title_tag_score)
    # Combine each candidate's score with the scores of its individual
    # tokens, normalized by token count
    total_scores = {}
    for candidate in candidates:
        total_score = score_candidate(candidate)
        if len(candidate) > 1:
            token_score = 0
            for token in set(candidate):
                token_score += score_candidate((token,))
            total_score += token_score / len(candidate)
        total_scores[candidate] = total_score
    # Drop candidates containing 'wiki' and sort by descending score
    total_scores = sorted([(k, v) for (k, v) in total_scores.items()
                           if 'wiki' not in ''.join(k).lower()],
                          key=lambda x: x[1], reverse=True)
    # DEBUG
    #print response.get('hostname_s')
    #print '\n'.join(['%f\t%s' % (v, k) for (k, v) in total_scores[:5]])
    # Return unstemmed forms of all candidates sharing the top score
    if not total_scores:
        # Every candidate may have been filtered out above
        if terms_only:
            return ''
        return '%s,%s,' % (wid, response.get('hostname_s'))
    top_score = total_scores[0][1]
    top_terms = []
    for (stemmed, score) in total_scores:
        if score == top_score:
            # Map the preprocessed key back to its original surface form
            top_terms.append(candidates[stemmed][0])
        else:
            break
    if terms_only:
        return ','.join(top_terms)
    return '%s,%s,%s' % (wid, response.get('hostname_s'), ','.join(top_terms))
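

# Example call (hypothetical wiki ID; the result depends entirely on
# what the Solr index holds for that wiki):
#   identify_subject('304', terms_only=True)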
if __name__ == '__main__':
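    # Print the subject for each of the first 100 wiki IDs in topwams.txt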
    count = 0
    for line in open('topwams.txt'):
        count += 1
        if count > 100:
            break
        wid = line.strip()
        print identify_subject(wid).encode('utf-8')