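"""test-wp-topics.py

Quick experiment: pull per-page entity counts for the wiki whose id is passed
as the first command-line argument, build a small LSI topic model over those
entities with gensim, and print the most similar page for each page. Entity
counts come from the internal nlp_client services and page titles are looked
up from a Solr index.
"""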
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

import os
import sys
from math import sqrt

import gensim
import requests
from sklearn.svm import SVC

from nlp_client.services import WikiPageEntitiesService, WikiEntitiesService, WpWikiPageEntitiesService
from nlp_client.caching import useCaching
wid = sys.argv[1]
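
# Cache writes only for these nlp_client services; 'write_only' is assumed to
# mean "populate the cache without reading entries back".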
useCaching(perServiceCaching={
    'WpEntityCountsService.get': {'write_only': True},
    'WpWikiPageEntitiesService.get': {'write_only': True},
    'WpEntitiesService.get': {'write_only': True},
})

def vec2dense(vec, num_terms):
    '''Convert from sparse gensim format to a dense list of numbers'''
    return list(gensim.matutils.corpus2dense([vec], num_terms=num_terms).T[0])
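
# nestedGet is assumed to return a nested mapping of page id -> {entity: count};
# each page is then flattened into a list that repeats every entity once per
# occurrence, which is the bag-of-words input gensim expects.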
entities = WpWikiPageEntitiesService().nestedGet(wid)
print entities
pageToEntityList = {}
for page in entities:
    pageToEntityList[page] = []
    for entity in entities[page]:
        pageToEntityList[page] += [entity] * entities[page][entity]
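
# Build a gensim dictionary over the entity "documents" and drop entities that
# appear on fewer than two pages (filter_extremes also trims very frequent
# entities via its default no_above threshold).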
dct = gensim.corpora.Dictionary(pageToEntityList.values())
unfiltered = dct.token2id.keys()
dct.filter_extremes(no_below=2)
filtered = dct.token2id.keys()
filtered_out = set(unfiltered) - set(filtered)
print "\nThe following overly common or overly rare entities were filtered out..."
print list(filtered_out), '\n'
print "Vocabulary after filtering..."
print dct.token2id.keys(), '\n'
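
# Convert each page's entity list into a sparse bag-of-words vector; the dense
# form is printed purely for inspection.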
print "---Bag of Words Corpus---"
bow_docs = {}
for name in pageToEntityList:
    sparse = dct.doc2bow(pageToEntityList[name])
    bow_docs[name] = sparse
    dense = vec2dense(sparse, num_terms=len(dct))
    print name, ":", dense
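
# Project every bag-of-words vector into a 5-dimensional LSI topic space.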
print "\n---LSI Model---"
lsi_docs = {}
num_topics = 5
lsi_model = gensim.models.LsiModel(bow_docs.values(), num_topics=num_topics)
for name in pageToEntityList:
    vec = bow_docs[name]
    sparse = lsi_model[vec]
    dense = vec2dense(sparse, num_topics)
    lsi_docs[name] = sparse
    print name, ':', dense
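
# Normalise each LSI vector to unit length; these unit vectors are only
# consumed by the disabled classification example at the bottom of the file.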
print "\n---Unit Vectorization---"
unit_vecs = {}
for name in pageToEntityList:
    vec = vec2dense(lsi_docs[name], num_topics)
    norm = sqrt(sum(num ** 2 for num in vec))
    # Guard against pages whose LSI vector is all zeros
    unit_vec = [num / norm for num in vec] if norm else vec
    unit_vecs[name] = unit_vec
    print name, ':', unit_vec
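
# Look up human-readable page titles from Solr; 'search-s10' is an internal
# search host and the 'title_en'/'id' fields are assumed to exist in its schema.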
print "\n---Document Similarities---"
solr_docs = requests.get('http://search-s10:8983/solr/select',
                         params={'wt': 'json',
                                 'q': 'iscontent:true AND wid:' + wid,
                                 'rows': len(entities),
                                 'fl': 'title_en, id'}).json()['response']['docs']
titles = dict([(doc['id'], doc['title_en']) for doc in solr_docs])
print titles
# Fix one ordering of page ids so positions in the similarity index always
# line up with names, regardless of dict iteration order.
names = pageToEntityList.keys()
index = gensim.similarities.MatrixSimilarity([lsi_docs[name] for name in names])
for i, name in enumerate(names):
    vec = lsi_docs[name]
    sims = index[vec]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    # Similarities are a list of tuples of the form (doc #, score).
    # To extract the doc # we take the first value in the tuple;
    # it is stored as a numpy integer, so cast it to int.
    # The top hit is usually the document itself, so fall back to the second.
    if int(sims[0][0]) != i:
        match = int(sims[0][0])
    else:
        match = int(sims[1][0])
    match = names[match]
    try:
        print titles.get(name, '?'), "is most similar to...", titles.get(match, '?')
    except Exception as e:
        print e
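
# The classification example below is left disabled: it refers to 'dog1.txt'
# and 'sandwich1.txt' documents that this script never builds.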
"""
print "\n---Classification---"
dog1 = unit_vecs['dog1.txt']
sandwich1 = unit_vecs['sandwich1.txt']
train = [dog1, sandwich1]
# The label '1' represents the 'dog' category
# The label '2' represents the 'sandwich' category
label_to_name = dict([(1, 'dogs'), (2, 'sandwiches')])
labels = [1, 2]
classifier = SVC()
classifier.fit(train, labels)
for name in names:
    vec = unit_vecs[name]
    label = classifier.predict([vec])[0]
    cls = label_to_name[label]
    print name, 'is a document about', cls
print '\n'
"""