method.py
"""
LDA training and evaluation
"""
import numpy as np
from gensim.models import LdaModel
from gensim.corpora import Dictionary

def train_eval(data, n_topics=10, iterations=2000, chunksize=2000, passes=1, fix_random=False):
    print('Start training')
    dictionary = Dictionary(data)
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(text) for text in data]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    # Make an index-to-word dictionary.
    _ = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token
    if fix_random:
        random_state = 0
    else:
        random_state = np.random.RandomState()
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=n_topics,
        passes=passes,
        eval_every=None,
        random_state=random_state,
        minimum_probability=0.0
    )
    print('Training finished')
    print('Prepare topics')
    top_topics = model.top_topics(texts=data, dictionary=dictionary, topn=10, coherence='c_uci')
    # Each entry of top_topics is ([(probability, word), ...], coherence_score);
    # keep only the words for each topic.
    topics = dict()
    for k, (topic, _score) in enumerate(top_topics):
        topics[str(k)] = [word for _prob, word in topic]
    # Per-document topic distribution. minimum_probability=0.0 above ensures
    # every topic is reported for every document, so we store the topic id
    # directly rather than the position in the returned list.
    doc_topic = dict()
    for k, bow in enumerate(corpus):
        doc_top = model.get_document_topics(bow)
        doc_topic[str(k)] = [[int(topic_id), float(prob)] for topic_id, prob in doc_top]
    print('Calculate metrics')
    coherence, avg_topic_coherence = calc_metrics(data, len(dictionary), dictionary.token2id, top_topics, n_topics)
    metrics = dict()
    metrics.update({"coherence": coherence})
    metrics.update({"total_coherence": [avg_topic_coherence]})
    return topics, doc_topic, metrics

def calc_metrics(docs, n_terms, dictionary, top_topics, n_topics):
    # Build a symmetric term-term co-occurrence matrix over all documents;
    # `dictionary` here is the token2id mapping.
    dt_mat = np.zeros([n_terms, n_terms])
    for itm in docs:
        for kk in itm:
            for jj in itm:
                if kk != jj:
                    dt_mat[dictionary[kk], dictionary[jj]] += 1.0
    pmi_arr = []
    for k in range(n_topics):
        # Map each topic's top words back to their term ids.
        top_keywords_index = [dictionary[m[1]] for m in top_topics[k][0]]
        pmi_arr.append(calculate_pmi(dt_mat, top_keywords_index))
    avg_pmi = np.average(np.array(pmi_arr))
    print('Average PMI={}'.format(avg_pmi))
    return pmi_arr, avg_pmi

def calculate_pmi(aa, top_keywords_index):
    """
    Average pairwise PMI of a topic's top keywords.
    Reference:
    Short and Sparse Text Topic Modeling via Self-Aggregation
    This function is adapted from the SeaNMF implementation to provide a comparable metric.
    """
    d1 = np.sum(aa)  # total co-occurrence mass
    n_tp = len(top_keywords_index)
    pmi = []
    for index1 in top_keywords_index:
        for index2 in top_keywords_index:
            # Visit each unordered keyword pair exactly once.
            if index2 < index1:
                if aa[index1, index2] == 0:
                    pmi.append(0.0)
                else:
                    c1 = np.sum(aa[index1])
                    c2 = np.sum(aa[index2])
                    # PMI(i, j) = log(p(i, j) / (p(i) * p(j)))
                    #           = log(n(i, j) * N / (n(i) * n(j)))
                    pmi.append(np.log(aa[index1, index2] * d1 / c1 / c2))
    # Average over the n_tp * (n_tp - 1) / 2 unordered pairs.
    avg_pmi = 2.0 * np.sum(pmi) / float(n_tp) / (float(n_tp) - 1.0)
    return avg_pmi
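

# Minimal usage sketch (illustrative, not part of the original module). It
# assumes `data` is a list of token lists, e.g. the output of any tokenizer.
# The toy corpus below is far too small for meaningful coherence scores and
# only demonstrates the call signature and return values.
if __name__ == '__main__':
    toy_data = [
        ['topic', 'model', 'lda', 'corpus', 'token'],
        ['corpus', 'token', 'dictionary', 'lda', 'model'],
        ['pmi', 'coherence', 'metric', 'topic', 'model'],
    ]
    topics, doc_topic, metrics = train_eval(
        toy_data, n_topics=2, iterations=50, passes=2, fix_random=True
    )
    print(topics)                       # top words per topic id
    print(metrics['total_coherence'])   # average PMI across topics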