import math
import numpy as np
from scipy.special import gammaln # log(gamma(x))
from scipy.special import psi # digamma(x)
from scipy.special import polygamma
from scipy.special import logsumexp
class LDA:
"""
    Vanilla LDA optimized with variational EM, treating the topics as parameters, with a scalar smoothing hyperparameter (alpha)
"""
def __init__(self, K, V, alpha=0.1):
"""
Create an LDA model
:param K: The number of topics (int)
:param V: The size of the vocabulary
:param alpha: Initial hyperparameter for document-topic distributions
"""
self.K = K # scalar number of topics
self.V = V # scalar size of vocabulary
self.D = None # scalar number of documents
self.gammas = None # D x K matrix of gammas
self.log_betas = None # V x K matrix of log(\beta)
self.alpha = alpha # scalar initial hyperparameter for p(\theta)
self.bound = 0 # the variational bound
        self.token_bound = 0  # running sum over documents of (bound / number of tokens in the document)
def fit(self, X, tolerance=1e-4, max_epochs=10, initial_smoothing=1.0, n_initial_docs=1,
max_inner_iterations=20, inner_tol=1e-6, vocab=None, display_topics=False):
"""
fit a model to data
:param X: a list of documents (each a list of (word_index, count) tuples)
        :param tolerance: stopping criterion (relative change in the bound)
:param max_epochs: the maximum number of epochs
:param initial_smoothing: smoothing to use when initializing topics
:param n_initial_docs: the number of documents to use to randomly initialize each topic
:param max_inner_iterations: maximum number of iterations for inner optimization loop (E-step)
:param inner_tol: the tolerance for the inner optimization loop (E-step)
:param vocab: a list of words in the vocabulary (used for displaying topics)
:param display_topics: if True, print topics after each epoch
:return: None
"""
# set up initial values for monitoring convergence
prev_bound = -np.inf
delta = np.inf
self.D = len(X)
# initialize model parameters based on data
self.init_parameters(X, initial_smoothing, n_initial_docs=n_initial_docs)
# repeat optimization until convergence
print("Iter bound\tLL/token\tDelta")
for i in range(max_epochs):
# update parameters (this also computes the bound)
self.update_parameters(X, max_inner_iterations=max_inner_iterations, inner_tol=inner_tol)
# compute the relative change in the bound
if i > 0:
delta = (prev_bound - self.bound) / float(prev_bound)
# print progress
print('%d\t%0.3f\t%0.3f\t%0.5f' % (i, self.bound, self.token_bound / self.D, delta))
# store the new value of the bound
prev_bound = self.bound
# check for convergence
if (0 < delta < tolerance) or (i + 1) >= max_epochs:
break
if vocab is not None and display_topics:
self.print_topics(vocab)
    def init_parameters(self, X, initial_smoothing, n_initial_docs=1):
"""
Initialize parameters using recommended values from the original LDA paper
:param X: the data (as above)
:param initial_smoothing: the amount of smoothing to use in initializing the topics
:param n_initial_docs: the number of documents to use to initialize each topic
:return: None
"""
phi_total = np.zeros([self.V, self.K])
random_docs = list(np.random.choice(np.arange(self.D), size=n_initial_docs * self.K, replace=False))
# initialize each topic with word counts from a subset of documents
for k in range(self.K):
docs = random_docs[k * n_initial_docs: (k + 1) * n_initial_docs]
for d in docs:
for w, c in X[d]:
phi_total[w, k] += c
        # smooth the counts so that no topic assigns zero probability to any word
        phi_total += initial_smoothing
# compute the corresponding topics
self.log_betas = self.compute_log_betas_mle(phi_total)
self.gammas = np.zeros([self.D, self.K])
def compute_log_betas_mle(self, phi_total):
"""
M-step for topics: compute the values of log betas to maximize the bound
:param phi_total: np.array (V,K): Expected number of each type of token assigned to each class k
:return: np.array (V, K): log(beta)
"""
        # total expected count assigned to each topic (sum over the vocabulary)
topic_word_totals = np.sum(phi_total, axis=0)
# compute new optimal values for log betas
log_betas = np.log(phi_total) - np.log(topic_word_totals)
# avoid negative infinities
log_betas[phi_total == 0] = -100
return log_betas
def update_parameters(self, X, max_inner_iterations=20, inner_tol=1e-6):
"""
Do one epoch of updates for all parameters
        :param X: a list of documents (each a list of (word_index, count) tuples)
:param max_inner_iterations: the maximum number of iterations for optimizing document parameters
:param inner_tol: the tolerance for optimizing document parameters
:return: None
"""
self.bound = 0
self.token_bound = 0
phi_total = np.zeros_like(self.log_betas)
# make one update for each document
for d in range(self.D):
if d % 1000 == 0 and d > 0:
print(d)
counts_d = X[d]
# optimize the phi and gamma parameter for this document
bound, phi_d, gammas = self.update_parameters_for_one_item(counts_d, max_iter_d=max_inner_iterations,
tol_d=inner_tol)
self.gammas[d, :] = gammas
# only need to store the running sum of phi over the documents
N_d = 0
for n, (w, c) in enumerate(counts_d):
phi_total[w, :] += c * phi_d[n, :]
N_d += c
# add the contribution of this document to the bound
self.bound += bound
self.token_bound += bound / float(N_d)
# finally update the topic-word distributions and hyperparameters
self.log_betas = self.compute_log_betas_mle(phi_total)
self.update_alpha()
def update_parameters_for_one_item(self, count_tuples, max_iter_d=20, tol_d=1e-6):
"""
Update gamma and compute updates for beta and the bound for one document
        :param count_tuples: a list of (word_index, count) tuples for this document
:param max_iter_d: the maximum number of epochs for this inner optimization
:param tol_d: the tolerance required for convergence of the inner optimization problem
:return: (contribution to the bound, phi values for this doc, gammas for this doc)
"""
# unzip counts into lists of word indices and counts of those words
word_indices, counts = zip(*count_tuples)
# convert the lists into vectors of the required shapes
word_indices = np.reshape(np.array(word_indices, dtype=np.int32), (len(word_indices),))
counts = np.reshape(np.array(counts, dtype=np.int32), (len(word_indices),))
count_vector_2d = np.reshape(np.array(counts), (len(word_indices), 1))
# count the total number of words
N_d = int(count_vector_2d.sum())
# and the number of distinct word types
n_word_types = len(word_indices)
        # initialize gamma values to alpha + N_d / K
gammas = self.alpha + N_d * np.ones(self.K) / float(self.K)
# initialize phis to 1/K; only need to consider each word type
phi_d = np.ones([n_word_types, self.K]) / float(self.K)
# do the optimization step
bound = update_params_for_one_item(self.K, n_word_types, word_indices, counts, gammas, phi_d, self.log_betas,
self.alpha, max_iter_d, tol_d)
return bound, phi_d, gammas
def update_alpha(self, newton_thresh=1e-5, max_iter=1000):
"""
        Update the hyperparameter of p(\theta) using Newton's method [ported from lda-c]
:param newton_thresh: tolerance for Newton optimization
:param max_iter: maximum number of iterations
:return: None
"""
init_alpha = 100
log_alpha = np.log(init_alpha)
psi_gammas = psi(self.gammas)
psi_sum_gammas = psi(np.sum(self.gammas, axis=1))
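        # E_q[log theta_{d,k}] = psi(gamma_{d,k}) - psi(sum_k' gamma_{d,k'})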
E_ln_thetas = psi_gammas - np.reshape(psi_sum_gammas, (self.D, 1))
sum_E_ln_theta = np.sum(E_ln_thetas) # called ss (sufficient statistics) in lda-c
# repeat until convergence
print("alpha\tL(alpha)\tdL(alpha)")
for i in range(max_iter):
alpha = np.exp(log_alpha)
if np.isnan(alpha):
init_alpha *= 10
print("warning : alpha is nan; new init = %0.5f" % init_alpha)
alpha = init_alpha
log_alpha = np.log(alpha)
L_alpha = self.compute_L_alpha(alpha, sum_E_ln_theta)
dL_alpha = self.compute_dL_alpha(alpha, sum_E_ln_theta)
d2L_alpha = self.compute_d2L_alpha(alpha)
log_alpha = log_alpha - dL_alpha / (d2L_alpha * alpha + dL_alpha)
print("alpha maximization: %5.5f\t%5.5f\t%5.5f" % (np.exp(log_alpha), L_alpha, dL_alpha))
if np.abs(dL_alpha) <= newton_thresh:
break
self.alpha = np.exp(log_alpha)
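    # Objective for the symmetric Dirichlet hyperparameter (ss = sum_d sum_k E_q[log theta_{d,k}]):
    #   L(alpha)   = D * (log Gamma(K * alpha) - K * log Gamma(alpha)) + (alpha - 1) * ss
    #   L'(alpha)  = D * (K * psi(K * alpha) - K * psi(alpha)) + ss
    #   L''(alpha) = D * (K^2 * psi'(K * alpha) - K * psi'(alpha))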
def compute_L_alpha(self, alpha, sum_E_ln_theta):
return self.D * (gammaln(self.K * alpha) - self.K * gammaln(alpha)) + (alpha - 1) * sum_E_ln_theta
def compute_dL_alpha(self, alpha, sum_E_ln_theta):
return self.D * (self.K * psi(self.K * alpha) - self.K * psi(alpha)) + sum_E_ln_theta
def compute_d2L_alpha(self, alpha):
return self.D * (self.K ** 2 * polygamma(1, self.K * alpha) - self.K * polygamma(1, alpha))
def print_topics(self, vocab, n_words=8):
"""
Display the top words in each topic
        :param vocab: a list of words in the vocabulary
        :param n_words: the number of top words to display for each topic
"""
for k in range(self.K):
order = list(np.argsort(self.log_betas[:, k]).tolist())
order.reverse()
print("%d %s" % (k, ' '.join([vocab[i] for i in order[:n_words]])))
def update_params_for_one_item(K, n_word_types, word_indices, counts, gammas, phi_d, log_betas, alpha, max_iter_d,
tol_d):
"""
Optimize the per-document variational parameters for one document, namely gamma and phi.
:param K: the number of topics
:param n_word_types: the number of word types in this document (length of word_indices)
    :param word_indices: an int array of the vocabulary indices of the word types in the document
    :param counts: the corresponding counts of each word type in the document
    :param gammas: the variational parameters of this document to be updated (length-K)
    :param phi_d: n_word_types x K array of the expected distribution of topics for each word type
    :param log_betas: V x K array of the current value of the log of the topic distributions
    :param alpha: current value of the hyperparameter alpha
:param max_iter_d: the maximum number of iterations for this inner optimization loop
    :param tol_d: the tolerance required for convergence of this inner optimization loop
    :return: the value of the variational bound for this document
"""
i = 0
prev_bound = -1000000.0
bound = 0.0
delta = tol_d
psi_gammas = psi(gammas)
# repeat until convergence
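    # Coordinate-ascent updates for the per-document variational parameters:
    #   phi_{n,k} proportional to beta_{w_n,k} * exp(psi(gamma_k))   (normalized over k)
    #   gamma_k   = alpha + sum_n c_n * phi_{n,k}
    # The gamma update below is applied incrementally as each phi row changes.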
while i < max_iter_d and delta >= tol_d:
        # process all of the (word index, count) pairs in this document
n = 0
while n < n_word_types:
w = word_indices[n]
c = counts[n]
            # recompute phi for word type n (proportional to beta[w] * exp(psi(gammas)), normalized over topics)
            new_phi_dn = np.exp(log_betas[w] + psi_gammas - logsumexp(log_betas[w] + psi_gammas))
            gammas += c * (new_phi_dn - phi_d[n])
            phi_d[n] = new_phi_dn
psi_gammas = psi(gammas)
n += 1
# compute the part of the variational bound corresponding to this document
bound = compute_bound_for_one_item(K, n_word_types, word_indices, counts, alpha, gammas, psi_gammas, phi_d, log_betas)
# compute the relative change in the bound
delta = (prev_bound - bound) / prev_bound
# save the new value of the bound
prev_bound = bound
i += 1
return bound
def compute_bound_for_one_item(K, n_word_types, word_indices, count_vector, alpha, gammas, psi_gammas, phi_d,
log_betas):
"""
Compute the parts of the variational bound corresponding to the particular document
:param K: the number of topics
:param n_word_types: the number of word types in this document (length of count_vector)
:param word_indices: a vector of vocabulary indices of the word types in this document
:param count_vector: the corresponding vector of counts of each word type
:param alpha: the current value of the hyperparameter alpha
:param gammas: the current value of gammas for this document
:param psi_gammas: pre-computed values of psi(gammas)
:param phi_d: the expected distribution of topics for each word type in this document
:param log_betas: the current value of the log of the topic distributions
"""
    # E_q[log theta_k] = psi(gamma_k) - psi(sum_k' gamma_k'), used by several terms below
    E_ln_theta = psi_gammas - psi(np.sum(gammas))
    # E_q[log p(z | theta)]: expected topic assignments weighted by E_q[log theta]
    phi_sum = 0.0
    for i in range(n_word_types):
        phi_sum += np.sum(phi_d[i] * E_ln_theta)
    bound = (
        # E_q[log p(theta | alpha)]
        gammaln(K * alpha) - K * gammaln(alpha) + (alpha - 1) * np.sum(E_ln_theta)
        # E_q[log p(z | theta)] + E_q[log p(w | z, beta)]
        + phi_sum + np.sum(phi_d * log_betas[word_indices, :])
        # + H[q(theta)]
        - gammaln(np.sum(gammas)) + np.sum(gammaln(gammas)) - np.sum((gammas - 1) * E_ln_theta)
        # + H[q(z)]
        - np.sum(phi_d * np.log(phi_d + 1e-5))
    )
return bound
if __name__ == "__main__":
from utils import preprocess
#docs, _, vocab = preprocess("./dataset/dataset_cn_full.txt")
docs, _, vocab = preprocess("./dataset/dataset.txt")
lda = LDA(K=10, V=len(vocab))
lda.fit(docs, vocab=vocab, display_topics=True)
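    # Quick sanity check (a sketch, assuming fit() has converged): the variational gammas,
    # normalized per document, approximate the document-topic proportions E_q[theta_d].
    theta_hat = lda.gammas / lda.gammas.sum(axis=1, keepdims=True)
    for d in range(min(3, lda.D)):
        print("doc %d topic proportions: %s" % (d, np.round(theta_hat[d], 3)))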