-
Notifications
You must be signed in to change notification settings - Fork 4
/
compute_parameter_npmi.py
executable file
·122 lines (102 loc) · 4.78 KB
/
compute_parameter_npmi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from optparse import OptionParser
import numpy as np
from tqdm import trange
import file_handling as fh
# compute topic coherence in terms of NPMI with respect to a reference corpus
def main(topics_file):
    """Parse command-line options and compute NPMI coherence for one topics file.

    The topics file path is supplied by the caller. The reference counts file
    and reference vocabulary file are taken from the second and third
    positional command-line arguments; the first positional argument is not
    read by this function.

    Options:
        -n  comma-separated list of top-word counts to evaluate (default '10')
        -c  number of leading columns to skip on each topic line (default 0)

    Args:
        topics_file: path of the topics file, passed through to
            load_and_compute_npmi.

    Returns:
        The value returned by load_and_compute_npmi for the parsed settings.

    Exits with the usage message (via ``parser.error``) when fewer than three
    positional arguments are given, instead of failing with an IndexError.
    """
    usage = "%prog topics.txt ref_counts.npz ref_vocab.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-n', dest='n_vals', default='10',
                      help='Number of words to consider (comma-separated): default=%default')
    parser.add_option('-c', dest='cols', default=0,
                      help='Columns to skip (for Mallet output): default=%default')

    (options, args) = parser.parse_args()
    if len(args) < 3:
        # args[1] and args[2] are required below; report the problem with
        # the usage text rather than an opaque IndexError.
        parser.error('expected 3 positional arguments, got %d' % len(args))

    ref_counts_file = args[1]
    ref_vocab_file = args[2]

    n_vals = [int(n) for n in options.n_vals.split(',')]
    cols_to_skip = int(options.cols)

    output_file = None

    return load_and_compute_npmi(topics_file, ref_vocab_file, ref_counts_file, n_vals, cols_to_skip, output_file=output_file)
def load_and_compute_npmi(topics_file, ref_vocab_file, ref_counts_file, n_vals, cols_to_skip=0, output_file=None):
    """Load the reference vocabulary and counts from disk, then compute NPMI.

    Args:
        topics_file: path of the topics file, forwarded to compute_npmi.
        ref_vocab_file: path read with ``fh.read_json`` to obtain the
            reference vocabulary.
        ref_counts_file: path read with ``fh.load_sparse``; the result is
            converted to CSC format before use.
        n_vals: iterable of top-word counts to evaluate.
        cols_to_skip: number of leading columns to drop from each topic line.
        output_file: optional path forwarded to compute_npmi for writing
            the results.

    Returns:
        The list of mean NPMI values returned by compute_npmi.
    """
    ref_vocab = fh.read_json(ref_vocab_file)
    ref_counts = fh.load_sparse(ref_counts_file).tocsc()
    # Evaluate exactly once: a second, discarded call would repeat the whole
    # computation and rewrite the output file for no benefit.
    return compute_npmi(topics_file, ref_vocab, ref_counts, n_vals, cols_to_skip, output_file)
def compute_npmi(topics_file, ref_vocab, ref_counts, n_vals, cols_to_skip=0, output_file=None):
    """Compute the mean NPMI of the topics in a file for each requested top-n.

    Args:
        topics_file: path read with ``fh.read_text``; the result is iterated
            as a sequence of topic strings (presumably one topic per line --
            verify against ``file_handling``).
        ref_vocab: reference vocabulary, forwarded to compute_npmi_at_n.
        ref_counts: reference count matrix, forwarded to compute_npmi_at_n.
        n_vals: iterable of top-word counts to evaluate.
        cols_to_skip: number of leading columns to drop from each topic line.
        output_file: if not None, one "<n> <value>" line per entry of n_vals
            is written there with ``fh.write_list_to_text``.

    Returns:
        A list with one mean NPMI value per entry of n_vals, in order.
    """
    topic_lines = fh.read_text(topics_file)

    mean_vals = [
        compute_npmi_at_n(topic_lines, ref_vocab, ref_counts, top_n, cols_to_skip=cols_to_skip)
        for top_n in n_vals
    ]

    if output_file is None:
        return mean_vals

    report = []
    for top_n, score in zip(n_vals, mean_vals):
        report.append(' '.join((str(top_n), str(score))))
    fh.write_list_to_text(report, output_file)
    return mean_vals
def compute_npmi_at_n(topics, ref_vocab, ref_counts, n=10, cols_to_skip=0):
    """Return the mean NPMI coherence of topics over their first n words.

    For every topic, each unordered pair among its first ``n`` words is
    scored with normalized pointwise mutual information, using document
    co-occurrence in ``ref_counts`` (a word counts as present in a document
    when its entry is greater than zero). The pair scores are averaged per
    topic and the topic averages are averaged again.

    Pair scoring rules:
      * either word missing from ``ref_vocab``       -> 0.0
      * the two words never share a document          -> 0.0
      * the two words share every document            -> 1.0 (the formula is
        0/0 there; 1.0 is the value it takes for any complete co-occurrence
        in fewer than all documents)
      * otherwise (log N + log c12 - log c1 - log c2) / (log N - log c12)

    Args:
        topics: iterable of whitespace-separated topic strings.
        ref_vocab: sequence of words; position i names column i of
            ``ref_counts``.
        ref_counts: sparse matrix with one row per document and one column
            per vocabulary word; must support ``[:, j].todense()``.
        n: number of leading words of each topic to consider.
        cols_to_skip: number of leading whitespace-separated fields to drop
            from each topic string before taking words.

    Returns:
        The mean over topics of each topic's mean pair NPMI. A topic with
        fewer than two words yields NaN for its mean (mean of no pairs),
        which propagates to the result.
    """
    vocab_index = {word: position for position, word in enumerate(ref_vocab)}
    n_docs, _ = ref_counts.shape
    log_n_docs = np.log10(n_docs)

    npmi_means = []
    for topic in topics:
        top_words = topic.split()[cols_to_skip:][:n]

        # Densify each in-vocabulary word's column once per topic instead of
        # once per pair; store the boolean presence vector with its count.
        presence = {}
        for word in top_words:
            index = vocab_index.get(word)
            if index is None or word in presence:
                continue
            column = np.asarray(ref_counts[:, index].todense() > 0).ravel()
            presence[word] = (column, np.count_nonzero(column))

        npmi_vals = []
        for word_i, word1 in enumerate(top_words):
            entry1 = presence.get(word1)
            for word2 in top_words[word_i + 1:]:
                entry2 = presence.get(word2)
                if entry1 is None or entry2 is None:
                    # Out-of-vocabulary words contribute a neutral score.
                    npmi_vals.append(0.0)
                    continue

                col1, c1 = entry1
                col2, c2 = entry2
                c12 = np.count_nonzero(col1 & col2)
                if c12 == 0:
                    npmi = 0.0
                elif c12 == n_docs:
                    # Denominator would be zero; avoid producing NaN.
                    npmi = 1.0
                else:
                    log_c12 = np.log10(c12)
                    npmi = (log_n_docs + log_c12 - np.log10(c1) - np.log10(c2)) / (log_n_docs - log_c12)
                npmi_vals.append(npmi)

        npmi_means.append(np.mean(npmi_vals))

    return np.mean(npmi_means)
def compute_npmi_at_n_during_training(beta, ref_counts, n=10, smoothing=0.01):
    """Return the mean NPMI of each topic's n highest-weighted vocabulary columns.

    For every row of ``beta`` the ``n`` largest entries select columns of
    ``ref_counts``. Each selected column is turned into a 0/1 document
    presence indicator (entry greater than zero) and ``smoothing`` is added
    to every element. Every unordered pair of selected columns is scored as

        (log N + log c12 - log c1 - log c2) / (log N - log c12)

    where c1 and c2 are the sums of the smoothed columns, c12 is the sum of
    their element-wise product and N is the number of rows of ``ref_counts``.
    A pair with c12 == 0 scores 0.0. No guard is applied when c12 equals N,
    so that case divides by zero exactly as the formula dictates.

    Args:
        beta: 2-D array indexed as [topic, vocabulary column].
        ref_counts: sparse matrix with one row per document; must support
            ``(ref_counts[:, j] > 0).todense()``.
        n: number of top-weighted columns to use per topic.
        smoothing: constant added to every element of each presence column.

    Returns:
        The mean over topics of each topic's mean pair score.
    """
    n_docs, _ = ref_counts.shape
    n_topics = beta.shape[0]
    log_n_docs = np.log10(n_docs)

    npmi_means = []
    for k in range(n_topics):
        # Indices of the n largest weights, largest first.
        indices = np.argsort(beta[k, :])[::-1][:n]

        # Build each smoothed presence column and its sum once per topic
        # rather than once per pair.
        columns = [
            np.array((ref_counts[:, index] > 0).todense(), dtype=int) + smoothing
            for index in indices
        ]
        totals = [column.sum() for column in columns]

        npmi_vals = []
        for i, (col1, c1) in enumerate(zip(columns, totals)):
            for col2, c2 in zip(columns[i + 1:], totals[i + 1:]):
                c12 = np.sum(col1 * col2)
                if c12 == 0:
                    npmi = 0.0
                else:
                    npmi = (log_n_docs + np.log10(c12) - np.log10(c1) - np.log10(c2)) / (log_n_docs - np.log10(c12))
                npmi_vals.append(npmi)

        npmi_means.append(np.mean(npmi_vals))

    return np.mean(npmi_means)
if __name__ == '__main__':
    # Script entry point: evaluate the first requested NPMI value for 200
    # numbered runs of one hard-coded model and print the collected list.
    model = 'imdb_12'

    # The run directory prefix depends only on the model, so pick it once.
    if model == 'imdb_12':
        run_prefix = './outputs/parameter_scholar/contrastive_imdb_12_'
    else:
        run_prefix = './outputs/parameter_scholar/contrastive_imdb_11_'

    npmi_list = []
    for run_id in trange(200):
        topics_file = run_prefix + str(run_id) + '/topics.txt'
        # main returns one value per requested n; keep only the first.
        first_score = main(topics_file)[0]
        npmi_list.append(first_score)

    print(npmi_list)