compute_ref_counts.py
import re
import os
import codecs
from collections import Counter
from optparse import OptionParser
from scipy import sparse
import file_handling as fh


# Count word occurrence statistics for computing NPMI, on a file with one document per line.
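# Example invocation (the paths here are hypothetical):
#   python compute_ref_counts.py reference_docs.txt output/ ref
# This writes output/ref.npz (a sparse document-by-word count matrix) and
# output/ref.vocab.json (the corresponding vocabulary, in column order).
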
def main():
    usage = "%prog infile.txt output_dir output_prefix"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='max_lines', default=None,
                      help='Quit after processing this many lines (documents): default=%default')
    #parser.add_option('--lower', action="store_true", dest="lower", default=False,
    #                  help='Lower case words: default=%default')

    (options, args) = parser.parse_args()
    infile = args[0]
    output_dir = args[1]
    output_prefix = args[2]
    max_lines = options.max_lines
    if max_lines is not None:
        max_lines = int(max_lines)
    vocab = []
    vocab_index = {}
    counter = Counter()

    # start by converting each document into a dict of word counts, building a vocab as we go
    rows = []
    cols = []
    values = []
    n_docs = 0
    print("Counting words...")
    with codecs.open(infile, 'r', encoding='utf-8') as f:
        for line_i, line in enumerate(f):
            line = line.strip()
            if len(line) > 0:
                if max_lines is not None and line_i >= max_lines:
                    print("Quitting after processing %d lines" % line_i)
                    break
                # periodically report progress
                if n_docs % 1000 == 0 and n_docs > 0:
                    print(n_docs)
                # split on whitespace
                words = line.split()
                # filter out everything that's not purely letters, and lower-case
                words = [word.lower() for word in words if re.match('^[a-zA-Z]*$', word) is not None]
                # look for new words and add them to the vocabulary; checking one
                # word at a time ensures that a word which repeats within a single
                # line is only added to the vocab once
                for word in words:
                    if word not in vocab_index:
                        vocab_index[word] = len(vocab)
                        vocab.append(word)
                indices = [vocab_index[word] for word in words]
                # reuse a single Counter to get this document's count for each word index
                counter.clear()
                counter.update(indices)
                keys = list(counter.keys())
                counts = list(counter.values())
                # use n_docs (not line_i) as the row index, so that blank lines in
                # the input don't produce row indices beyond the final matrix shape
                rows.extend([n_docs] * len(keys))
                cols.extend(keys)
                values.extend(counts)
                n_docs += 1
print("Processed %d documents" % n_docs)
print("Size of final vocab = %d" % len(vocab))
print("Saving counts...")
# now convert these count vectors in to a giant sparse matrix
counts = sparse.coo_matrix((values, (rows, cols)), shape=(n_docs, len(vocab)))
fh.save_sparse(counts, os.path.join(output_dir, output_prefix + '.npz'))
fh.write_to_json(vocab, os.path.join(output_dir, output_prefix + '.vocab.json'))
print("Done")


if __name__ == '__main__':
    main()