# TFIDF.py

import collections
import math


def get_corpus():
    c1 = "Do you like Green eggs and ham"
    c2 = "I do not like them Sam I am I do not like Green eggs and ham"
    c3 = "Would you like them Here or there"
    c4 = "I would not like them Here or there I would not like them Anywhere"
    return [c1, c2, c3, c4]

def split_into_tokens(data, normalize=True, min_length=0, stopwords=()):
    """Split the incoming string on whitespace and return a list of word tokens.

    If normalize is True, lower-case the tokens (and the stopwords).
    Stopwords are removed, and only tokens longer than min_length are kept.
    """
    arr = data.split()
    if normalize:
        arr = [x.lower() for x in arr]
        stopwords = [elem.lower() for elem in stopwords]
    if stopwords:
        arr = [elem for elem in arr if elem not in stopwords]
    arr = [x for x in arr if len(x) > min_length]
    return arr

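
# A quick illustration of the tokenizer's options (a hypothetical call, not in
# the original file): stopword filtering plus a minimum token length.
#
#   split_into_tokens("Do you like Green eggs", stopwords=["do", "you"], min_length=2)
#   -> ['like', 'green', 'eggs']
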
def test_split():
    corpus = get_corpus()
    doc1 = corpus[0]
    print(split_into_tokens(doc1))

def build_tf(corpus, min_length=0, stopwords=()):
    """Build term frequencies for a corpus (a list of unparsed document strings).

    Returns (vocab, tf): vocab is a Counter of raw term counts over the whole
    corpus, and tf is a list of Counters, one per document, mapping each term
    to its frequency within that document.
    """
    master_list = []
    tf = []
    for document in corpus:
        # Tokenize each document once, then reuse the token list.
        tokens = split_into_tokens(document, min_length=min_length, stopwords=stopwords)
        doc = collections.Counter(tokens)
        for key in doc:
            doc[key] /= len(tokens)
        tf.append(doc)
        master_list += tokens
    vocab = collections.Counter(master_list)
    return vocab, tf

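
# Worked example (derived from get_corpus): document 1, "Do you like Green
# eggs and ham", splits into 7 tokens that each occur once, so every term in
# tf[0] gets TF = 1/7 ≈ 0.143.
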
def test_tf():
    corpus = get_corpus()
    vocab, tf = build_tf(corpus)
    print(tf[0]['eggs'])   # 0.143
    print(tf[3]['there'])  # 0.0714

def build_idf(vocabulary, corpus_tf):
    """Return a collections.Counter such that counter[term] is the IDF for that term.

    IDF = log(N / df), where N is the number of documents and df is the
    number of documents containing the term.
    """
    term_idf = collections.Counter()
    for word in vocabulary:
        term_count = 0
        for doc in corpus_tf:
            if doc[word] > 0:
                term_count += 1
        term_idf[word] = math.log(len(corpus_tf) / term_count)
    return term_idf

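
# Worked example (derived from get_corpus): 'eggs' appears in 2 of the 4
# documents, so idf['eggs'] = log(4/2) ≈ 0.693, while 'like' appears in all
# 4 documents, so idf['like'] = log(4/4) = 0.
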
def compute_TFIDF(doc_tf, idf):
    """Multiply each term's TF in one document by its corpus-wide IDF."""
    x = collections.Counter()
    for key in doc_tf:
        x[key] = idf[key] * doc_tf[key]
    return x

def build_tf_idf(tfs, idf):
    """Compute a TF-IDF Counter for every per-document TF Counter."""
    return [compute_TFIDF(doc_tf, idf) for doc_tf in tfs]

def test_tfidf():
    corpus = get_corpus()
    vocab, tf = build_tf(corpus)
    idf = build_idf(vocab, tf)
    tfidf = build_tf_idf(tf, idf)
    print(tfidf[0]['eggs'])  # 1/7 * log(4/2) ≈ 0.099
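

# A minimal driver, not part of the original listing, so the tests above run
# when this module is executed directly.
if __name__ == "__main__":
    test_split()
    test_tf()
    test_tfidf()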