-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscorer.py
45 lines (32 loc) · 1.46 KB
/
scorer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import string
from math import log10, sqrt
from collections import defaultdict
#TODO: refactor to utils class
def normalized_word_frequency(text, stopwords):
    """Count how often each non-stopword term occurs in *text*.

    ASCII punctuation (``string.punctuation``) is deleted, the remaining
    text is split on whitespace, every word is lower-cased, and words that
    are members of *stopwords* are dropped.

    Args:
        text: The raw string to tokenize.
        stopwords: Container used only for ``in`` membership tests. Words are
            compared after lower-casing, so entries must be lower-case to
            match.

    Returns:
        A ``defaultdict(int)`` mapping each remaining term to its raw
        occurrence count. Despite the function name, the counts are not
        scaled in any way.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    hist = defaultdict(int)
    # Stream the tokens instead of materializing a filtered list first.
    for word in text.translate(strip_punctuation).split():
        # Lower-case once and reuse it for the stopword test and the key.
        term = word.lower()
        if term not in stopwords:
            hist[term] += 1
    return hist
class Scorer:
    """Score documents against a free-text query using tf-idf weights.

    The scorer reads four attributes of the index it is given:
    ``stopwords``, ``term_weight`` (term -> iterable of
    ``(document, weight)`` pairs), ``document_frequency`` (term -> count)
    and ``documents_length`` (document -> length; its number of entries is
    used as the collection size).
    """

    def __init__(self, index):
        """Keep a reference to the index used for all later scoring."""
        self.index = index

    def calculate_scores(self, query):
        """Return a length-normalized score for every document matching *query*.

        The query is tokenized with ``normalized_word_frequency``. For each
        distinct query term present in ``index.term_weight`` the query-side
        weight is ``log10(N / df)``, and every posting contributes
        ``query_weight * document_weight`` to that document's score. Each
        accumulated score is then divided by
        ``documents_length[document] * sqrt(sum of squared query weights)``.

        Args:
            query: The raw query string.

        Returns:
            A ``defaultdict(int)`` mapping each document that shares at least
            one term with the query to its score. It is empty when no query
            term is present in the index.
        """
        index = self.index
        scores = defaultdict(int)
        query_terms = normalized_word_frequency(query, index.stopwords)
        collection_size = len(index.documents_length)
        query_length = 0
        for term in query_terms:
            # TODO: how to correctly deal with query terms that don't occur
            # in any document?
            if term not in index.term_weight:
                continue
            # We assume that every term in a query only occurs once, so the
            # tf part would be (1 + log10(1)) == 1 and only idf remains.
            tf_idf_tq = log10(collection_size / index.document_frequency[term])
            query_length += tf_idf_tq ** 2
            for document, tf_idf_td in index.term_weight[term]:
                scores[document] += tf_idf_tq * tf_idf_td

        # Length normalization. The query norm is loop-invariant, so it is
        # computed once.
        query_norm = sqrt(query_length)
        for document in scores:
            denominator = index.documents_length[document] * query_norm
            # A term found in every document has idf log10(1) == 0; if all
            # matched terms are like that the query norm is 0 (and so is
            # every numerator). Report 0.0 instead of raising
            # ZeroDivisionError; the same applies to a zero document length.
            scores[document] = scores[document] / denominator if denominator else 0.0
        return scores