-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProbabilistic.py
55 lines (50 loc) · 1.91 KB
/
Probabilistic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from collections import defaultdict, OrderedDict
import math
# Strip the weights from the descriptor, keeping only an integer frequency per term
def freqByDoc(descriptor):
    """Return a copy of *descriptor* that keeps a single integer per term.

    For every document and term, the stored value is iterated and its first
    element is converted with ``int``; the rest of the value is dropped.
    (Presumably the first element is the raw term frequency and the remainder
    are weights -- confirm against the code that builds the descriptor.)

    Args:
        descriptor: mapping ``doc -> {term: iterable}``.

    Returns:
        A ``defaultdict(dict)`` mapping ``doc -> {term: int}``.  A document
        that has no terms is not added to the result.

    Raises:
        StopIteration: if a term's value is an empty iterable.
    """
    new_dc = defaultdict(dict)
    for doc in descriptor:
        for term, entry in descriptor[doc].items():
            # Only the first element of the stored value is kept.
            new_dc[doc][term] = int(next(iter(entry)))
    return new_dc
# Score and rank the documents for a query with the BM25 model (parameters K and B)
def BM25(descriptor2, inverse, K, B, query):
    """Rank the documents of a collection against *query* with BM25.

    Each document's score (RSV) is the sum, over the query terms it contains,
    of::

        freq / (K * ((1 - B) + B * dl / avdl) + freq)
            * log10((N - Ni + 0.5) / (Ni + 0.5))

    where ``freq`` is the term's frequency in the document, ``dl`` the
    document length (sum of its frequencies), ``avdl`` the average document
    length, ``N`` the number of documents and ``Ni`` the number of entries
    ``inverse`` holds for the term.

    Args:
        descriptor2: mapping ``doc -> {term: iterable}``; it is reduced to
            integer frequencies with ``freqByDoc``.
        inverse: mapping ``term -> sized collection``; its length is used as
            ``Ni``.  A term missing from it counts as ``Ni = 0``.
        K: multiplier of the length-normalisation term in the denominator.
        B: mix between no length normalisation (0) and full normalisation (1).
        query: sequence of terms.  A term repeated in the query is scored once
            per occurrence.

    Returns:
        A ``(result_txt, liste)`` tuple:

        * ``("Error: empty query.", None)`` when the query is empty or None;
        * ``("No relevant docs.", None)`` when no document gets a non-zero
          score (including an empty collection);
        * otherwise a tab-separated table and an ``OrderedDict`` mapping
          ``int(doc) + 1`` to the score rounded to 4 decimals, sorted by
          decreasing score.

    Raises:
        ValueError: if a scored document key cannot be converted with ``int``,
            or if ``inverse`` holds so many entries for a matching term that
            the logarithm's argument is not positive.
    """
    if query is None or len(query) == 0:
        return "Error: empty query.", None

    descriptor = freqByDoc(descriptor2)

    # Each document length is needed twice (average and per-document
    # normalisation), so compute it once.
    doc_lengths = {doc: sum(freqs.values()) for doc, freqs in descriptor.items()}
    N = len(doc_lengths)  # number of docs
    collection_length = sum(doc_lengths.values())
    if N == 0 or collection_length == 0:
        # Nothing can score, and the average length would divide by zero.
        return "No relevant docs.", None
    avdl = collection_length / N

    # The log factor does not depend on the document, so cache it per term.
    # It is filled lazily: a term that no document contains never reaches
    # math.log10.
    idf_by_term = {}
    scores = {}
    for doc, freqs in descriptor.items():
        dCTE = K * ((1 - B) + B * (doc_lengths[doc] / avdl))
        RSV = 0
        for term in query:
            freq = freqs.get(term)
            if not freq:
                # Term absent, or stored with frequency 0: it contributes
                # nothing (and would divide by zero when K is 0 as well).
                continue
            if term not in idf_by_term:
                Ni = len(inverse[term]) if term in inverse else 0  # appearances
                idf_by_term[term] = math.log10((N - Ni + 0.5) / (Ni + 0.5))
            RSV += (freq / (dCTE + freq)) * idf_by_term[term]
        if RSV == 0:
            continue
        # NOTE(review): the +1 presumably turns 0-based document keys into
        # 1-based display ids -- confirm against the caller.
        scores[int(doc) + 1] = round(RSV, 4)

    if not scores:
        return "No relevant docs.", None

    # Relevant docs in decreasing order of score; the sort is stable, so ties
    # keep the descriptor's order.
    liste = OrderedDict(
        sorted(scores.items(), key=lambda item: item[1], reverse=True)
    )
    lines = ["Doc \t Relevance\n"]
    lines.extend(f"{doc_id} \t {score}\n" for doc_id, score in liste.items())
    return "".join(lines), liste