'''
Everything related to the Vector Space Model
'''
import math
#==============================================================================================
def write_index(docs_path, weighting_method):
    '''
    Creates an inverted index from the documents in docs_path.
    It writes the resulting index to disk as "inverted_index.txt".
    It also returns the index as a dictionary of terms.
    For each term, the index stores a tuple of (document frequency, occurrences).
    "occurrences" is a list of occurrences.
    Each occurrence is represented as a tuple of (document_id, frequency, [word positions]).
    - docs_path: A path (e.g. proj/docs/) to a directory with all the documents
    Returns:
    - The index
    - A dictionary mapping each document ID to its norm
    - A dictionary mapping each document ID to the frequency of its most frequent term
    '''
    index = dict()
    max_doc_freq = dict() #doc_id: int => max_freq: int
    N = 0 #Number of documents
    for i in range(1, 1240): #Document file names are assumed to be 00001 ... 01239
        linecount = 1
        temp = dict() #For each word, the number of occurrences in the current document
        pos = dict()
        #First get the occurrences and the positions for each term
        try:
            f = open(docs_path + f"{i:05}")
            for word in f:
                word = word.strip()
                if word in temp:
                    temp[word] += 1
                    pos[word].append(linecount)
                else:
                    temp[word] = 1
                    pos[word] = [linecount]
                linecount += 1
            f.close()
            #Update the inverted file using data from the new document
            for term, freq in temp.items():
                if term in index:
                    index[term].append((i, freq, pos[term]))
                else:
                    index[term] = [(i, freq, pos[term])]
                if i in max_doc_freq:
                    max_doc_freq[i] = max(max_doc_freq[i], freq)
                else:
                    max_doc_freq[i] = freq
            N += 1
        except FileNotFoundError:
            continue
    #Calculate the norms of each document in the collection, examining one term at a time
    #Write final index to file
    #=====================================================================================================
    doc_norms = dict() #doc_id: int => norm: float
    with open("results/inverted_index.txt", "w") as out:
        for term, postings in index.items():
            #Each posting is p = (doc_id, num_occurrences, [list_of_positions])
            #Store this term's document frequency in the index
            out.write(f"{term}: ({len(postings)}, {postings})\n")
            index[term] = (len(postings), postings)
            #Calculate this term's contribution to each document's norm
            #====================================================================
            for p in postings:
                val = calculate_weight(p[1], max_doc_freq[p[0]], N, len(postings), weighting_method)
                if p[0] in doc_norms: #If this document's norm has been initialized
                    doc_norms[p[0]] += val**2
                else:
                    doc_norms[p[0]] = val**2
    for doc in doc_norms:
        doc_norms[doc] = math.sqrt(doc_norms[doc])
        #print(f"Document #{doc}: {doc_norms[doc]}")
    return index, doc_norms, max_doc_freq
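#Illustrative sketch (hypothetical data, not from the actual collection): after indexing,
#an entry might look like
#    index["SAMPLE"] == (2, [(3, 1, [17]), (42, 2, [5, 9])])
#i.e. "SAMPLE" appears in 2 documents: once in document 3 (position 17) and
#twice in document 42 (positions 5 and 9).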
#==============================================================================================
def freq(index: dict, term: str, doc_id: int):
    '''
    Return a term's frequency inside of a document.
    Uses binary search to locate the requested doc_id, assuming that the postings list is sorted by document id.
    '''
    postings = index[term][1]
    low = 0
    high = len(postings) - 1
    while low <= high:
        mid = (low + high) // 2
        val = postings[mid][0]
        if val == doc_id:
            #print(postings[mid])
            return postings[mid][1]
        elif val < doc_id:
            low = mid + 1
        else:
            high = mid - 1
    return 0
#==============================================================================================
def calculate_weight(f, max_doc_freq, num_docs, doc_freq, weighting_method):
    '''
    Calculate the TF-IDF weight of a term in a document, with one of two implementations.
    Which implementation gets used is determined by the weighting_method parameter.
    (max_doc_freq is accepted for symmetry with calculate_query_weight but is not used here.)
    '''
    if weighting_method == 0: #tfc: term frequency * inverse document frequency
        return f*math.log(num_docs/doc_freq, 10)
    else: #txc: raw term frequency
        return f
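#Worked example (hypothetical numbers): with weighting_method == 0, a term that
#occurs f = 3 times in a document and appears in doc_freq = 10 of num_docs = 1000
#documents gets weight 3 * log10(1000 / 10) = 3 * 2 = 6.
#With weighting_method == 1 the weight is simply the raw frequency, i.e. 3.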
#==============================================================================================
def calculate_query_weight(f, max_query_freq, num_docs, doc_freq, weighting_method):
    '''
    Calculate the TF-IDF weight of a query term (augmented term frequency times inverse document frequency).
    The same formula is used regardless of the weighting_method parameter.
    '''
    return (0.5 + 0.5*f/max_query_freq)*math.log(num_docs/doc_freq, 10)
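#Worked example (hypothetical numbers): a query term with f = 2, max_query_freq = 2,
#num_docs = 1000 and doc_freq = 10 gets weight
#(0.5 + 0.5*2/2) * log10(1000 / 10) = 1.0 * 2 = 2.0.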
#==============================================================================================
def search(index: dict, doc_norms: dict, max_doc_freq: dict, query: str, num_results: int, weighting_method: int) -> list:
    '''
    Search using the Vector Space Model.
    Parameters:
    - index: The inverted file returned by write_index
    - doc_norms: The norms of all documents, returned by write_index
    - max_doc_freq: The frequency of the most frequent term for each document, returned by write_index
    - query: The query string
    - num_results: The number of documents that the model should return
    - weighting_method: Which implementation of TF-IDF weights should be used (0 or 1)
    Returns:
    - A list with the retrieved documents' IDs, sorted in descending order of similarity score
    '''
    result_list = []
    #Remove special characters from the query
    remove_chars = str.maketrans('', '', '?,()')
    query = query.translate(remove_chars).strip().upper().split()
    #Keep only the query terms that actually appear in the index
    term_set = set(filter(lambda term: term in index, query))
    #print(term_set)
    #Get max term frequency in query
    max_query_freq = 0
    for term in term_set:
        temp = query.count(term)
        if temp > max_query_freq:
            max_query_freq = temp
    #Get the similarity between the query and every document
    for doc in range(1, 1240):
        if doc not in doc_norms:
            continue
        similarity = 0
        #Only the terms in the query contribute to the similarity
        for term in term_set:
            #Query weight
            query_weight = calculate_query_weight(query.count(term), max_query_freq, len(doc_norms.keys()), index[term][0], weighting_method)
            #Document weight
            #=======================================================
            doc_weight = 0
            if term in index:
                doc_weight = calculate_weight(freq(index, term, doc), max_doc_freq[doc], len(doc_norms.keys()), index[term][0], weighting_method)
                #print(f"{term}: {doc_weight}")
            similarity += doc_weight*query_weight
        #Normalization
        similarity /= doc_norms[doc]
        #print(doc, similarity)
        result_list.append((similarity, doc))
    #Sort all documents by their similarity to the query
    result_list = sorted(result_list, key = lambda x: x[0], reverse=True)
    #Only return top k results, and only the doc ids (NOT their similarity scores)
    return list(map(lambda x: x[1], result_list[:num_results]))
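#==============================================================================================
#Minimal usage sketch (assumptions: a "docs/" directory containing files named 00001, 00002, ...,
#an existing "results/" directory for the inverted index, and a made-up query string).
if __name__ == "__main__":
    #Build the index with the tfc weighting scheme (weighting_method = 0)
    index, doc_norms, max_doc_freq = write_index("docs/", 0)
    #Retrieve the ten most similar documents for a sample query
    top_docs = search(index, doc_norms, max_doc_freq, "information retrieval models", 10, 0)
    print(top_docs)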