-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpos_index.py
100 lines (90 loc) · 3.47 KB
/
pos_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# pos_index.py - Be sure to run in python 3.6.3^
# Author: Joe Williams
import sys
import pickle # For serializing data
import os.path # For checking whether a file exist
from nltk.stem import PorterStemmer as ps # For stemming and word tokenization
def isint(s):
    """Report whether ``s`` is accepted by the built-in ``int()``.

    Returns True when ``int(s)`` succeeds and False when it raises
    ValueError. Any other exception raised by ``int()`` propagates.
    """
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True
# Takes a file that has a list of files
def getInputFiles(filelist):
    """Return the non-empty lines of the text file at ``filelist``.

    The file content is split on the newline character only and empty
    entries are discarded; lines are otherwise returned unmodified (no
    whitespace stripping).
    """
    with open(filelist) as listing:
        contents = listing.read()
    # filter(None, ...) drops exactly the empty strings left by blank lines
    return list(filter(None, contents.split("\n")))
# Document paths to index, one per non-empty line of input-files.txt.
# Read once at start-up; a document's id is its position in this list.
files = getInputFiles("input-files.txt")
# Removes most special characters and lowercases the text
def preprocess(data):
    """Normalise raw text before tokenising.

    Sentence-level punctuation and newlines become a single space each,
    the remaining listed special characters (including tab) are deleted,
    and the result is lower-cased.
    """
    # Characters that separate words: each one is turned into a space.
    to_space = "\n;?:!.,"
    # Characters that are removed outright without leaving a gap.
    to_drop = "@#$%^&<>*()[}{]-=/\"\\\t"
    table = str.maketrans(to_space, " " * len(to_space), to_drop)
    return data.translate(table).lower()
# For each file, opens and adds it to the hashmap
def createPositionalIndex(files):
    """Build a positional index over the given documents.

    Args:
        files: paths of the documents to index. A document's id is its
            position in this sequence.

    Returns:
        A dict mapping each stemmed token to a list of
        ``(doc_id, word_index)`` tuples in order of occurrence, where
        ``word_index`` is the token's position in the document after
        ``preprocess`` and splitting on spaces (empty tokens dropped).
    """
    # One stemmer for the whole run; constructing one per word is wasted work.
    stemmer = ps()
    index = {}
    for doc_id, path in enumerate(files):
        # Read the whole document first so the file is closed before indexing.
        with open(path) as f:
            text = f.read()
        tokens = [tok for tok in preprocess(text).split(' ') if tok != ""]
        for position, token in enumerate(tokens):
            index.setdefault(stemmer.stem(token), []).append((doc_id, position))
    return index
# Shows a preview for each matched position; radius controls how
# much text is shown around the word that was found
def showPreview(positions,radius):
    """Print a numbered text snippet for every matched position.

    Args:
        positions: iterable of ``(doc_id, word_index)`` pairs, where
            ``doc_id`` indexes the list read from input-files.txt and
            ``word_index`` is a token position in that document.
        radius: window size in words; the snippet covers tokens from
            ``word_index - radius`` up to, but not including,
            ``word_index + radius``.

    Each line is printed as ``<n>: ...<snippet>... <file name>``, followed
    by a single blank line after all results.
    """
    # Read the document list once per call instead of once per match.
    docArr = getInputFiles("input-files.txt")
    # Tokenised documents, keyed by doc_id, so each file is read only once.
    tokens_by_doc = {}
    for i, (doc_id, word_index) in enumerate(positions):
        if doc_id not in tokens_by_doc:
            with open(docArr[doc_id]) as f:
                tokens_by_doc[doc_id] = [
                    a for a in preprocess(f.read()).split(' ') if a != ""
                ]
        wordArr = tokens_by_doc[doc_id]
        # Clamp at 0: a negative start would wrap around to the end of the
        # document and yield an empty or unrelated snippet for early matches.
        start = max(0, word_index - radius)
        result = " ".join(wordArr[start:word_index + radius])
        print(str(i+1)+": ..."+result+"... "+docArr[doc_id].split("/")[-1])
    # NOTE(review): blank line emitted once after all results; the source's
    # indentation was lost, so confirm this was not meant to be per result.
    print()
# Serialization/Positional Index
# Build the positional index on the first run and cache it in "index_data";
# later runs load the cached copy instead of re-reading the documents.
# NOTE: unpickling can execute code embedded in the file, so "index_data"
# must only ever be a file written by this script.
pi = {}
if not os.path.isfile("index_data"):
    print("Processing and serializing data for future use...")
    pi = createPositionalIndex(files)
    with open("index_data","wb") as f:
        pickle.dump(pi,f)
else:
    print("Loading data...")
    with open("index_data","rb") as f:
        pi = pickle.load(f)
# User interface and positional index querying
def _normalizeTerm(term):
    """Strip special characters from a query term and stem it."""
    return ps().stem(preprocess(term).replace(' ', ''))


def _findMatches(postings1, postings2, accept):
    """Return ``(doc, index1)`` for each same-document pair that passes.

    ``postings1`` and ``postings2`` are lists of ``(doc_id, word_index)``;
    ``accept(index1, index2)`` decides whether a pair counts as a match.
    Results keep the order of ``postings1`` (then ``postings2``).
    """
    return [
        (doc1, index1)
        for doc1, index1 in postings1
        for doc2, index2 in postings2
        if doc1 == doc2 and accept(index1, index2)
    ]


# Prompt repeatedly for "word word" (phrase) or "word word <int>"
# (proximity) queries until the user enters /exit.
while True:
    print("Enter Query: 'word word <int>'")
    sys.stdout.write("'/exit' to close > ")
    try:
        line = input()
    except EOFError:
        # stdin was closed (Ctrl-D or end of piped input): quit cleanly
        # instead of dying with a traceback.
        print()
        sys.exit()
    q = [a for a in line.lower().split(' ') if a != ""]

    if len(q) == 1 and q[0] == '/exit':
        sys.exit()
    if len(q) not in (2, 3):
        print("Needs to have 2 or 3 args\n")
        continue
    if len(q) == 3 and not isint(q[2]):
        print("arg 3 needs to be an int\n")
        continue

    # A term that is not in the index has no postings; using .get() avoids
    # the KeyError that would otherwise terminate the program.
    postings1 = pi.get(_normalizeTerm(q[0]), [])
    postings2 = pi.get(_normalizeTerm(q[1]), [])
    print("Searching... \n")

    if len(q) == 2:
        # Phrase query: the first word immediately followed by the second.
        matches = _findMatches(
            postings1, postings2, lambda i1, i2: i1 == (i2 - 1)
        )
        showPreview(matches, 5)
    else:
        rad = int(q[2])
        # Proximity query: the words are at most rad positions apart in
        # either direction; distance 0 means the word matched itself.
        matches = _findMatches(
            postings1, postings2, lambda i1, i2: 0 < abs(i1 - i2) <= rad
        )
        showPreview(matches, 5 if rad <= 5 else rad)