Tokenization.py
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
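# NOTE: word_tokenize() needs the NLTK 'punkt' models and stopwords.words()
# needs the 'stopwords' corpus; if they are missing, fetch them once with
# nltk.download('punkt') and nltk.download('stopwords').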

class Tokenization:
    def __init__(self):
        self.query = []

    def tokenization(self, query):
        # Tokenize a query and drop English stop words, keeping 'in', 'to'
        # and 'where' because they carry meaning in these queries.
        stopWords = set(stopwords.words('english'))
        stopWords.remove('in')
        stopWords.remove('to')
        stopWords.remove('where')
        tokens = word_tokenize(query)
        # Compare lowercased tokens against the (lowercase) stop word list.
        filteredTokens = [word.lower() for word in tokens if word.lower() not in stopWords]
        return filteredTokens
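    # Example (hypothetical input): tokenization("Where are the results in 2020")
    # would return roughly ['where', 'results', 'in', '2020'], since 'in', 'to'
    # and 'where' are deliberately kept while other stop words are dropped.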

    @staticmethod
    def tokenizationn(query):
        # Static counterpart of tokenization(), used by process_query() through
        # the class itself rather than an instance.
        stopWords = set(stopwords.words('english'))
        stopWords.remove('in')
        stopWords.remove('to')
        stopWords.remove('where')
        tokens = word_tokenize(query)
        filteredTokens = [word.lower() for word in tokens if word.lower() not in stopWords]
        return filteredTokens

    @staticmethod
    def get_p_index(docslist):
        # Build a positional index: term -> [document frequency, {doc: [positions]}].
        index = {}
        for doc in docslist:
            with open(doc) as f:
                text = f.read()
            b = Tokenization()
            tokens = b.tokenization(text)
            for i in range(len(tokens)):
                if tokens[i] not in index:
                    # First occurrence of the term: document frequency starts at 1.
                    index[tokens[i]] = [1, {doc: [i + 1]}]
                else:
                    wordMap = index[tokens[i]][1]
                    if doc not in wordMap:
                        # Term seen in a new document: bump the document frequency.
                        index[tokens[i]][0] += 1
                        wordMap[doc] = []
                    wordMap[doc].append(i + 1)
        return index
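    # Example entry (hypothetical documents 'doc1.txt' and 'doc2.txt'):
    #   index['where'] -> [2, {'doc1.txt': [3, 17], 'doc2.txt': [5]}]
    # i.e. [document frequency, {document: 1-based token positions}].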

    @staticmethod
    def retrieve_list(word, index):
        # Return the postings entry for a term, or [] if the term is not indexed.
        ans = []
        if word in index:
            ans = index[word]
        else:
            print('Term : {} not present in dictionary'.format(word))
        return ans

    @staticmethod
    def check(res, post):
        # Keep only documents in which some position of the current term (post)
        # immediately follows a position of the previous term (res).
        listt = list(res[1].keys())
        keys = list(post[1].keys())
        s = [[], {}]
        for i in range(len(keys)):
            if keys[i] in listt:
                for j in range(len(post[1][keys[i]])):
                    c = 0
                    for k in range(len(res[1][keys[i]])):
                        if post[1][keys[i]][j] == res[1][keys[i]][k] + 1:
                            # Adjacent positions found: keep this document's postings.
                            s[1][keys[i]] = post[1][keys[i]]
                            c = 1
                            break
                    if c == 1:
                        break
        return s
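    # Example (hypothetical): if res[1] = {'doc1.txt': [4]} holds positions of
    # the previous term and post[1] = {'doc1.txt': [5, 9]} holds positions of
    # the current term, check(res, post) returns [[], {'doc1.txt': [5, 9]}]
    # because position 5 directly follows position 4.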

    @staticmethod
    def process_query(query, index):
        # Answer a phrase-style query: every term must be present, and each term
        # must directly follow the previous one in the matching documents.
        b = Tokenization
        res = []
        query = query.lower()
        text = b.tokenizationn(query)
        if len(text) == 1:
            # Single-term query: return every document that contains the term.
            post = b.retrieve_list(text[0], index)
            if post == []:
                return res
            return list(post[1].keys())
        for i in range(len(text)):
            if i == 0:
                post = b.retrieve_list(text[0], index)
                if post == []:
                    return res
                res = post
            else:
                post = b.retrieve_list(text[i], index)
                if post == []:
                    return []
                # Keep only documents where this term is adjacent to the previous result.
                res = b.check(res, post)
        return list(res[1].keys())
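

if __name__ == '__main__':
    # Minimal usage sketch; 'doc1.txt' and 'doc2.txt' are hypothetical files
    # that would have to exist alongside this script, and the query is only
    # an illustration.
    docs = ['doc1.txt', 'doc2.txt']
    index = Tokenization.get_p_index(docs)
    print(Tokenization.process_query('students in college', index))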