-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpositional_inverted_index.py
153 lines (132 loc) · 8.83 KB
/
positional_inverted_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import re
import sys
import time
# Hindi stop words removed from documents and queries before indexing.
# Kept as a list so existing callers that treat it as a sequence keep working.
# The entry that used to read 'इन ' (with a trailing space) is now 'इन':
# tokens come from str.split(), so a value with whitespace could never match.
stopwords = [
    'अत', 'अपना', 'अपनी', 'अपने', 'अभी', 'अंदर', 'आदि', 'आप', 'इत्यादि', 'इन',
    'इनका', 'इन्हीं', 'इन्हें', 'इन्हों', 'इस', 'इसका', 'इसकी', 'इसके', 'इसमें', 'इसी',
    'इसे', 'उन', 'उनका', 'उनकी', 'उनके', 'उनको', 'उन्हीं', 'उन्हें', 'उन्हों', 'उस',
    'उसके', 'उसी', 'उसे', 'एक', 'एवं', 'एस', 'ऐसे', 'कई', 'कर', 'करता',
    'करते', 'करना', 'करने', 'करें', 'कहते', 'कहा', 'का', 'काफ़ी', 'कि', 'कितना',
    'किन्हें', 'किन्हों', 'किया', 'किर', 'किस', 'किसी', 'किसे', 'की', 'कुछ', 'कुल',
    'के', 'को', 'कोई', 'कौन', 'कौनसा', 'गया', 'घर', 'जब', 'जहाँ', 'जा',
    'जितना', 'जिन', 'जिन्हें', 'जिन्हों', 'जिस', 'जिसे', 'जीधर', 'जैसा', 'जैसे', 'जो',
    'तक', 'तब', 'तरह', 'तिन', 'तिन्हें', 'तिन्हों', 'तिस', 'तिसे', 'तो', 'था',
    'थी', 'थे', 'दबारा', 'दिया', 'दुसरा', 'दूसरे', 'दो', 'द्वारा', 'न', 'नके',
    'नहीं', 'ना', 'निहायत', 'नीचे', 'ने', 'पर', 'पहले', 'पे', 'फिर', 'बनी',
    'बही', 'बहुत', 'बाद', 'बाला', 'बिलकुल', 'भी', 'भीतर', 'मगर', 'मानो', 'मे',
    'में', 'यदि', 'यह', 'यहाँ', 'यही', 'या', 'यिह', 'ये', 'रखें', 'रहा',
    'रहे', 'ऱ्वासा', 'लिए', 'लिये', 'लेकिन', 'व', 'वग़ैरह', 'वर्ग', 'वह', 'वहाँ',
    'वहीं', 'वाले', 'वुह', 'वे', 'सकता', 'सकते', 'सबसे', 'सभी', 'साथ', 'साबुत',
    'साभ', 'सारा', 'से', 'सो', 'संग', 'ही', 'हुआ', 'हुई', 'हुए', 'है',
    'हैं', 'हो', 'होता', 'होती', 'होते', 'होना', 'होने', '',
]
#print(stopwords)
#---- Index Creation-------#
from collections import Counter
from pprint import pprint as pp
from glob import glob
try: reduce
except: from functools import reduce
try: raw_input
except: raw_input = input
#---- Stemmer for Hindi---------------------#
# Suffixes the stemmer may strip, grouped under their length in code points
# so that hi_stemmer can try the longest candidates first.
suffixes = {
    1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
    2: ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
    3: ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
    4: ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
    5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
}


def hi_stemmer(word):
    """Return *word* with its longest matching suffix from ``suffixes`` removed.

    Suffix lengths are tried from 5 down to 1. A suffix of length L is only
    stripped when the word is longer than L + 1 code points, so at least two
    code points always remain. Words with no matching suffix are returned
    unchanged.
    """
    for length in (5, 4, 3, 2, 1):
        # str.endswith accepts a tuple, testing every suffix of this length
        # in a single call.
        if len(word) > length + 1 and word.endswith(tuple(suffixes[length])):
            return word[:-length]
    return word


def stem_terms(terms_list):
    """Return a new list holding the stem of every term in *terms_list*.

    The input list is not modified. Previously the stemmed value was assigned
    to the loop variable and discarded, so the terms came back unstemmed.
    """
    return [hi_stemmer(term) for term in terms_list]
#------------- Parsing: split text on the danda (।), drop punctuation such as commas, then split on whitespace --------------------#
#------------ PARSING TEXT-------------------#
def parsetexts(fileglob='C:\\Users\\laisha wadhwa\\Documents\\sem5\\IR\\project\\IR project_IR_Course\\InputFiles\\*.txt'):
    """Read every file matching *fileglob* and turn it into a list of terms.

    Each file is decoded as UTF-8 (a leading BOM is skipped), the danda
    sentence terminator and a fixed set of punctuation characters are
    removed, the text is split on whitespace, stop words are dropped and the
    remaining tokens are passed through ``stem_terms``.

    Returns a tuple ``(texts, words)``: ``texts`` maps each file's base name
    to its term list, and ``words`` is all of those term lists concatenated
    in the order the files were read.
    """
    # A single translation pass: the danda becomes whitespace (so it still
    # separates tokens) and the listed punctuation characters are deleted.
    cleanup = str.maketrans({
        '।': ' ',
        ',': None,
        '.': None,
        '!': None,
        ')': None,
        '(': None,
        '"': None,
        '\'': None,
    })
    # Set membership avoids a linear scan of the stop-word list per token.
    stopword_set = set(stopwords)
    texts = {}
    words = []
    for txtfile in glob(fileglob):
        # The context manager closes the file even if decoding fails.
        with open(txtfile, encoding='utf-8-sig') as f:
            txt = f.read()
        per_file_words = [
            word for word in txt.translate(cleanup).split()
            if word not in stopword_set
        ]
        per_file_words = stem_terms(per_file_words)
        # basename handles the platform's path separators; splitting on a
        # literal backslash only worked for Windows-style paths.
        filename = os.path.basename(txtfile)
        texts[filename] = per_file_words
        words.extend(per_file_words)
    return texts, words
#------------- Build the corpus when the module is loaded -------------------
# parsetexts() returns (texts, words): `texts` maps each file name to its list
# of terms, and `words` is the concatenation of all of those term lists.
texts, words = parsetexts()
#print('\nTexts')
#pp(texts)
#print('\nWords')
#pp(sorted(words))
def termsearch(terms):
    """Return the set of document names that contain every term in *terms*.

    An empty set is returned as soon as one term is missing from the index.
    When *terms* is empty, every known document name is returned, because
    there is nothing to narrow the result down.

    Each term's postings are looked up directly in ``finvindex`` (whose keys
    are the indexed words) instead of scanning the whole index per query.
    """
    matching = set(texts)
    for term in set(terms):
        postings = finvindex.get(term)
        if postings is None:
            return set()
        # Postings are (document, position) pairs; only the document matters.
        matching &= {doc for doc, _ in postings}
    return matching
def phrasesearch(phrase):
    """Return the documents in which the words of *phrase* occur consecutively.

    Surrounding whitespace and double quotes are stripped from *phrase*, stop
    words are removed and the remaining words are passed through
    ``stem_terms`` before matching against the positional index.

    Returns a list of document names, each listed at most once. An empty set
    is returned when there is nothing to search for (empty phrase or only
    stop words) or when a word is not in the index; the set type of that
    early return is kept from the original behaviour.
    """
    wordsinphrase = [
        word for word in phrase.strip().strip('"').split()
        if word not in stopwords
    ]
    wordsinphrase = stem_terms(wordsinphrase)
    # Guard the empty case explicitly: indexing wordsinphrase[0] below used
    # to raise IndexError for an empty or all-stop-word phrase.
    if not wordsinphrase or any(word not in finvindex for word in wordsinphrase):
        return set()
    firstword, otherwords = wordsinphrase[0], wordsinphrase[1:]
    found = []
    for txt in termsearch(wordsinphrase):
        # Every position at which the first phrase word occurs in this text.
        starts = (indx for t, indx in finvindex[firstword] if t == txt)
        # any() stops at the first full match, so a document is reported once
        # however many times the phrase occurs in it.
        if any(
            all((txt, start + 1 + offset) in finvindex[otherword]
                for offset, otherword in enumerate(otherwords))
            for start in starts
        ):
            found.append(txt)
    return found
def _build_positional_index(texts, words):
    """Map every word in *words* to the set of (document, position) pairs
    at which it occurs in *texts*.

    The documents are walked once. Words that never occur in any document
    keep an empty set, and tokens that are not in *words* are ignored.
    """
    index = {word: set() for word in words}
    for txt, wrds in texts.items():
        for wrdindx, word in enumerate(wrds):
            postings = index.get(word)
            if postings is not None:
                postings.add((txt, wrdindx))
    return index


# Positional inverted index: word -> {(file name, position in that file's term list)}.
finvindex = _build_positional_index(texts, words)
def pos_term_search(phrase):
    """Run a term (AND) query for *phrase* and return a printable report.

    The phrase is stripped of surrounding whitespace and double quotes and
    split on whitespace; duplicates and stop words are dropped and the rest
    is passed through ``stem_terms`` before calling ``termsearch``.

    Returns a list of four strings: a separator line, a heading, a line with
    the elapsed time in seconds (rounded to 4 places) and a numbered listing
    of the matching documents, one per line, sorted by name so the output is
    the same from run to run.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement intended for measuring short durations.
    start = time.perf_counter()
    query_terms = phrase.strip().strip('"').split()
    query_terms = list(set(query_terms) - set(stopwords))
    query_terms = stem_terms(query_terms)
    print('\nTerm Search on full inverted index for: ' + repr(query_terms))
    matches = sorted(termsearch(query_terms))
    listing = ''.join(
        str(rank) + " " + name + '\n' for rank, name in enumerate(matches, 1)
    )
    elapsed = round(time.perf_counter() - start, 4)
    return [
        "===========================================================",
        "\n\n\n-------स्थितित्मक उलटा सूचकांक पर सामान्य खोज के परिणाम---------------\n\n",
        "परिणाम प्राप्त करने के लिए " + " " + str(elapsed) + "s " + "का समय लिया गया",
        listing,
    ]
#phrasal query search
def pos_phrase_search(phrase):
    """Run a phrase query for *phrase* and return a printable report.

    The phrase is wrapped in double quotes for display and passed to
    ``phrasesearch``.

    Returns a list of four strings: a separator line, a heading, a line with
    the elapsed time in seconds (rounded to 4 places) and a numbered listing
    of the documents returned by ``phrasesearch``, one per line, sorted by
    name so the output is the same from run to run.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement intended for measuring short durations.
    start = time.perf_counter()
    phrase = f'"{phrase}"'
    print('\nPhrase Search for: ' + phrase)
    matches = sorted(phrasesearch(phrase))
    listing = ''.join(
        str(rank) + " " + name + '\n' for rank, name in enumerate(matches, 1)
    )
    elapsed = round(time.perf_counter() - start, 4)
    return [
        "===========================================================",
        "\n\n\n-------स्थितित्मक उलटा सूचकांक पर वाक्यांश खोज के परिणाम---------------\n\n",
        "परिणाम प्राप्त करने के लिए " + " " + str(elapsed) + "s " + "का समय लिया गया",
        listing,
    ]