-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathboolean.py
87 lines (68 loc) · 2.12 KB
/
boolean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import re
import string
import glob
import pandas as pd
from typing import List
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
# Character class matching any single ASCII punctuation character.
# The punctuation string is passed through re.escape so that `\`, `]`, `^`
# and `-` are taken literally inside the class. Unescaped, the backslash only
# escaped the `]` that follows it and was itself never matched, so a
# backslash in the input was not turned into a word separator.
punctuation = re.compile('[' + re.escape(string.punctuation) + ']')
# Matches every character that is neither a lowercase ASCII letter nor
# whitespace.
nonstandard = re.compile(r'[^a-z\s]')
# Directory whose files make up the corpus (one file per document).
DATA_DIR = 'stemmed_biographies'
# Porter stemmer used to reduce query tokens to their stems.
stemmer = PorterStemmer()
# English stop-word list from the NLTK corpus; tokens found here are
# discarded during preprocessing.
stop_words = stopwords.words('english')
def preprocess(text: str) -> List[str]:
    """Turn a raw text string into a list of stemmed vocabulary tokens.

    The text is lowercased, characters matched by the module-level
    ``punctuation`` pattern are replaced with spaces, characters matched by
    ``nonstandard`` are removed, and the result is tokenized with
    ``word_tokenize``. Tokens of length one and tokens listed in
    ``stop_words`` are dropped; the survivors are stemmed, and only stems
    present in ``unique_words`` are kept.

    Args:
        text: The raw input string (e.g. a user query).

    Returns:
        The surviving stems, in the order their tokens appeared in ``text``.
    """
    cleaned = punctuation.sub(' ', text.lower())
    cleaned = nonstandard.sub('', cleaned)

    stems = []
    for token in word_tokenize(cleaned):
        # Skip single-character tokens and stop words before stemming.
        if len(token) <= 1 or token in stop_words:
            continue
        stem = stemmer.stem(token)
        # Only stems that occur somewhere in the corpus can be looked up.
        if stem in unique_words:
            stems.append(stem)
    return stems
# Read every corpus file exactly once and remember, per document, the set of
# distinct whitespace-separated tokens it contains. File names are sorted so
# that the row order of `docs` (and therefore which matches come first) is
# reproducible; glob itself returns paths in arbitrary order.
corpus = []
for filename in sorted(glob.glob(f'{DATA_DIR}/*')):
    # The context manager closes the handle even if reading raises.
    with open(filename, 'r') as file:
        tokens = set(file.read().split())
    # Strip the leading "<DATA_DIR>/" prefix to get the bare document name.
    corpus.append((filename[len(DATA_DIR) + 1:], tokens))

# Vocabulary: every distinct token seen anywhere in the corpus.
unique_words = set()
for _, tokens in corpus:
    unique_words.update(tokens)

# Term-document incidence matrix: one row per document, one 0/1 column per
# vocabulary word, plus the document name. Columns are sorted so the layout
# does not depend on set iteration order.
columns = ['DOCUMENT_NAME'] + sorted(unique_words)
docs = pd.DataFrame(
    [
        {
            'DOCUMENT_NAME': name,
            **{word: int(word in tokens) for word in unique_words},
        }
        for name, tokens in corpus
    ],
    columns=columns,
)

# Maximum number of document names returned for a query.
k = 10
# query = input('What are you looking for? ')
# while query != 'exit':
# query = preprocess(query)
#
# ans = docs.copy()
#
# for word in query:
# if len(ans) > 0:
# ans = ans[ans[word] == 1]
#
# ans = list(ans.head(k)['DOCUMENT_NAME'])
# ans = [word.replace('_', ' ') for word in ans]
# print(*ans, sep='\n')
# query = input('What are you looking for? ')
def query_result(query, limit=None):
    """Return the names of documents that contain every term of ``query``.

    The query is normalized with ``preprocess`` and the rows of ``docs`` are
    narrowed one term at a time, so a document matches only if its column for
    each remaining term is 1 (boolean AND). Terms that ``preprocess`` discards
    do not constrain the result; if no terms remain, the first ``limit``
    documents are returned unfiltered.

    Args:
        query: Raw query string.
        limit: Maximum number of names to return. Defaults to the
            module-level ``k`` when omitted or ``None``.

    Returns:
        A list of at most ``limit`` document names with underscores replaced
        by spaces, in ``docs`` row order. Join it with ``'\\n'`` to obtain a
        printable string.
    """
    if limit is None:
        limit = k

    # Boolean indexing returns a new frame, so `docs` is never mutated and
    # does not need to be copied up front.
    matches = docs
    for term in preprocess(query):
        if matches.empty:
            # Nothing left to narrow down; further terms cannot add rows.
            break
        matches = matches[matches[term] == 1]

    names = matches.head(limit)['DOCUMENT_NAME']
    return [name.replace('_', ' ') for name in names]