# lang_proc.py
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
import itertools
import string
_stop_words = stopwords.words('english')

class Term(object):
    """A single token, compared and hashed by its stem so inflected forms match."""

    def __init__(self, full_word):
        self.full_word = full_word
        # TODO: Lemmatization requires downloads
        # wnl = WordNetLemmatizer()
        # lemmas = [wnl.lemmatize(token) for token in tokens]
        self.stem = PorterStemmer().stem(full_word).lower()

    def __eq__(self, other):
        return self.stem == other.stem

    def __hash__(self):
        return hash(self.stem)

    def __repr__(self):
        return "Term {}({})".format(self.stem.encode('utf8'), self.full_word.encode('utf8'))

    def __str__(self):
        return repr(self)

    def is_punctuation(self):
        return self.stem in string.punctuation

    def is_stop_word(self):
        return self.full_word in _stop_words

def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    # List comprehension (rather than filter()) so callers always get a list,
    # including on Python 3.
    return [term for term in terms if not term.is_punctuation()]

def to_query_terms(query_raw):
    # In case query and doc require different processing in the future
    return stem_and_tokenize_text(query_raw)

def remove_stopwords(text):
    # Operates on raw (unstemmed) words; joining avoids the stray leading space
    # that repeated string concatenation would produce.
    return " ".join(word for word in text.split() if word not in _stop_words)

def to_doc_terms(doc_raw):
    # In case query and doc require different processing in the future
    return stem_and_tokenize_text(doc_raw)
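

# Minimal usage sketch: assumes the required NLTK data (e.g. the 'punkt'
# tokenizer models and the 'stopwords' corpus) has already been downloaded
# via nltk.download().
if __name__ == "__main__":
    doc_terms = to_doc_terms("Cats are chasing the mice.")
    query_terms = to_query_terms("chased cat")
    print(doc_terms)
    # Terms hash and compare by stem, so "chased"/"chasing" and "cat"/"Cats" overlap.
    print(set(query_terms) & set(doc_terms))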