-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnltk_utils.py
37 lines (27 loc) · 955 Bytes
/
nltk_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import numpy as np
import nltk
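# One-time setup: uncomment and run once to download the "punkt" tokenizer
# data that nltk.word_tokenize relies on (NOTE(review): newer NLTK releases
# may ask for "punkt_tab" instead - confirm against the installed version).
# nltk.download("punkt")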
from nltk.stem.porter import PorterStemmer
# Single shared Porter stemmer instance; stem() below reuses it on every call.
stemmer = PorterStemmer()
def tokenize(sentence):
    """Split *sentence* into tokens using NLTK's word tokenizer.

    Args:
        sentence: The text to tokenize.

    Returns:
        The result of ``nltk.word_tokenize(sentence)``, unmodified.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens
def stem(word):
    """Return the Porter stem of *word*.

    The word is lower-cased before it is handed to the module-level
    ``stemmer``.

    Args:
        word: A single token (string).

    Returns:
        Whatever ``stemmer.stem`` returns for the lower-cased word.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)
def bag_of_words(tokenized_sentence, all_words):
    """Build a binary bag-of-words vector for a tokenized sentence.

    Every token of the sentence is passed through ``stem`` (which also
    lower-cases it). The entries of ``all_words`` are compared as given,
    without stemming, so they should already be stemmed and lower-cased;
    an entry such as ``"I"`` can never match.

    Example: with sentence ``["hello", "how", "are", "you"]`` and vocabulary
    ``["hi", "hello", "I", "you", "bye", "thank", "cool"]`` the result is
    expected to be ``[0, 1, 0, 1, 0, 0, 0]`` (as float32).

    Args:
        tokenized_sentence: Iterable of raw tokens for one sentence.
        all_words: Sequence of vocabulary words; its order defines the
            positions in the returned vector.

    Returns:
        A 1-D ``numpy`` array of dtype ``float32`` and length
        ``len(all_words)``, holding 1.0 at each position whose vocabulary
        word occurs among the stemmed sentence tokens and 0.0 elsewhere.
    """
    # Stem once and keep the result in a set so that each vocabulary lookup
    # below is a constant-time hash probe rather than a scan of the sentence.
    stemmed_tokens = {stem(token) for token in tokenized_sentence}

    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, w in enumerate(all_words):
        if w in stemmed_tokens:
            bag[idx] = 1.0
    return bag
# --- Disabled manual checks, kept for reference ---
# Tokenization example:
# txt = "So I am pursuing Computer Science Engineering."
# print(txt)
# txt = tokenize(txt)
# print(txt)
# Stemming example (three inflections of the same word):
# words = ["understandable", "understanding", "understand"]
# stemmed_words = [stem(w) for w in words]
# print(stemmed_words)
# --- Live demo of bag_of_words ---
# "hello" and "you" appear in both lists, so positions 1 and 3 of the printed
# vector should be 1.0 and the rest 0.0 (assuming the stemmer leaves those
# two words unchanged).
# NOTE(review): these statements sit at module top level, so they execute and
# print whenever this module is imported; consider moving them under an
# `if __name__ == "__main__":` guard if no importer relies on them.
sentence = ["hello", "how", "are", "you"]
words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
bag = bag_of_words(sentence, words)
print(bag)