-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfeature_extraction_text.py
37 lines (27 loc) · 1.22 KB
/
feature_extraction_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# External Imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
# Project Level Imports
import config
# Extract the sentences and their labels from a dataset
def extractTextFeaturesAndLabels(inputDF, utteranceKey, labelKey):
    """Pull the sentence column and the label column out of a dataset.

    Args:
        inputDF: Container indexable by column key whose columns expose a
            ``.values`` attribute (presumably a pandas DataFrame -- confirm
            against callers).
        utteranceKey: Key of the column holding the sentences.
        labelKey: Key of the column holding the labels.

    Returns:
        A ``(utterances, labels)`` tuple holding the ``.values`` of the two
        selected columns, in that order.
    """
    return inputDF[utteranceKey].values, inputDF[labelKey].values
# Change labels into categorical values (returns train labels, test labels and the encoder)
def encodeTextLabels(trainLabels, testLabels):
    """One-hot encode train and test labels with one shared label mapping.

    The encoder is fitted on the training labels only and then reused for the
    test labels, so a given class maps to the same column in both outputs and
    the returned encoder can decode predictions for either split.

    Args:
        trainLabels: Sequence of raw training labels.
        testLabels: Sequence of raw test labels.

    Returns:
        A ``(trainLabels, testLabels, lb)`` tuple: the two one-hot matrices
        (both with one column per class seen in training) and the fitted
        ``LabelEncoder``.

    Raises:
        ValueError: Raised by ``LabelEncoder.transform`` if ``testLabels``
            contains a label that never occurs in ``trainLabels``.
    """
    lb = LabelEncoder()
    # Learn the label -> integer mapping from the training data only.
    trainIds = lb.fit_transform(trainLabels)
    # Re-fitting on the test labels would rebuild the mapping from whatever
    # classes happen to be present there; transform() reuses the train one.
    testIds = lb.transform(testLabels)
    # Pin the width so a split that lacks the highest class id still gets
    # the same number of columns as the other split.
    numClasses = len(lb.classes_)
    trainLabels = np_utils.to_categorical(trainIds, num_classes=numClasses)
    testLabels = np_utils.to_categorical(testIds, num_classes=numClasses)
    return trainLabels, testLabels, lb
# Create the vectorizer object, fitted to the input data
def createVectorizer(lowercase, inputDF):
    """Build a bag-of-words ``CountVectorizer`` fitted to the input data.

    Args:
        lowercase: Passed through to ``CountVectorizer(lowercase=...)``;
            whether text is lower-cased before tokenizing.
        inputDF: The data the vectorizer is fitted on. ``CountVectorizer.fit``
            expects an iterable of raw documents (presumably the sentence
            column rather than a whole DataFrame -- confirm against callers).

    Returns:
        The fitted ``CountVectorizer``.
    """
    # An integer min_df must be >= 1 per the CountVectorizer documentation.
    # min_df=1 keeps exactly the terms the former min_df=0 did, because every
    # vocabulary term occurs in at least one document.
    vectorizer = CountVectorizer(min_df=1, lowercase=lowercase)
    vectorizer.fit(inputDF)
    return vectorizer
# Convert the train and test sentences into vectors
def vectorizeSentences(trainSentences, testSentences, vectorizer):
    """Turn the train and test sentences into vectors.

    Args:
        trainSentences: Training sentences passed to ``vectorizer.transform``.
        testSentences: Test sentences passed to ``vectorizer.transform``.
        vectorizer: Object exposing ``transform``; it is only applied here,
            never fitted.

    Returns:
        A ``(X_train, X_test)`` tuple with the transformed train and test
        sentences, in that order.
    """
    toVector = vectorizer.transform
    return toVector(trainSentences), toVector(testSentences)