import os
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Embedding, SpatialDropout1D, Dropout
from keras.layers import Bidirectional, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Path to pre-trained word vectors
EMBEDDING = 'data/glove.840B.300d.txt'
# EMBEDDING = 'data/crawl-300d-2M.vec'
# Paths to the training data, test data, and sample submission
TRAIN_DATA = 'train.csv'
TEST_DATA = 'test.csv'
SAMPLE_SUB = 'sample_submission.csv'
# Size of word vector, given by our pre-trained vectors
embed_size = 300
# Maximum number of unique words to use (i.e. the number of rows in the embedding matrix)
max_features = 150000
# Max number of words in a comment to use
maxlen = 200
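# For example (illustrative values, not used below): pad_sequences([[3, 7]],
# maxlen=5) returns [[0, 0, 0, 3, 7]]; zeros are pre-padded and sequences
# longer than maxlen are pre-truncated by default.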
# Load data into pandas
train = pd.read_csv(TRAIN_DATA, usecols=["id", "comment_text", "toxic"])
test = pd.read_csv(TEST_DATA)
submission = pd.read_csv(SAMPLE_SUB, usecols=["id", "toxic"])
# Replace missing values in training and test set
list_train = train["comment_text"].fillna("_na_").values
classes = ["toxic"]
y = train[classes].values
list_test = test["comment_text"].fillna("_na_").values
def handle_text():
    """
    Tokenize the comments in list_train and list_test (numpy arrays of
    strings), pad them to a fixed length, and build an embedding matrix
    from the pre-trained vectors at EMBEDDING. Returns the padded train
    and test matrices along with the embedding matrix.
    """
    # Use Keras preprocessing tools
    tok = Tokenizer(num_words=max_features)
    tok.fit_on_texts(list(list_train))
    tokenized_train = tok.texts_to_sequences(list_train)
    tokenized_test = tok.texts_to_sequences(list_test)
    # Pad with 0s for comments shorter than maxlen (longer ones are truncated)
    X_t = pad_sequences(tokenized_train, maxlen=maxlen)
    X_te = pad_sequences(tokenized_test, maxlen=maxlen)
    # Read word vectors into a dictionary; rsplit keeps multi-token words
    # intact, and the length check skips fastText-style header lines
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ', embed_size))
                            for o in open(EMBEDDING, encoding='utf8')
                            if len(o.rstrip().rsplit(' ', embed_size)) == embed_size + 1)
    # Create the embedding matrix; row 0 is reserved for padding, and words
    # without a pre-trained vector keep all-zero rows
    word_index = tok.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return X_t, X_te, embedding_matrix
X_t, X_te, embedding_matrix = handle_text()
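# Optional sanity check on the shapes produced above: the padded matrices are
# (n_samples, maxlen) and each embedding row is embed_size wide
assert X_t.shape == (len(list_train), maxlen)
assert X_te.shape == (len(list_test), maxlen)
assert embedding_matrix.shape[1] == embed_size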
def get_model(filename):
    """
    Return a compiled model and the callbacks to pass to "fit". The
    callbacks stop training early and save the best model (lowest
    validation loss) to the filename provided.
    """
    # Bidirectional LSTM-CNN with average/max-pooling and 2 FC layers
    inp = Input(shape=(maxlen,))
    x = Embedding(embedding_matrix.shape[0], embed_size,
                  weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform", activation="relu")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    es = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=0,
                                 save_best_only=True, mode='auto')
    return model, [es, checkpoint]
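# Example usage (a sketch; the checkpoint path here is hypothetical):
#   model, callbacks = get_model('models/tmp.h5')
#   model.summary()
# On a CUDA GPU, the Bidirectional(LSTM(...)) layer above could be swapped
# for the imported CuDNNLSTM for a large speed-up; CuDNNLSTM accepts no
# dropout/recurrent_dropout arguments, so the line would become:
#   x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)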
def next_filename():
    """Return the next unused checkpoint filename under models/."""
    file_nums = [int(f.replace('model', '').replace('.h5', ''))
                 for f in os.listdir("models")]
    # max(..., default=0) keeps this working when models/ is empty
    return 'models/model' + str(max(file_nums, default=0) + 1) + '.h5'
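# For example, if models/ already holds model1.h5 and model2.h5, the call
# below returns 'models/model3.h5'; an empty models/ gives 'models/model1.h5'.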
best_model = next_filename()
model, callbacks = get_model(best_model)
# Fit the model
model.fit(X_t, y, batch_size=1024, epochs=30, callbacks=callbacks, validation_split=0.1)
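# Keras draws validation_split from the last 10% of X_t (taken before any
# shuffling), and the callbacks above monitor its val_loss.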
# Load the best checkpoint and predict on the test set (the model has a
# single sigmoid output, so only the "toxic" column is filled in)
# model = load_model(best_model)
# print('**Predicting on test set**')
# pred = model.predict(X_te, batch_size=1024, verbose=1)
# submission["toxic"] = pred
# submission.to_csv('preds/submission15.csv', index=False)