sentiment_model.py
from keras.models import Sequential
from keras.optimizers import Adam, RMSprop
from keras.layers import Dense, Dropout, PReLU
from keras.layers import Embedding, GlobalAveragePooling1D
from keras.utils import to_categorical
import pickle, gc
from tweets import load_dataset
from utils import load_targets, efficient_candle_load
import numpy as np
import pandas as pd
from pathlib import Path


def load_train_test(train_prc=0.7, window='10min'):
    """Load tweet features and price-move targets, align them on timestamps,
    and split them chronologically into train and test sets."""
    tweets = load_dataset(window=window)
    gc.collect()
    targets = load_targets(window=window, threshold=.004)
    gc.collect()
    tweets.index = pd.to_datetime(tweets.index)
    # Keep only timestamps present in both the tweet features and the targets.
    tweets, targets = tweets.align(targets, join='inner', axis=0)
    # tweets = np.asarray([i[0] for i in tweets.values])
    indexes = targets.index
    # Targets take three classes; one-hot encode them for categorical crossentropy.
    targets = to_categorical(targets.values, num_classes=3)
    num_samples = indexes.shape[0]
    num_train = int(num_samples * train_prc)
    # Chronological split: the first train_prc of samples for training, the rest for testing.
    train_features = tweets[:num_train]
    test_features = tweets[num_train:]
    train_labels = targets[:num_train]
    test_labels = targets[num_train:]
    train_idx = indexes[:num_train]
    test_idx = indexes[num_train:]
    return train_features, train_labels, train_idx, test_features, test_labels, test_idx

if __name__ == '__main__':
    # Recover the vocabulary size from a previously fitted tokenizer, if one was saved.
    tokenizer = None
    num_words = 10000
    if Path('tokenizer').exists():
        with open('tokenizer', 'rb') as fp:
            tokenizer = pickle.load(fp)
        num_words = tokenizer.num_words
        del tokenizer
        gc.collect()

    train_features, train_labels, train_idx, test_features, test_labels, test_idx = load_train_test(window='30min')
    # Class balance of the train and test splits.
    print(np.unique(train_labels.argmax(axis=-1), return_counts=True))
    print(np.unique(test_labels.argmax(axis=-1), return_counts=True))

    # Bag-of-tokens style classifier: embed each token, average the embeddings,
    # then classify with a small dense head. The vocabulary size comes from the
    # saved tokenizer when available, falling back to 10000.
    model = Sequential()
    model.add(Embedding(num_words, 200, input_length=10000))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(64))
    model.add(PReLU())
    model.add(Dense(3, activation='softmax'))

    optimizer = RMSprop()
    model.compile(loss='categorical_crossentropy',
                  # optimizer=Adam(lr=0.001),
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(train_features, train_labels,
              batch_size=16,
              epochs=100,
              validation_data=(test_features, test_labels), verbose=2)

    # Distribution of predicted classes on both splits.
    train_preds = model.predict(train_features)
    test_preds = model.predict(test_features)
    print(np.unique(train_preds.argmax(axis=-1), return_counts=True))
    print(np.unique(test_preds.argmax(axis=-1), return_counts=True))
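
    # A possible follow-up, sketched here as an assumption rather than part of the
    # original script: compare predicted classes against the true test labels per class.
    # This assumes scikit-learn is available in the environment.
    from sklearn.metrics import classification_report
    print(classification_report(test_labels.argmax(axis=-1),
                                test_preds.argmax(axis=-1)))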