import re
import sklearn
import nltk
import pandas as pd
import numpy as np
import xgboost as xgb
import spacy
import datetime
from sklearn import metrics, model_selection
import utility
import variation
import feature
from utility import *
from variation import Variation, Variations
from feature import *
from model import Model
#######################################
#
# Preprocessing phase
#
# we will do preprocessing here.
# 1. get training data and testing data
# 2. text preprocessing
# 3. variation preprocessing
#
#######################################
# get training data for stage 2
train, y = get_stage2_train_data(preprocessed=False)
# make labels start from 0
y = y - 1
test_all = get_stage2_test_data()
# consider only the 150 real cases (those not seen in stage 1)
test = test_all[(test_all['real']==True)&(test_all['in_stage1']==False)].copy()
try:
    # load the spacy English model; this one performs better than the default model
    nlp = spacy.load('en_core_web_md')
except Exception:
    nlp = spacy.load('en')
if not nlp.parser:
    print('We need spacy parsers, please install spacy models first')
    print('Recommended model: en_core_web_md')
    exit(1)
# parse the variation, get information about
# site and amino acid
train['var'] = train.apply(lambda r: Variation(r['Variation']), axis=1)
test['var'] = test.apply(lambda r: Variation(r['Variation']), axis=1)
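# Illustration only: the Variation class used above is defined in variation.py.
# Below is a minimal, hypothetical sketch of how a point substitution such as
# 'A123T' could be parsed into (reference amino acid, site, alternate amino acid);
# the real class handles many more mutation types and uses its own field names.
def _sketch_parse_point_variation(text):
    """Return (ref_aa, site, alt_aa) for strings like 'A123T', else None."""
    m = re.match(r'^([A-Za-z])(\d+)([A-Za-z*])$', text.strip())
    if m is None:
        return None
    return m.group(1).upper(), int(m.group(2)), m.group(3).upper()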
# this will do text cleaning and find the sentences that contain the variation
get_text_features(train, nlp)
get_text_features(test, nlp)
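# Illustration only: get_text_features is implemented in feature.py. Below is a
# rough, hypothetical sketch of its core idea: keep the sentences that mention
# the variation and join them with ' . ' (the separator assumed later when
# Var_sent is split); the real function also performs text cleaning.
def _sketch_var_sentences(text, variation, nlp):
    doc = nlp(text)
    hits = [sent.text for sent in doc.sents if variation.lower() in sent.text.lower()]
    return ' . '.join(hits)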
# concat train and test, we will process them together
df_all = pd.concat((train, test), axis=0).drop(['Unnamed: 0', 'in_stage1', 'real', 'real_pred'], axis=1)
# this dataframe will be used to calculate features based on former observations
df_ans = pd.concat((train, y), axis=1)
#######################################
#
# Feature extracting phase
#
# we will do feature extraction here.
# 1. get all features (explanations
#    and extraction steps can be found
#    in feature.py)
#
#######################################
def get_dist_count(df):
    """
    Given vectors of class counts, transform them into distributions
    and counts, with extra features for the class pairs 1/4 and 2/7.
    """
    df_dist = df.apply(lambda r: r/sum(r) if sum(r) > 0 else r, axis=1)
    df_count = pd.DataFrame()
    df_count['count'] = df.apply(lambda r: sum(r), axis=1)
    # we can't really distinguish class 1/4 and 2/7,
    # maybe this will help to separate them
    df_count['count_14'] = df_dist.apply(lambda r: r[0] + r[3], axis=1)
    df_count['count_27'] = df_dist.apply(lambda r: r[1] + r[6], axis=1)
    return df_dist, df_count
# is gene in text + is variation in text
df_share = get_share(df_all)
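# Illustration only: get_share is implemented in feature.py. A hypothetical
# sketch of the two indicator features described above (whether the gene and
# the variation string appear in the row's text); the 'Gene' and 'Text'
# column names are assumptions.
def _sketch_share(df):
    out = pd.DataFrame(index=df.index)
    out['gene_in_text'] = df.apply(
        lambda r: int(r['Gene'].lower() in r['Text'].lower()), axis=1)
    out['var_in_text'] = df.apply(
        lambda r: int(r['Variation'].lower() in r['Text'].lower()), axis=1)
    return out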
# encode variations by heuristic, extract features like
# amino acid, site, mutation type
df_dummy = get_var_dummy(df_all)
# get pre-trained Doc2Vec vectors
df_d2v_kuo = get_d2v_kuo()
# get neighbor_class feature, this is based on former observations
df_nc = get_neighbor_classes(df_all, df_ans)
df_nc_dist, df_nc_count = get_dist_count(df_nc)
# get point substitution type distribution
df_hc2 = get_hotspot_count(df_all, df_ans)
df_hc2_dist, df_hc2_count = get_dist_count(df_hc2)
# get text class distribution
df_tc = get_text_classes(df_all, df_ans)
df_tc_dist, df_tc_count = get_dist_count(df_tc)
# get tfidf features: either train a new model or
# transform with an existing one
df_tfidf = get_tfidf(df_all, model_file='models/tfidf_all.pkl')
df_var_tfidf = get_var_tfidf(df_all, model_file='models/tfidf_var_all.pkl')
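# Illustration only: get_tfidf / get_var_tfidf live in feature.py. A minimal
# hypothetical sketch of the "load an existing model or fit a new one" pattern
# mentioned above, using scikit-learn and joblib; the vectorizer settings are
# placeholders, not the ones used by the real functions.
def _sketch_tfidf(texts, model_file):
    import os
    import joblib
    from sklearn.feature_extraction.text import TfidfVectorizer
    if os.path.exists(model_file):
        vec = joblib.load(model_file)
    else:
        vec = TfidfVectorizer(max_features=2000, stop_words='english')
        vec.fit(texts)
        joblib.dump(vec, model_file)
    return vec.transform(texts)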
# number of sentences that contain the variation
var_len = df_all.apply(lambda r: len(r['Var_sent'].split(' . ')), axis=1).values.reshape(-1, 1)
# get keyword frequencies
df_kf_kuo = get_keyfreq_kuo()
# some important dependency bigrams
dep_bigram = {'not show', 'not affect', 'not clear', 'not significant',
              'activity increase', 'significantly reduce'}
def get_dep_bigram_freq(row):
    """
    Count dependency bigrams in Var_sent and return their frequencies.
    """
    split = row['Var_sent'].split()
    bigram_count = {key: 0 for key in dep_bigram}
    if len(split) == 0:
        return bigram_count
    if row['ID'] % 500 == 0:
        print(row['ID'], end='\r')
    doc = nlp(row['Var_sent'], entity=False)
    for token in doc:
        if token.pos_ == 'PUNCT' or token.pos_ == 'DET':
            continue
        if token.lemma_ == token.head.lemma_:
            continue
        bigram = token.lemma_ + ' ' + token.head.lemma_
        if bigram in dep_bigram:
            bigram_count[bigram] = bigram_count[bigram] + 1
    bigram_freq = {key: bigram_count[key]/len(split) for key in bigram_count}
    return bigram_freq
# get dependency bigram frequency
df_dep_bigram_freq = df_all.apply(lambda r: pd.Series(get_dep_bigram_freq(r)), axis=1)
# get pre-trained word vectors from http://bio.nlplab.org/ for each gene;
# we have dumped only the needed vectors since the full model is too large
df_w2v_gene = get_w2v_gene()
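# Illustration only: get_w2v_gene just reads the pre-dumped vectors. A
# hypothetical sketch of the one-off dump step described above: look up each
# gene in the large bio.nlplab.org word2vec model with gensim and save only
# those rows, so the multi-gigabyte model is never loaded at training time.
# (The paths and the vector dimension are assumptions.)
def _sketch_dump_gene_vectors(genes, w2v_path, out_csv, dim=200):
    from gensim.models import KeyedVectors
    kv = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    rows = {g: (kv[g] if g in kv else np.zeros(dim)) for g in set(genes)}
    pd.DataFrame(rows).T.to_csv(out_csv)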
# get word vectors trained on training text
df_w2v_alex = get_w2v_alex(dim=100)
#######################################
#
# Training phase
#
# we will train 4 xgb models here.
# 1. combine features needed by each model
# 2. do cross validation, get the
# average prediction of each fold
#
#######################################
fold = 5
repeat = 3
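# Illustration only: the real Model class lives in model.py. Below is a
# minimal sketch of the cv() interface assumed by the calls that follow:
# repeated stratified K-fold training of an XGBoost classifier that returns
# one test-set probability matrix per fold, so the caller can average them.
# The hyper-parameters here are placeholders, not the project's settings.
class _SketchModel:
    def cv(self, X, y, X_test, fold=5, repeat=3):
        preds = []
        for seed in range(repeat):
            skf = model_selection.StratifiedKFold(
                n_splits=fold, shuffle=True, random_state=seed)
            for tr_idx, va_idx in skf.split(X, y):
                clf = xgb.XGBClassifier(objective='multi:softprob',
                                        n_estimators=300, max_depth=5,
                                        random_state=seed)
                clf.fit(X[tr_idx], np.asarray(y)[tr_idx],
                        eval_set=[(X[va_idx], np.asarray(y)[va_idx])],
                        verbose=False)
                preds.append(clf.predict_proba(X_test))
        return preds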
############### Model 1 ###############
feat_all = np.hstack([df_w2v_gene, df_dummy, df_hc2_dist, df_hc2_count,
                      df_tc_dist, df_tc_count, df_tfidf, var_len,
                      df_kf_kuo, df_dep_bigram_freq])
print(feat_all.shape)
feat_train = feat_all[:len(train)]
feat_test = feat_all[len(train):]
model_1 = Model()
preds_1 = model_1.cv(
    feat_train, y, feat_test,
    fold=fold, repeat=repeat
)
preds_1 = np.mean(preds_1, axis=0)
############### Model 1 ###############
############### Model 2 ###############
feat_all = np.hstack([df_w2v_gene, df_dummy, df_hc2_dist, df_hc2_count,
                      df_tfidf, var_len, df_kf_kuo, df_dep_bigram_freq])
print(feat_all.shape)
feat_train = feat_all[:len(train)]
feat_test = feat_all[len(train):]
model_2 = Model()
preds_2 = model_2.cv(
    feat_train, y, feat_test,
    fold=fold, repeat=repeat
)
preds_2 = np.mean(preds_2, axis=0)
############### Model 2 ###############
############### Model 3 ###############
feat_all = np.hstack([df_w2v_gene, df_w2v_alex, df_dummy,
                      df_hc2_dist, df_hc2_count, df_tfidf,
                      df_nc_dist, df_nc_count, df_kf_kuo])
print(feat_all.shape)
feat_train = feat_all[:len(train)]
feat_test = feat_all[len(train):]
model_3 = Model()
preds_3 = model_3.cv(
    feat_train, y, feat_test,
    fold=fold, repeat=repeat
)
preds_3 = np.mean(preds_3, axis=0)
############### Model 3 ###############
############### Model 4 ###############
feat_all = np.hstack([df_w2v_gene, df_w2v_alex,
                      df_hc2_dist, df_hc2_count, df_tfidf,
                      df_nc_dist, df_nc_count, df_kf_kuo])
print(feat_all.shape)
# we only use point substitutions for training this model
point_idx = train.apply(lambda r: r['var'].type == 'point', axis=1)
feat_train = feat_all[:len(train)]
feat_train = feat_train[point_idx]
y2 = y[point_idx]
feat_test = feat_all[len(train):]
model_4 = Model()
preds_4 = model_4.cv(
    feat_train, y2, feat_test,
    fold=fold, repeat=repeat
)
preds_4 = np.mean(preds_4, axis=0)
############### Model 4 ###############
#######################################
#
# Ensembling phase
#
# we will ensemble our models here.
# 1. do model switching
# 2. do model averaging
# 3. write submission files
#
#######################################
# do model switching
# if "text class distribution" feature is present: Model 1
# else if not present: Model 2
pred_switch = []
# df_tc_count covers both train and test rows, so keep only the test portion
test_tc_count = np.array(df_tc_count['count'])[len(train):]
for pred_tc, pred_notc, count in zip(preds_1, preds_2, test_tc_count):
    if count == 0:
        pred_switch.append(pred_notc)
    else:
        pred_switch.append(pred_tc)
# do model averaging
# simply take means of predictions
pred_average = np.mean([preds_1, preds_2, preds_3, preds_4], axis=0)
# write submission file
# the submission using model switching is placed at 'results/switching.csv'
# the submission using model averaging is placed at 'results/averaging.csv'
write_submit_file('results/switching.csv', pred_switch)
write_submit_file('results/averaging.csv', pred_average)
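# Illustration only: write_submit_file is defined in utility.py. A hypothetical
# sketch of the kind of file it produces, assuming the usual nine-column class
# probability format indexed by the test IDs; the column names and the source
# of the IDs are assumptions.
def _sketch_write_submit_file(path, preds, ids):
    cols = ['class' + str(i) for i in range(1, 10)]
    sub = pd.DataFrame(np.asarray(preds), columns=cols)
    sub.insert(0, 'ID', list(ids))
    sub.to_csv(path, index=False)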