-
Notifications
You must be signed in to change notification settings - Fork 52
/
prepare-svd.py
39 lines (23 loc) · 1.04 KB
/
prepare-svd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import numpy as np
from util import Dataset, vstack, hstack
from sklearn.preprocessing import scale
from sklearn.decomposition import TruncatedSVD
# Build truncated-SVD features from the combined train+test feature matrix
# and save them as the 'svd' part of each dataset.
#
# The print calls use the single-argument parenthesized form and the loop
# uses range(), so the script runs unchanged on both Python 2 and Python 3.

n_components = 500  # 500 components explain 99.8% of variance

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric')
train_cat = Dataset.load_part('train', 'categorical_dummy')

test_num = Dataset.load_part('test', 'numeric')
test_cat = Dataset.load_part('test', 'categorical_dummy')

# Row count of the train part; used below to split the stacked SVD output
# back into train and test.
train_cnt = train_num.shape[0]

print("Combining data...")

# Numeric features: train stacked on top of test, scaled as float64 and then
# stored as float32 (presumably to keep precision while scaling but save
# memory afterwards -- TODO confirm).
all_num = scale(vstack((train_num, test_num)).astype(np.float64)).astype(np.float32)

# Categorical dummy features: stacked in the same train-then-test order and
# left unscaled.
all_cat = vstack((train_cat, test_cat))

# Final matrix: numeric columns first, categorical dummy columns after.
all_data = hstack((all_num, all_cat))

# Drop every intermediate so only the combined matrix stays referenced
# while the SVD is fitted.
del train_num, train_cat, test_num, test_cat, all_num, all_cat

print("Fitting svd...")

# NOTE(review): no random_state is passed; with scikit-learn's default
# randomized solver the components may differ slightly between runs --
# consider fixing a seed if reproducible features are required.
svd = TruncatedSVD(n_components)
res = svd.fit_transform(all_data)

print("Explained variance ratio: %.5f" % np.sum(svd.explained_variance_ratio_))

print("Saving...")

# One feature name per SVD component: svd0, svd1, ...
Dataset.save_part_features('svd', ['svd%d' % i for i in range(n_components)])

# The first train_cnt rows of the result belong to train, the rest to test.
Dataset(svd=res[:train_cnt]).save('train')
Dataset(svd=res[train_cnt:]).save('test')

print("Done.")