forked from umesh-inteliment/Natural-Language-Generation
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpreprocess.py
64 lines (45 loc) · 1.45 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""Build a pickled ``(features, labels)`` dataset from the weather corpus.

The event records in ``EVENTS_DATASET_PATH`` and the text descriptions in
``DESC_LABELS_PATH`` are paired line by line.  Every event line that has
exactly ``EXPECTED_ATTR_COUNT`` attributes and parses cleanly is encoded as
an integer vector; the resulting ``(x, y)`` tuple is written to
``OUTPUT_PATH`` with :mod:`pickle`.
"""
import logging
import pickle

import numpy as np

EVENTS_DATASET_PATH = './weather/all.events'
DESC_LABELS_PATH = './weather/all.text'
OUTPUT_PATH = 'dataset.pkl'

# Event lines with any other number of whitespace-separated attributes
# are ignored.
EXPECTED_ATTR_COUNT = 83

# Integer stored in place of a missing ('--' or empty) value.
MISSING_VALUE = -999

# Integer codes for the values of ``*.mode`` attributes.
ordinal_mappings = {
    'N': 1, 'S': 2, 'E': 3, 'W': 4,
    'NE': 5, 'SE': 6, 'SW': 7, 'NW': 8,
    'NNE': 9, 'NNW': 10, 'SSE': 11, 'ENE': 12,
    'WNW': 13, 'SSW': 14, 'ESE': 15, 'WSW': 16,
    'SChc': 100, 'Chc': 200, 'Lkly': 300, 'Def': 400,
    '--': MISSING_VALUE, '': MISSING_VALUE,
}

logger = logging.getLogger(__name__)


def encode_attribute(attribute):
    """Encode one ``name.field:value`` attribute as a list of integers.

    * a ``mode`` field is looked up in ``ordinal_mappings``;
    * the value ``--`` becomes ``[MISSING_VALUE]``;
    * a value ``a-b`` becomes ``[a, b]``;
    * any other value is parsed as a single integer.

    Raises:
        IndexError: the attribute has no ``:`` or its name has no ``.``.
        KeyError: a ``mode`` value is not a key of ``ordinal_mappings``.
        ValueError: a numeric part cannot be parsed by ``int``.
    """
    pieces = attribute.split(":")
    field = pieces[0].split(".")[1]
    value = pieces[1]

    if field == 'mode':
        return [ordinal_mappings[value]]
    # This test has to precede the range test: '--' contains '-', so it
    # would otherwise be split into empty strings that int() rejects.
    if value == '--':
        return [MISSING_VALUE]
    if '-' in value:
        # NOTE(review): a negative number such as '-5' also lands here and
        # raises ValueError, so its record is skipped -- confirm whether
        # negative values occur in the corpus and should be supported.
        bounds = value.split("-")
        return [int(bounds[0]), int(bounds[1])]
    return [int(value)]


def encode_line(line):
    """Encode one event line as a NumPy integer vector.

    Returns ``None`` when the line does not contain exactly
    ``EXPECTED_ATTR_COUNT`` whitespace-separated attributes.  Exceptions
    raised by ``encode_attribute`` propagate to the caller.
    """
    attributes = line.split()
    if len(attributes) != EXPECTED_ATTR_COUNT:
        return None
    vec = []
    for attribute in attributes:
        vec.extend(encode_attribute(attribute))
    return np.array(vec)


def build_dataset(lines, labels):
    """Pair event lines with labels and encode every usable record.

    ``lines`` and ``labels`` are consumed in lockstep (the longer one is
    truncated).  Records with the wrong attribute count are ignored;
    records that fail to parse are skipped and counted, and the count is
    reported once through ``logging``.

    Returns:
        A tuple ``(x, y)``: ``x`` is a list of NumPy vectors and ``y`` the
        list of the corresponding label strings.
    """
    x = []  # features
    y = []  # labels
    skipped = 0
    for line, label in zip(lines, labels):
        try:
            vec = encode_line(line)
        except (IndexError, KeyError, ValueError):
            # Best-effort preprocessing: drop the malformed record but keep
            # track of it instead of hiding the problem completely.
            skipped += 1
            continue
        if vec is not None:
            x.append(vec)
            y.append(label)
    if skipped:
        logger.warning("Skipped %d malformed event line(s)", skipped)
    return x, y


def _read_lines(path):
    """Return the content of ``path`` split on newline characters."""
    with open(path, 'r') as handle:
        return handle.read().split('\n')


def main():
    """Read both corpus files, encode them and write ``OUTPUT_PATH``."""
    lines = _read_lines(EVENTS_DATASET_PATH)
    labels = _read_lines(DESC_LABELS_PATH)
    x, y = build_dataset(lines, labels)
    with open(OUTPUT_PATH, 'wb') as out:
        pickle.dump((x, y), out)


if __name__ == '__main__':
    main()