-
Notifications
You must be signed in to change notification settings - Fork 98
/
preprocess_elect.py
135 lines (124 loc) · 5.4 KB
/
preprocess_elect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from datetime import datetime, timedelta
import pandas as pd
import math
import numpy as np
import random
from tqdm import trange
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
from math import sqrt
from pandas import read_csv, DataFrame
from scipy import stats
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def prep_data(data, covariates, data_start, train = True):
#print("train: ", train)
time_len = data.shape[0]
#print("time_len: ", time_len)
input_size = window_size-stride_size
windows_per_series = np.full((num_series), (time_len-input_size) // stride_size)
#print("windows pre: ", windows_per_series.shape)
if train: windows_per_series -= (data_start+stride_size-1) // stride_size
#print("data_start: ", data_start.shape)
#print(data_start)
#print("windows: ", windows_per_series.shape)
#print(windows_per_series)
total_windows = np.sum(windows_per_series)
x_input = np.zeros((total_windows, window_size, 1 + num_covariates + 1), dtype='float32')
label = np.zeros((total_windows, window_size), dtype='float32')
v_input = np.zeros((total_windows, 2), dtype='float32')
#cov = 3: ground truth + age + day_of_week + hour_of_day + num_series
#cov = 4: ground truth + age + day_of_week + hour_of_day + month_of_year + num_series
count = 0
if not train:
covariates = covariates[-time_len:]
for series in trange(num_series):
cov_age = stats.zscore(np.arange(total_time-data_start[series]))
if train:
covariates[data_start[series]:time_len, 0] = cov_age[:time_len-data_start[series]]
else:
covariates[:, 0] = cov_age[-time_len:]
for i in range(windows_per_series[series]):
if train:
window_start = stride_size*i+data_start[series]
else:
window_start = stride_size*i
window_end = window_start+window_size
'''
print("x: ", x_input[count, 1:, 0].shape)
print("window start: ", window_start)
print("window end: ", window_end)
print("data: ", data.shape)
print("d: ", data[window_start:window_end-1, series].shape)
'''
x_input[count, 1:, 0] = data[window_start:window_end-1, series]
x_input[count, :, 1:1+num_covariates] = covariates[window_start:window_end, :]
x_input[count, :, -1] = series
label[count, :] = data[window_start:window_end, series]
nonzero_sum = (x_input[count, 1:input_size, 0]!=0).sum()
if nonzero_sum == 0:
v_input[count, 0] = 0
else:
v_input[count, 0] = np.true_divide(x_input[count, 1:input_size, 0].sum(),nonzero_sum)+1
x_input[count, :, 0] = x_input[count, :, 0]/v_input[count, 0]
if train:
label[count, :] = label[count, :]/v_input[count, 0]
count += 1
prefix = os.path.join(save_path, 'train_' if train else 'test_')
np.save(prefix+'data_'+save_name, x_input)
np.save(prefix+'v_'+save_name, v_input)
np.save(prefix+'label_'+save_name, label)
def gen_covariates(times, num_covariates):
covariates = np.zeros((times.shape[0], num_covariates))
for i, input_time in enumerate(times):
covariates[i, 1] = input_time.weekday()
covariates[i, 2] = input_time.hour
covariates[i, 3] = input_time.month
for i in range(1,num_covariates):
covariates[:,i] = stats.zscore(covariates[:,i])
return covariates[:, :num_covariates]
def visualize(data, week_start):
x = np.arange(window_size)
f = plt.figure()
plt.plot(x, data[week_start:week_start+window_size], color='b')
f.savefig("visual.png")
plt.close()
if __name__ == '__main__':
global save_path
name = 'LD2011_2014.txt'
save_name = 'elect'
window_size = 192
stride_size = 24
num_covariates = 4
train_start = '2011-01-01 00:00:00'
train_end = '2014-08-31 23:00:00'
test_start = '2014-08-25 00:00:00' #need additional 7 days as given info
test_end = '2014-09-07 23:00:00'
pred_days = 7
given_days = 7
save_path = os.path.join('data', save_name)
if not os.path.exists(save_path):
os.makedirs(save_path)
csv_path = os.path.join(save_path, name)
if not os.path.exists(csv_path):
zipurl = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip'
with urlopen(zipurl) as zipresp:
with ZipFile(BytesIO(zipresp.read())) as zfile:
zfile.extractall(save_path)
data_frame = pd.read_csv(csv_path, sep=";", index_col=0, parse_dates=True, decimal=',')
data_frame = data_frame.resample('1H',label = 'left',closed = 'right').sum()[train_start:test_end]
data_frame.fillna(0, inplace=True)
covariates = gen_covariates(data_frame[train_start:test_end].index, num_covariates)
train_data = data_frame[train_start:train_end].values
test_data = data_frame[test_start:test_end].values
data_start = (train_data!=0).argmax(axis=0) #find first nonzero value in each time series
total_time = data_frame.shape[0] #32304
num_series = data_frame.shape[1] #370
prep_data(train_data, covariates, data_start)
prep_data(test_data, covariates, data_start, train=False)