-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclean_turbine.py
185 lines (151 loc) · 6.67 KB
/
clean_turbine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# -*- coding: utf-8 -*-
def clean_turbine(data_raw, wtn, ws_cut_in, ws_rated, ws_cut_out,
                  k_up=1.5, k_low=1.5, anomalous=False):
    """
    Clean wind turbine data.

    Out-of-range values and erroneous wind speed / power combinations are
    replaced by NaN. Suspicious (anomalous) combinations are counted in a
    per-turbine flag column and, unless ``anomalous`` is true, the
    corresponding power values are replaced by NaN as well.

    Parameters
    ----------
    data_raw : pandas.DataFrame
        Raw data to be cleaned. Must contain the columns
        ``windSpeed_wtNN`` and ``power_wtNN`` for NN = 01 .. ``wtn``.
    wtn : int
        Number of wind turbines.
    ws_cut_in : float
        Cut-in wind speed in m/s.
    ws_rated : float
        Rated wind speed in m/s.
    ws_cut_out : float
        Cut-out wind speed in m/s.
    k_up : float, default=1.5
        Multiplier of IQR to define upper threshold for outlier detection.
    k_low : float, default=1.5
        Multiplier of IQR to define lower threshold for outlier detection.
    anomalous : bool, default False
        If True, anomalous (flagged) periods are kept in the data set.

    Returns
    -------
    data : pandas.DataFrame
        Cleaned copy of ``data_raw`` with the additional columns
        ``windSpeed_avg``, ``windSpeed_std``, ``power_avg``, ``flag_wtNN``
        and ``power_wtNN_tmp``.

    Notes
    -----
    Wind speed values are in m/s.
    Power values are normalised by rated power.
    The three statistics columns (``windSpeed_avg``, ``windSpeed_std``,
    ``power_avg``) are also written to ``data_raw`` itself, and they are
    computed from the raw (uncleaned) values.
    A summary of missing values is printed to standard output.
    """
    turbines = range(1, wtn + 1)
    windSpeed_cols = [f"windSpeed_wt{wt:02d}" for wt in turbines]
    power_cols = [f"power_wt{wt:02d}" for wt in turbines]
    flag_cols = [f"flag_wt{wt:02d}" for wt in turbines]
    # "Temporary" columns holding partially-clean power values; they serve as
    # the reference population for the IQR thresholds in steps 5 and 6.
    power_cols_tmp = [f"power_wt{wt:02d}_tmp" for wt in turbines]

    # - - - Calculate wind farm statistics - - -
    # skipna=False: a missing value at any turbine makes the statistic NaN.
    data_raw['windSpeed_avg'] = data_raw[windSpeed_cols].mean(axis=1, skipna=False)
    data_raw['windSpeed_std'] = data_raw[windSpeed_cols].std(axis=1, skipna=False)
    data_raw['power_avg'] = data_raw[power_cols].mean(axis=1, skipna=False)

    # = = = = = DATA CLEANING = = = = =
    data = data_raw.copy()

    # = = = = = COMPLETENESS of data = = = = =
    print("\n######################### MISSING DATA ######################### \n")
    print("- - - - - Wind speed missing values - - - - -")
    _print_missing(data, windSpeed_cols)
    print("\n - - - - - Power missing values - - - - -")
    _print_missing(data, power_cols)
    print("\n###############################################################")

    # = = = = = VALIDITY of data = = = = =
    # - - - Univariate extreme values - - -
    # Series.where returns a new Series, so the result has to be assigned
    # back; otherwise the filter has no effect.
    for ws in windSpeed_cols:
        data[ws] = data[ws].where((data[ws] >= 0) & (data[ws] < 40))
    for pw in power_cols:
        data[pw] = data[pw].where((data[pw] >= -0.02) & (data[pw] < 1.01))

    # - - - Bivariate extreme values - - -
    # Rules: remove erroneous data, flag anomalous data.
    # Each turbine gets a flag column starting at 0; every rule that
    # classifies a row as anomalous increments the flag by 1.
    for fl in flag_cols:
        data[fl] = 0

    # 1. Remove instances of high power output for wind speed in [0, ws_cut_in)
    for ws, pw in zip(windSpeed_cols, power_cols):
        data[pw] = data[pw].mask(
            (data[ws] >= 0) & (data[ws] < ws_cut_in) & (data[pw] > 0.04)
        )

    # 2. Remove instances of non-zero power for wind speed >= ws_cut_out + 2
    for ws, pw in zip(windSpeed_cols, power_cols):
        data[pw] = data[pw].mask(
            (data[ws] >= ws_cut_out + 2) & (data[pw] > 0)
        )

    # 3. Flag instances of (near-)zero power output for wind speed in
    #    (ws_cut_in+2, ws_cut_out-2)
    for ws, pw, fl, pw_tmp in zip(windSpeed_cols, power_cols, flag_cols,
                                  power_cols_tmp):
        data[fl] = data[fl].mask(
            (data[ws] > ws_cut_in + 2) & (data[ws] < ws_cut_out - 2) &
            (data[pw] < 0.005),
            data[fl] + 1
        )
        # Reference power excludes the rows flagged so far (step 3 only).
        data[pw_tmp] = data[pw].mask(data[fl] > 0)

    # 4. Flag instances of low power output (<99.5% of P_nom) for wind speed
    #    in (ws_rated+2, ws_cut_out-2)
    for ws, pw, fl in zip(windSpeed_cols, power_cols, flag_cols):
        data[fl] = data[fl].mask(
            (data[ws] > ws_rated + 2) & (data[ws] < ws_cut_out - 2) &
            (data[pw] < 0.995),
            data[fl] + 1
        )

    bin_width = 0.05  # [m/s]

    # 5. Remove instances of high power output for wind speed between
    #    ws_cut_in+0.5 and ws_rated: power above Q3 + k_up*IQR of its bin.
    for ws, pw, pw_tmp in zip(windSpeed_cols, power_cols, power_cols_tmp):
        for in_bin, q25, q75 in _iqr_bins(data[ws], data[pw_tmp],
                                          ws_cut_in + 0.5, ws_rated,
                                          bin_width):
            thresh_up = q75 + k_up * (q75 - q25)
            data[pw] = data[pw].mask(in_bin & (data[pw] > thresh_up))

    # 6. Flag instances of low power output for wind speed between
    #    ws_cut_in+0.5 and ws_rated+2.0: power below Q1 - k_low*IQR of its bin.
    for ws, pw, fl, pw_tmp in zip(windSpeed_cols, power_cols, flag_cols,
                                  power_cols_tmp):
        for in_bin, q25, q75 in _iqr_bins(data[ws], data[pw_tmp],
                                          ws_cut_in + 0.5,
                                          ws_rated + 2.0 + bin_width,
                                          bin_width):
            thresh_low = q25 - k_low * (q75 - q25)
            data[fl] = data[fl].mask(
                in_bin & (data[pw] < thresh_low),
                data[fl] + 1
            )

    # Unless the caller asked to keep them, remove "flagged" periods.
    if not anomalous:
        for pw, fl in zip(power_cols, flag_cols):
            data[pw] = data[pw].mask(data[fl] > 0)
    return data


def _print_missing(data, cols):
    """Print count and percentage of missing values for each column in cols."""
    for wt, col in enumerate(cols, start=1):
        n_missing = data[col].isnull().sum()
        pct = round(n_missing / float(len(data[col])) * 100, 1)
        print(f"Turbine {wt:02d}: {n_missing} ({pct}%)")


def _iqr_bins(wind_speed, reference_power, start, stop, width):
    """Yield (row mask, Q1, Q3) per wind speed bin with valid reference power.

    Bin edges are ``np.arange(start, stop, width)`` and bins are
    right-closed. Bins whose reference power is empty or all-NaN are skipped
    because no quartiles can be computed for them.
    """
    # Imported here because the module has no top-level import section.
    import numpy as np
    import pandas as pd

    # labels=False gives the integer bin id of every row (NaN when the wind
    # speed is missing or outside the edges). Using these ids for the row
    # mask keeps it consistent with the rows the quartiles are computed
    # from; Interval labels are rounded for display and may not be.
    bin_ids = pd.cut(wind_speed, np.arange(start, stop, width), labels=False)
    for bin_id, ref in reference_power.groupby(bin_ids):
        ref = ref.dropna()
        if ref.empty:
            continue
        q25, q75 = np.percentile(ref, [25, 75])
        yield bin_ids == bin_id, q25, q75