forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimpute.py
380 lines (311 loc) · 14.2 KB
/
impute.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
"""Transformers for missing value imputation"""
# Authors: Nicolas Tresegnie <nicolas.tresegnie@gmail.com>
# License: BSD 3 clause
import warnings
import numpy as np
import numpy.ma as ma
from scipy import sparse
from scipy import stats
from .base import BaseEstimator, TransformerMixin
from .utils import check_array
from .utils.sparsefuncs import _get_median
from .utils.validation import check_is_fitted
from .utils.validation import FLOAT_DTYPES
from .externals import six
zip = six.moves.zip
map = six.moves.map
__all__ = [
'SimpleImputer',
]
def _get_mask(X, value_to_mask):
"""Compute the boolean mask X == missing_values."""
if value_to_mask == "NaN" or np.isnan(value_to_mask):
return np.isnan(X)
else:
return X == value_to_mask
def _most_frequent(array, extra_value, n_repeat):
"""Compute the most frequent value in a 1d array extended with
[extra_value] * n_repeat, where extra_value is assumed to be not part
of the array."""
# Compute the most frequent value in array only
if array.size > 0:
mode = stats.mode(array)
most_frequent_value = mode[0][0]
most_frequent_count = mode[1][0]
else:
most_frequent_value = 0
most_frequent_count = 0
# Compare to array + [extra_value] * n_repeat
if most_frequent_count == 0 and n_repeat == 0:
return np.nan
elif most_frequent_count < n_repeat:
return extra_value
elif most_frequent_count > n_repeat:
return most_frequent_value
elif most_frequent_count == n_repeat:
# Ties the breaks. Copy the behaviour of scipy.stats.mode
if most_frequent_value < extra_value:
return most_frequent_value
else:
return extra_value
class SimpleImputer(BaseEstimator, TransformerMixin):
"""Imputation transformer for completing missing values.
Read more in the :ref:`User Guide <impute>`.
Parameters
----------
missing_values : integer or "NaN", optional (default="NaN")
The placeholder for the missing values. All occurrences of
`missing_values` will be imputed. For missing values encoded as np.nan,
use the string value "NaN".
strategy : string, optional (default="mean")
The imputation strategy.
- If "mean", then replace missing values using the mean along
the axis.
- If "median", then replace missing values using the median along
the axis.
- If "most_frequent", then replace missing using the most frequent
value along the axis.
axis : integer, optional (default=0)
The axis along which to impute.
- If `axis=0`, then impute along columns.
- If `axis=1`, then impute along rows.
verbose : integer, optional (default=0)
Controls the verbosity of the imputer.
copy : boolean, optional (default=True)
If True, a copy of X will be created. If False, imputation will
be done in-place whenever possible. Note that, in the following cases,
a new copy will always be made, even if `copy=False`:
- If X is not an array of floating values;
- If X is sparse and `missing_values=0`;
- If `axis=0` and X is encoded as a CSR matrix;
- If `axis=1` and X is encoded as a CSC matrix.
Attributes
----------
statistics_ : array of shape (n_features,)
The imputation fill value for each feature if axis == 0.
Notes
-----
- When ``axis=0``, columns which only contained missing values at `fit`
are discarded upon `transform`.
- When ``axis=1``, an exception is raised if there are rows for which it is
not possible to fill in the missing values (e.g., because they only
contain missing values).
"""
def __init__(self, missing_values="NaN", strategy="mean",
axis=0, verbose=0, copy=True):
self.missing_values = missing_values
self.strategy = strategy
self.axis = axis
self.verbose = verbose
self.copy = copy
def fit(self, X, y=None):
"""Fit the imputer on X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Input data, where ``n_samples`` is the number of samples and
``n_features`` is the number of features.
Returns
-------
self : SimpleImputer
"""
# Check parameters
allowed_strategies = ["mean", "median", "most_frequent"]
if self.strategy not in allowed_strategies:
raise ValueError("Can only use these strategies: {0} "
" got strategy={1}".format(allowed_strategies,
self.strategy))
if self.axis not in [0, 1]:
raise ValueError("Can only impute missing values on axis 0 and 1, "
" got axis={0}".format(self.axis))
# Since two different arrays can be provided in fit(X) and
# transform(X), the imputation data will be computed in transform()
# when the imputation is done per sample (i.e., when axis=1).
if self.axis == 0:
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True)
if sparse.issparse(X):
self.statistics_ = self._sparse_fit(X,
self.strategy,
self.missing_values,
self.axis)
else:
self.statistics_ = self._dense_fit(X,
self.strategy,
self.missing_values,
self.axis)
return self
def _sparse_fit(self, X, strategy, missing_values, axis):
"""Fit the transformer on sparse data."""
# Imputation is done "by column", so if we want to do it
# by row we only need to convert the matrix to csr format.
if axis == 1:
X = X.tocsr()
else:
X = X.tocsc()
# Count the zeros
if missing_values == 0:
n_zeros_axis = np.zeros(X.shape[not axis], dtype=int)
else:
n_zeros_axis = X.shape[axis] - np.diff(X.indptr)
# Mean
if strategy == "mean":
if missing_values != 0:
n_non_missing = n_zeros_axis
# Mask the missing elements
mask_missing_values = _get_mask(X.data, missing_values)
mask_valids = np.logical_not(mask_missing_values)
# Sum only the valid elements
new_data = X.data.copy()
new_data[mask_missing_values] = 0
X = sparse.csc_matrix((new_data, X.indices, X.indptr),
copy=False)
sums = X.sum(axis=0)
# Count the elements != 0
mask_non_zeros = sparse.csc_matrix(
(mask_valids.astype(np.float64),
X.indices,
X.indptr), copy=False)
s = mask_non_zeros.sum(axis=0)
n_non_missing = np.add(n_non_missing, s)
else:
sums = X.sum(axis=axis)
n_non_missing = np.diff(X.indptr)
# Ignore the error, columns with a np.nan statistics_
# are not an error at this point. These columns will
# be removed in transform
with np.errstate(all="ignore"):
return np.ravel(sums) / np.ravel(n_non_missing)
# Median + Most frequent
else:
# Remove the missing values, for each column
columns_all = np.hsplit(X.data, X.indptr[1:-1])
mask_missing_values = _get_mask(X.data, missing_values)
mask_valids = np.hsplit(np.logical_not(mask_missing_values),
X.indptr[1:-1])
# astype necessary for bug in numpy.hsplit before v1.9
columns = [col[mask.astype(bool, copy=False)]
for col, mask in zip(columns_all, mask_valids)]
# Median
if strategy == "median":
median = np.empty(len(columns))
for i, column in enumerate(columns):
median[i] = _get_median(column, n_zeros_axis[i])
return median
# Most frequent
elif strategy == "most_frequent":
most_frequent = np.empty(len(columns))
for i, column in enumerate(columns):
most_frequent[i] = _most_frequent(column,
0,
n_zeros_axis[i])
return most_frequent
def _dense_fit(self, X, strategy, missing_values, axis):
"""Fit the transformer on dense data."""
X = check_array(X, force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True)
mask = _get_mask(X, missing_values)
masked_X = ma.masked_array(X, mask=mask)
# Mean
if strategy == "mean":
mean_masked = np.ma.mean(masked_X, axis=axis)
# Avoid the warning "Warning: converting a masked element to nan."
mean = np.ma.getdata(mean_masked)
mean[np.ma.getmask(mean_masked)] = np.nan
return mean
# Median
elif strategy == "median":
median_masked = np.ma.median(masked_X, axis=axis)
# Avoid the warning "Warning: converting a masked element to nan."
median = np.ma.getdata(median_masked)
median[np.ma.getmaskarray(median_masked)] = np.nan
return median
# Most frequent
elif strategy == "most_frequent":
# scipy.stats.mstats.mode cannot be used because it will no work
# properly if the first element is masked and if its frequency
# is equal to the frequency of the most frequent valid element
# See https://github.com/scipy/scipy/issues/2636
# To be able access the elements by columns
if axis == 0:
X = X.transpose()
mask = mask.transpose()
most_frequent = np.empty(X.shape[0])
for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):
row_mask = np.logical_not(row_mask).astype(np.bool)
row = row[row_mask]
most_frequent[i] = _most_frequent(row, np.nan, 0)
return most_frequent
def transform(self, X):
"""Impute all missing values in X.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
The input data to complete.
"""
if self.axis == 0:
check_is_fitted(self, 'statistics_')
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True,
copy=self.copy)
statistics = self.statistics_
if X.shape[1] != statistics.shape[0]:
raise ValueError("X has %d features per sample, expected %d"
% (X.shape[1], self.statistics_.shape[0]))
# Since two different arrays can be provided in fit(X) and
# transform(X), the imputation data need to be recomputed
# when the imputation is done per sample
else:
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True,
copy=self.copy)
if sparse.issparse(X):
statistics = self._sparse_fit(X,
self.strategy,
self.missing_values,
self.axis)
else:
statistics = self._dense_fit(X,
self.strategy,
self.missing_values,
self.axis)
# Delete the invalid rows/columns
invalid_mask = np.isnan(statistics)
valid_mask = np.logical_not(invalid_mask)
valid_statistics = statistics[valid_mask]
valid_statistics_indexes = np.flatnonzero(valid_mask)
missing = np.arange(X.shape[not self.axis])[invalid_mask]
if self.axis == 0 and invalid_mask.any():
if self.verbose:
warnings.warn("Deleting features without "
"observed values: %s" % missing)
X = X[:, valid_statistics_indexes]
elif self.axis == 1 and invalid_mask.any():
raise ValueError("Some rows only contain "
"missing values: %s" % missing)
# Do actual imputation
if sparse.issparse(X) and self.missing_values != 0:
mask = _get_mask(X.data, self.missing_values)
indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int),
np.diff(X.indptr))[mask]
X.data[mask] = valid_statistics[indexes].astype(X.dtype,
copy=False)
else:
if sparse.issparse(X):
X = X.toarray()
mask = _get_mask(X, self.missing_values)
n_missing = np.sum(mask, axis=self.axis)
values = np.repeat(valid_statistics, n_missing)
if self.axis == 0:
coordinates = np.where(mask.transpose())[::-1]
else:
coordinates = mask
X[coordinates] = values
return X