Skip to content

Commit 2394855

Browse files
committed
Merge remote-tracking branch 'origin/master'
# Conflicts:
#	category_encoders/tests/test_encoders.py
2 parents 3d42d20 + b1993e0 commit 2394855

File tree

2 files changed

+701
-490
lines changed

2 files changed

+701
-490
lines changed

category_encoders/leave_one_out.py

+68 -66
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i
8080
self.drop_invariant = drop_invariant
8181
self.drop_cols = []
8282
self.verbose = verbose
83+
self.use_default_cols = cols is None # important when we call fit() repeatedly
8384
self.cols = cols
8485
self._dim = None
8586
self.mapping = None
@@ -122,12 +123,11 @@ def fit(self, X, y, **kwargs):
122123
self._dim = X.shape[1]
123124

124125
# if columns aren't passed, just use every string column
125-
if self.cols is None:
126+
if self.use_default_cols:
126127
self.cols = get_obj_cols(X)
127128

128-
_, categories = self.leave_one_out(
129+
categories = self.fit_leave_one_out(
129130
X, y,
130-
mapping=self.mapping,
131131
cols=self.cols,
132132
impute_missing=self.impute_missing,
133133
handle_unknown=self.handle_unknown
@@ -183,10 +183,9 @@ def transform(self, X, y=None):
183183

184184
if not self.cols:
185185
return X
186-
X, _ = self.leave_one_out(
186+
X = self.transform_leave_one_out(
187187
X, y,
188188
mapping=self.mapping,
189-
cols=self.cols,
190189
impute_missing=self.impute_missing,
191190
handle_unknown=self.handle_unknown
192191
)
@@ -209,74 +208,77 @@ def fit_transform(self, X, y=None, **fit_params):
209208
"""
210209
return self.fit(X, y, **fit_params).transform(X, y)
211210

212-
def leave_one_out(self, X_in, y, mapping=None, cols=None, impute_missing=True, handle_unknown='impute'):
213-
"""
214-
Leave one out encoding uses a single column of floats to represent the means of the target variables.
215-
"""
216-
211+
def fit_leave_one_out(self, X_in, y, cols=None, impute_missing=True, handle_unknown='impute'):
217212
X = X_in.copy(deep=True)
218213

219214
if cols is None:
220215
cols = X.columns.values
221216

222-
if mapping is not None:
223-
mapping_out = mapping
224-
random_state_ = check_random_state(self.random_state)
225-
for switch in mapping:
226-
X[str(switch.get('col')) + '_tmp'] = np.nan
227-
for val in switch.get('mapping'):
228-
if y is None:
229-
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = \
230-
switch.get('mapping')[val]['mean']
231-
elif switch.get('mapping')[val]['count'] == 1:
232-
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = self._mean
233-
else:
234-
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = (
235-
(switch.get('mapping')[val]['sum'] - y[(X[switch.get('col')] == val).values]) / (
236-
switch.get('mapping')[val]['count'] - 1)
237-
)
238-
del X[switch.get('col')]
239-
X.rename(columns={str(switch.get('col')) + '_tmp': switch.get('col')}, inplace=True)
240-
241-
if impute_missing:
242-
if handle_unknown == 'impute':
243-
X[switch.get('col')].fillna(self._mean, inplace=True)
244-
elif handle_unknown == 'error':
245-
missing = X[switch.get('col')].isnull()
246-
if any(missing):
247-
raise ValueError('Unexpected categories found in column %s' % switch.get('col'))
248-
249-
if self.randomized and y is not None:
250-
X[switch.get('col')] = (X[switch.get('col')] *
251-
random_state_.normal(1., self.sigma, X[switch.get('col')].shape[0]))
252-
253-
X[switch.get('col')] = X[switch.get('col')].astype(float).values.reshape(-1, )
254-
else:
255-
self._mean = y.mean()
256-
mapping_out = []
257-
258-
for col in cols:
259-
tmp = y.groupby(X[col]).agg(['sum', 'count'])
260-
tmp['mean'] = tmp['sum'] / tmp['count']
261-
tmp = tmp.to_dict(orient='index')
262-
263-
X[str(col) + '_tmp'] = np.nan
264-
for val in tmp:
265-
"""if the val only appear once ,encoder it as mean of y"""
266-
if tmp[val]['count'] == 1:
267-
X.loc[X[col] == val, str(col) + '_tmp'] = self._mean
268-
else:
269-
X.loc[X[col] == val, str(col) + '_tmp'] = (tmp[val]['sum'] - y.loc[X[col] == val]) / (
217+
self._mean = y.mean()
218+
mapping_out = []
219+
220+
for col in cols:
221+
tmp = y.groupby(X[col]).agg(['sum', 'count'])
222+
tmp['mean'] = tmp['sum'] / tmp['count']
223+
tmp = tmp.to_dict(orient='index')
224+
225+
X[str(col) + '_tmp'] = np.nan
226+
for val in tmp:
227+
"""if the val only appear once ,encoder it as mean of y"""
228+
if tmp[val]['count'] == 1:
229+
X.loc[X[col] == val, str(col) + '_tmp'] = self._mean
230+
else:
231+
X.loc[X[col] == val, str(col) + '_tmp'] = (tmp[val]['sum'] - y.loc[X[col] == val]) / (
270232
tmp[val]['count'] - 1)
271-
del X[col]
272-
X.rename(columns={str(col) + '_tmp': col}, inplace=True)
233+
del X[col]
234+
X.rename(columns={str(col) + '_tmp': col}, inplace=True)
273235

274-
if impute_missing:
275-
if handle_unknown == 'impute':
276-
X[col].fillna(self._mean, inplace=True)
236+
if impute_missing:
237+
if handle_unknown == 'impute':
238+
X[col].fillna(self._mean, inplace=True)
277239

278-
X[col] = X[col].astype(float).values.reshape(-1, )
240+
X[col] = X[col].astype(float).values.reshape(-1, )
279241

280-
mapping_out.append({'col': col, 'mapping': tmp}, )
242+
mapping_out.append({'col': col, 'mapping': tmp}, )
243+
244+
return mapping_out
245+
246+
def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'):
247+
"""
248+
Leave one out encoding uses a single column of floats to represent the means of the target variables.
249+
"""
250+
251+
X = X_in.copy(deep=True)
281252

282-
return X, mapping_out
253+
random_state_ = check_random_state(self.random_state)
254+
for switch in mapping:
255+
X[str(switch.get('col')) + '_tmp'] = np.nan
256+
for val in switch.get('mapping'):
257+
if y is None:
258+
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = \
259+
switch.get('mapping')[val]['mean']
260+
elif switch.get('mapping')[val]['count'] == 1:
261+
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = self._mean
262+
else:
263+
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = (
264+
(switch.get('mapping')[val]['sum'] - y[(X[switch.get('col')] == val).values]) / (
265+
switch.get('mapping')[val]['count'] - 1)
266+
)
267+
del X[switch.get('col')]
268+
X.rename(columns={str(switch.get('col')) + '_tmp': switch.get('col')}, inplace=True)
269+
270+
if impute_missing:
271+
if handle_unknown == 'impute':
272+
X[switch.get('col')].fillna(self._mean, inplace=True)
273+
elif handle_unknown == 'error':
274+
missing = X[switch.get('col')].isnull()
275+
if any(missing):
276+
raise ValueError('Unexpected categories found in column %s' % switch.get('col'))
277+
278+
if self.randomized and y is not None:
279+
X[switch.get('col')] = (X[switch.get('col')] *
280+
random_state_.normal(1., self.sigma, X[switch.get('col')].shape[0]))
281+
282+
X[switch.get('col')] = X[switch.get('col')].astype(float).values.reshape(-1, )
283+
284+
return X

0 commit comments

Comments
 (0)