@@ -80,6 +80,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i
80
80
self .drop_invariant = drop_invariant
81
81
self .drop_cols = []
82
82
self .verbose = verbose
83
+ self .use_default_cols = cols is None # important when we call fit() repeatedly
83
84
self .cols = cols
84
85
self ._dim = None
85
86
self .mapping = None
@@ -122,12 +123,11 @@ def fit(self, X, y, **kwargs):
122
123
self ._dim = X .shape [1 ]
123
124
124
125
# if columns aren't passed, just use every string column
125
- if self .cols is None :
126
+ if self .use_default_cols :
126
127
self .cols = get_obj_cols (X )
127
128
128
- _ , categories = self .leave_one_out (
129
+ categories = self .fit_leave_one_out (
129
130
X , y ,
130
- mapping = self .mapping ,
131
131
cols = self .cols ,
132
132
impute_missing = self .impute_missing ,
133
133
handle_unknown = self .handle_unknown
@@ -183,10 +183,9 @@ def transform(self, X, y=None):
183
183
184
184
if not self .cols :
185
185
return X
186
- X , _ = self .leave_one_out (
186
+ X = self .transform_leave_one_out (
187
187
X , y ,
188
188
mapping = self .mapping ,
189
- cols = self .cols ,
190
189
impute_missing = self .impute_missing ,
191
190
handle_unknown = self .handle_unknown
192
191
)
@@ -209,74 +208,77 @@ def fit_transform(self, X, y=None, **fit_params):
209
208
"""
210
209
return self .fit (X , y , ** fit_params ).transform (X , y )
211
210
212
- def leave_one_out (self , X_in , y , mapping = None , cols = None , impute_missing = True , handle_unknown = 'impute' ):
213
- """
214
- Leave one out encoding uses a single column of floats to represent the means of the target variables.
215
- """
216
-
211
def fit_leave_one_out(self, X_in, y, cols=None, impute_missing=True, handle_unknown='impute'):
    """
    Learn the per-category target statistics used by leave one out encoding.

    For every column in ``cols`` the target ``y`` is grouped by the column's
    values and the sum, count and mean of each group are recorded. The global
    target mean is stored on ``self._mean`` as a side effect.

    Parameters
    ----------
    X_in : DataFrame holding the categorical columns. It is only read.
    y : pandas Series of target values; it is grouped by each column of
        ``X_in``, so its index has to line up with the index of ``X_in``.
    cols : iterable of column names to fit. ``None`` means every column
        of ``X_in``.
    impute_missing, handle_unknown : accepted for interface compatibility
        only. They influence how data is transformed, not which statistics
        are learned, so they are not used here.

    Returns
    -------
    list of dict
        One ``{'col': name, 'mapping': {value: {'sum', 'count', 'mean'}}}``
        entry per fitted column, in the order of ``cols``.
    """
    if cols is None:
        cols = X_in.columns.values

    # Global target mean: used at transform time for categories seen only
    # once and for imputing unknown categories.
    self._mean = y.mean()

    mapping_out = []
    for col in cols:
        # Only the group statistics are needed to build the mapping; encoding
        # the training frame itself is the job of transform_leave_one_out, so
        # no copy of X_in is made and nothing is written back to it.
        stats = y.groupby(X_in[col]).agg(['sum', 'count'])
        stats['mean'] = stats['sum'] / stats['count']
        mapping_out.append({'col': col, 'mapping': stats.to_dict(orient='index')})

    return mapping_out
245
+
246
def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'):
    """
    Leave one out encoding uses a single column of floats to represent the means of the target variables.

    Each column listed in ``mapping`` is replaced by a float column. The
    encoded column is re-created under its original name, which places it
    after the untouched columns in the returned frame.

    Parameters
    ----------
    X_in : DataFrame to encode. It is deep-copied and never modified.
    y : target values or ``None``. With ``None`` every known category is
        replaced by its fitted mean. Otherwise the row's own target is left
        out: ``(sum - y) / (count - 1)``, and categories that were fitted
        from a single row fall back to ``self._mean``.
    mapping : list of ``{'col': name, 'mapping': {value: {'sum', 'count',
        'mean'}}}`` dicts, as returned by ``fit_leave_one_out``.
    impute_missing : when true, categories absent from the mapping are
        handled according to ``handle_unknown``; otherwise they stay NaN.
    handle_unknown : ``'impute'`` fills unknown categories with
        ``self._mean``; ``'error'`` raises instead.

    Returns
    -------
    DataFrame
        Encoded copy of ``X_in``.

    Raises
    ------
    ValueError
        If ``mapping`` is ``None``, or if ``handle_unknown == 'error'`` and
        a column contains a category that is not in the mapping.
    """
    if mapping is None:
        # Iterating over None would fail with an opaque TypeError.
        raise ValueError('mapping must be provided; fit the encoder before calling transform')

    X = X_in.copy(deep=True)

    # Noise is only applied when targets are supplied, so the random state is
    # only needed (and only created) in that case.
    add_noise = self.randomized and y is not None
    random_state_ = check_random_state(self.random_state) if add_noise else None

    for switch in mapping:
        col = switch.get('col')
        tmp_col = str(col) + '_tmp'

        # Encoded values are collected in a scratch column so that comparisons
        # against the raw categories keep working while it is being filled.
        X[tmp_col] = np.nan
        for val, stats in switch.get('mapping').items():
            rows = X[col] == val
            if y is None:
                X.loc[rows, tmp_col] = stats['mean']
            elif stats['count'] == 1:
                # Leaving out the only observation would divide by zero.
                X.loc[rows, tmp_col] = self._mean
            else:
                X.loc[rows, tmp_col] = (stats['sum'] - y[rows.values]) / (stats['count'] - 1)
        del X[col]
        X.rename(columns={tmp_col: col}, inplace=True)

        if impute_missing:
            if handle_unknown == 'impute':
                # Assign the result back explicitly: an in-place fillna on a
                # column selection does not reliably write through to X.
                X[col] = X[col].fillna(self._mean)
            elif handle_unknown == 'error':
                if X[col].isnull().any():
                    raise ValueError('Unexpected categories found in column %s' % col)

        if add_noise:
            # Multiplicative gaussian noise centred on 1.
            X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0])

        X[col] = X[col].astype(float).values.reshape(-1, )

    return X
0 commit comments