diff --git a/.travis.yml b/.travis.yml index 86a71a65..3694c279 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,10 +15,10 @@ env: matrix: # The versions should match the minimal requirements in requirements.txt and setup.py - DISTRIB="conda" PYTHON_VERSION="2.7" CYTHON_VERSION="0.21" - NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.20.1" PATSY_VERSION="0.4.1" + NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.21.1" PATSY_VERSION="0.4.1" SCIKIT_VERSION="0.17.1" SCIPY_VERSION="0.17.0" STATSMODELS_VERSION="0.6.1" - DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" CYTHON_VERSION="0.23.4" - NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.20.1" PATSY_VERSION="0.4.1" + NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.21.1" PATSY_VERSION="0.4.1" SCIKIT_VERSION="0.17.1" SCIPY_VERSION="0.17.0" STATSMODELS_VERSION="0.6.1" install: source ci_scripts/install.sh diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py index 285a7cf6..2705ef9d 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -24,11 +24,13 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause + unexpected changes in dimension in some cases. 
+ handle_missing: str + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. Example @@ -82,14 +84,15 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.mapping = mapping - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.cols = cols self.ordinal_encoder = None self._dim = None @@ -128,12 +131,16 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -141,9 +148,11 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: - values = switch.get('mapping').get_values() - column_mapping = self.fit_backward_difference_coding(values) - mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) + values = switch.get('mapping') + col = switch.get('col') + + column_mapping = self.fit_backward_difference_coding(col, values, self.handle_missing, self.handle_unknown) + 
mappings_out.append({'col': col, 'mapping': column_mapping, }) self.mapping = mappings_out @@ -180,6 +189,10 @@ def transform(self, X, override_return_df=False): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -194,6 +207,11 @@ def transform(self, X, override_return_df=False): return X X = self.ordinal_encoder.transform(X) + + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X = self.backward_difference_coding(X, mapping=self.mapping) if self.drop_invariant: @@ -206,14 +224,32 @@ def transform(self, X, override_return_df=False): return X.values @staticmethod - def fit_backward_difference_coding(values): + def fit_backward_difference_coding(col, values, handle_missing, handle_unknown): + if handle_missing == 'value': + values = values[values > 0] + + values_to_encode = values.get_values() + if len(values) < 2: - return pd.DataFrame() + return pd.DataFrame(index=values_to_encode) + + if handle_unknown == 'indicator': + values_to_encode = np.append(values_to_encode, -1) + + backwards_difference_matrix = Diff().code_without_intercept(values_to_encode) + df = pd.DataFrame(data=backwards_difference_matrix.matrix, index=values_to_encode, + columns=[str(col) + '_%d' % (i, ) for i in range(len(backwards_difference_matrix.column_suffixes))]) + + if handle_unknown == 'return_nan': + df.loc[-1] = np.nan + elif handle_unknown == 'value': + df.loc[-1] = np.zeros(len(values_to_encode) - 1) + + if handle_missing == 'return_nan': + df.loc[values.loc[np.nan]] = np.nan + elif handle_missing == 'value': + df.loc[-2] = np.zeros(len(values_to_encode) - 1) - backwards_difference_matrix = Diff().code_without_intercept(values) - df = pd.DataFrame(data=backwards_difference_matrix.matrix, 
columns=backwards_difference_matrix.column_suffixes) - df.index += 1 - df.loc[0] = np.zeros(len(values) - 1) return df @staticmethod @@ -230,19 +266,17 @@ def backward_difference_coding(X_in, mapping): for switch in mapping: col = switch.get('col') mod = switch.get('mapping') - new_columns = [] - for i in range(len(mod.columns)): - c = mod.columns[i] - new_col = str(col) + '_%d' % (i, ) - X[new_col] = mod[c].loc[X[col]].values - new_columns.append(new_col) + + base_df = mod.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) + old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = new_columns + cols[old_column_index: old_column_index + 1] = mod.columns cols = ['intercept'] + cols - X = X.reindex(columns=cols) - return X + return X.reindex(columns=cols) def get_feature_names(self): """ diff --git a/category_encoders/basen.py b/category_encoders/basen.py index bf038db8..7ff83f81 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -6,6 +6,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util +import warnings __author__ = 'willmcginnis' @@ -28,11 +29,9 @@ class BaseNEncoder(BaseEstimator, TransformerMixin): boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). base: int when the downstream model copes well with nonlinearities (like decision tree), use higher base. - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes + options are 'error', 'return_nan' and 'value', defaults to 'value'. 
Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. Example @@ -49,20 +48,20 @@ class BaseNEncoder(BaseEstimator, TransformerMixin): RangeIndex: 506 entries, 0 to 505 Data columns (total 18 columns): - CHAS_0 506 non-null int64 - CHAS_1 506 non-null int64 - RAD_0 506 non-null int64 - RAD_1 506 non-null int64 - RAD_2 506 non-null int64 - RAD_3 506 non-null int64 - RAD_4 506 non-null int64 CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 + CHAS_0 506 non-null int64 + CHAS_1 506 non-null int64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 + RAD_0 506 non-null int64 + RAD_1 506 non-null int64 + RAD_2 506 non-null int64 + RAD_3 506 non-null int64 + RAD_4 506 non-null int64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 @@ -73,20 +72,20 @@ class BaseNEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, base=2, impute_missing=True, - handle_unknown='impute'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, base=2, + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.cols = cols + self.mapping = mapping self.ordinal_encoder = None self._dim = None self.base = base self._encoded_columns = None - self.digits_per_col = {} self.feature_names = None def fit(self, X, y=None, **kwargs): @@ -121,17 +120,20 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be 
encoded can not contain null') + # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) - for col in self.cols: - self.digits_per_col[col] = self.calc_required_digits(X, col) + self.mapping = self.fit_base_n_encoding(X) # do a transform on the training data to get a column list X_temp = self.transform(X, override_return_df=True) @@ -152,6 +154,38 @@ def fit(self, X, y=None, **kwargs): return self + def fit_base_n_encoding(self, X): + mappings_out = [] + + for switch in self.ordinal_encoder.category_mapping: + col = switch.get('col') + values = switch.get('mapping') + + if self.handle_missing == 'value': + values = values[values > 0] + + if self.handle_unknown == 'indicator': + values = np.append(values, -1) + + digits = self.calc_required_digits(values) + X_unique = pd.DataFrame(index=values, + columns=[str(col) + '_%d' % x for x in range(digits)], + data=np.array([self.col_transform(x, digits) for x in range(1, len(values) + 1)])) + + if self.handle_unknown == 'return_nan': + X_unique.loc[-1] = np.nan + elif self.handle_unknown == 'value': + X_unique.loc[-1] = 0 + + if self.handle_missing == 'return_nan': + X_unique.loc[values.loc[np.nan]] = np.nan + elif self.handle_missing == 'value': + X_unique.loc[-2] = 0 + + mappings_out.append({'col': col, 'mapping': X_unique}) + + return mappings_out + def transform(self, X, override_return_df=False): """Perform the transformation to new categorical data. 
@@ -168,6 +202,10 @@ def transform(self, X, override_return_df=False): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -182,6 +220,11 @@ def transform(self, X, override_return_df=False): return X X_out = self.ordinal_encoder.transform(X) + + if self.handle_unknown == 'error': + if X_out[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X_out = self.basen_encode(X_out, cols=self.cols) if self.drop_invariant: @@ -189,8 +232,8 @@ def transform(self, X, override_return_df=False): X_out.drop(col, 1, inplace=True) # impute missing values only in the generated columns - generated_cols = util.get_generated_cols(X, X_out, self.cols) - X_out[generated_cols] = X_out[generated_cols].fillna(value=0.0) + # generated_cols = util.get_generated_cols(X, X_out, self.cols) + # X_out[generated_cols] = X_out[generated_cols].fillna(value=0.0) if self.return_df or override_return_df: return X_out @@ -232,25 +275,25 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'impute': - for col in self.cols: - if any(X[col] == -1): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category -1 when encode %s" % (col,)) - for switch in self.ordinal_encoder.mapping: column_mapping = switch.get('mapping') inverse = pd.Series(data=column_mapping.index, index=column_mapping.get_values()) X[switch.get('col')] = X[switch.get('col')].map(inverse).astype(switch.get('data_type')) + if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': + for col in self.cols: + if X[switch.get('col')].isnull().any(): + warnings.warn("inverse_transform is not supported because transform impute " + 
"the unknown category nan when encode %s" % (col,)) + return X if self.return_df else X.values - def calc_required_digits(self, X, col): + def calc_required_digits(self, values): # figure out how many digits we need to represent the classes present if self.base == 1: - digits = len(X[col].unique()) + 1 + digits = len(values) + 1 else: - digits = int(np.ceil(math.log(len(X[col].unique()), self.base))) + 1 + digits = int(np.ceil(math.log(len(values), self.base))) + 1 return digits @@ -270,30 +313,20 @@ def basen_encode(self, X_in, cols=None): X = X_in.copy(deep=True) - if cols is None: - cols = X.columns.values - pass_thru = [] - else: - pass_thru = [col for col in X.columns.values if col not in cols] - - bin_cols = [] - for col in cols: - # get how many digits we need to represent the classes present - digits = self.calc_required_digits(X, col) + cols = X.columns.values.tolist() - # map the ordinal column into a list of these digits, of length digits - X[col] = X[col].map(lambda x: self.col_transform(x, digits)) + for switch in self.mapping: + col = switch.get('col') + mod = switch.get('mapping') - for dig in range(digits): - X[str(col) + '_%d' % (dig,)] = X[col].map(lambda r: int(r[dig]) if r is not None else None) - bin_cols.append(str(col) + '_%d' % (dig,)) + base_df = mod.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) - if self._encoded_columns is None: - X = X.reindex(columns=bin_cols + pass_thru) - else: - X = X.reindex(columns=self._encoded_columns) + old_column_index = cols.index(col) + cols[old_column_index: old_column_index + 1] = mod.columns - return X + return X.reindex(columns=cols) def basen_to_integer(self, X, cols, base): """ @@ -316,10 +349,7 @@ def basen_to_integer(self, X, cols, base): for col in cols: col_list = [col0 for col0 in out_cols if str(col0).startswith(str(col))] - for col0 in col_list: - if any(X[col0].isnull()): - raise ValueError("inverse_transform is not supported because transform 
impute" - "the unknown category -1 when encode %s" % (col,)) + if base == 1: value_array = np.array([int(col0.split('_')[-1]) for col0 in col_list]) else: diff --git a/category_encoders/binary.py b/category_encoders/binary.py index f89fb66c..71683892 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -1,11 +1,9 @@ """Binary encoding""" -import copy import pandas as pd -import numpy as np from sklearn.base import BaseEstimator, TransformerMixin -from category_encoders.ordinal import OrdinalEncoder -import category_encoders.utils as util + +import category_encoders as ce __author__ = 'willmcginnis' @@ -24,11 +22,13 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause + unexpected changes in dimension in some cases. + handle_missing: str + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. 
Example @@ -45,20 +45,20 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): RangeIndex: 506 entries, 0 to 505 Data columns (total 18 columns): - CHAS_0 506 non-null int64 - CHAS_1 506 non-null int64 - RAD_0 506 non-null int64 - RAD_1 506 non-null int64 - RAD_2 506 non-null int64 - RAD_3 506 non-null int64 - RAD_4 506 non-null int64 CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 + CHAS_0 506 non-null int64 + CHAS_1 506 non-null int64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 + RAD_0 506 non-null int64 + RAD_1 506 non-null int64 + RAD_2 506 non-null int64 + RAD_3 506 non-null int64 + RAD_4 506 non-null int64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 @@ -69,19 +69,11 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='impute'): - self.return_df = return_df - self.drop_invariant = drop_invariant - self.drop_cols = [] - self.verbose = verbose - self.impute_missing = impute_missing - self.handle_unknown = handle_unknown - self.cols = cols - self.ordinal_encoder = None - self._dim = None - self.digits_per_col = {} - self.feature_names = None + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value'): + self.base_n_encoder = ce.BaseNEncoder(base=2, verbose=verbose, cols=cols, mapping=mapping, + drop_invariant=drop_invariant, return_df=return_df, + handle_unknown=handle_unknown, handle_missing=handle_missing) def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. 
@@ -103,45 +95,7 @@ def fit(self, X, y=None, **kwargs): """ - # if the input dataset isn't already a dataframe, convert it to one (using default column names) - # first check the type - X = util.convert_input(X) - - self._dim = X.shape[1] - - # if columns aren't passed, just use every string column - if self.cols is None: - self.cols = util.get_obj_cols(X) - else: - self.cols = util.convert_cols_to_list(self.cols) - - # train an ordinal pre-encoder - self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, - cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown - ) - X = X.drop_duplicates(subset=self.cols) if self.cols else X - self.ordinal_encoder = self.ordinal_encoder.fit(X) - - for col in self.cols: - self.digits_per_col[col] = self.calc_required_digits(X, col) - - X_temp = self.transform(X, override_return_df=True) - self.feature_names = X_temp.columns.tolist() - - # drop all output columns with 0 variance. - if self.drop_invariant: - self.drop_cols = [] - generated_cols = util.get_generated_cols(X, X_temp, self.cols) - self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] - try: - [self.feature_names.remove(x) for x in self.drop_cols] - except KeyError as e: - if self.verbose > 0: - print("Could not remove column from feature names." 
- "Not found in generated cols.\n{}".format(e)) + self.base_n_encoder.fit(X, y, **kwargs) return self @@ -161,31 +115,7 @@ def transform(self, X, override_return_df=False): """ - if self._dim is None: - raise ValueError('Must train encoder before it can be used to transform data.') - - # first check the type - X = util.convert_input(X) - - # then make sure that it is the right size - if X.shape[1] != self._dim: - raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) - - if not self.cols: - return X if self.return_df else X.values - - X = self.ordinal_encoder.transform(X) - - X = self.binary(X, cols=self.cols) - - if self.drop_invariant: - for col in self.drop_cols: - X.drop(col, 1, inplace=True) - - if self.return_df or override_return_df: - return X - else: - return X.values + return self.base_n_encoder.transform(X) def inverse_transform(self, X_in): """ @@ -199,142 +129,9 @@ def inverse_transform(self, X_in): ------- p: array, the same size of X_in - """ - X = X_in.copy(deep=True) - - # first check the type - X = util.convert_input(X) - - if self._dim is None: - raise ValueError('Must train encoder before it can be used to inverse_transform data') - - X = self.binary_to_integer(X, self.cols) - - # then make sure that it is the right size - if X.shape[1] != self._dim: - if self.drop_invariant: - raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should " - "set as False when transform data" % (X.shape[1],)) - else: - raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) - - if not self.cols: - return X if self.return_df else X.values - - if self.impute_missing and self.handle_unknown == 'impute': - for col in self.cols: - if any(X[col] == -1): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category -1 when encode %s" % (col,)) - - for switch in self.ordinal_encoder.mapping: - column_mapping = switch.get('mapping') - 
inverse = pd.Series(data=column_mapping.index, index=column_mapping.get_values()) - X[switch.get('col')] = X[switch.get('col')].map(inverse).astype(switch.get('data_type')) - - return X if self.return_df else X.values - - def binary(self, X_in, cols=None): - """ - Binary encoding encodes the integers as binary code with one column per digit. - - Parameters - ---------- - X_in: DataFrame - cols: list-like, default None - Column names in the DataFrame to be encoded - Returns - ------- - dummies : DataFrame - """ - - X = X_in.copy(deep=True) - - if cols is None: - cols = X.columns.values - pass_thru = [] - else: - pass_thru = [col for col in X.columns.values if col not in cols] - - output = [] - bin_cols = [] - for col in cols: - # get how many digits we need to represent the classes present - digits = self.digits_per_col[col] - - X_unique = pd.DataFrame(index=X[col].unique()) - # map the ordinal column into a list of these digits, of length digits - X_unique_to_cols = X_unique.index.map(lambda x: self.col_transform(x, digits)) - - for dig in range(digits): - X_unique[str(col) + '_%d' % (dig, )] = X_unique_to_cols.map( - lambda r: int(r[dig]) if r is not None else None) - bin_cols.append(str(col) + '_%d' % (dig,)) - - output.append(X[[col]].merge( - X_unique, how='left', left_on=col, right_index=True).drop(labels=col, axis=1)) - - if pass_thru: - output.append(X[pass_thru]) - X = pd.concat(output, axis=1).reindex(columns=bin_cols + pass_thru) - - return X - - def binary_to_integer(self, X, cols): - """ - Convert binary code as integers. 
- - Parameters - ---------- - X : DataFrame - encoded data - cols : list-like - Column names in the DataFrame that be encoded - - Returns - ------- - numerical: DataFrame - """ - out_cols = X.columns.values - - for col in cols: - col_list = [col0 for col0 in out_cols if str(col0).startswith(str(col))] - for col0 in col_list: - if any(X[col0].isnull()): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category -1 when encode %s" % (col,)) - - len0 = len(col_list) - value_array = np.array([2 ** (len0 - 1 - i) for i in range(len0)]) - - X[col] = np.dot(X[col_list].values, value_array.T) - out_cols = [col0 for col0 in out_cols if col0 not in col_list] - - X = X.reindex(columns=out_cols + cols) - - return X - - @staticmethod - def calc_required_digits(X, col): - """ - figure out how many digits we need to represent the classes present - """ - return int(np.ceil(np.log2(len(X[col].unique())))) + 1 - - @staticmethod - def col_transform(col, digits): - """ - The lambda body to transform the column values """ - if col is None or np.isnan(col) or float(col) < 0.0: - return None - else: - col = list("{0:b}".format(int(col))) - if len(col) == digits: - return col - else: - return [str(0) for _ in range(digits - len(col))] + col + return self.base_n_encoder.inverse_transform(X_in) def get_feature_names(self): """ @@ -347,7 +144,4 @@ def get_feature_names(self): Note: potentially dropped features are not included! """ - if not isinstance(self.feature_names, list): - raise ValueError('Must fit data first. 
Affected feature names are not known before.') - else: - return self.feature_names + return self.base_n_encoder.get_feature_names() diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index fd1b9d88..5ec83fae 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -25,11 +25,13 @@ class HelmertEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause + unexpected changes in dimension in some cases. + handle_missing: str + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. 
Example @@ -40,7 +42,7 @@ class HelmertEncoder(BaseEstimator, TransformerMixin): >>> bunch = load_boston() >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names) - >>> enc = HelmertEncoder(cols=['CHAS', 'RAD']).fit(X, y) + >>> enc = HelmertEncoder(cols=['CHAS', 'RAD'], handle_unknown='value', handle_missing='value').fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) @@ -82,14 +84,15 @@ class HelmertEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, + handle_unknown='indicator', handle_missing='indicator'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.mapping = mapping - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.cols = cols self.ordinal_encoder = None self._dim = None @@ -126,11 +129,15 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -138,10 +145,11 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: + values = switch.get('mapping') + col = switch.get('col') - values = switch.get('mapping').get_values() - column_mapping = self.fit_helmert_coding(values) - mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) + column_mapping = 
self.fit_helmert_coding(col, values, self.handle_missing, self.handle_unknown) + mappings_out.append({'col': col, 'mapping': column_mapping, }) self.mapping = mappings_out @@ -177,6 +185,10 @@ def transform(self, X, override_return_df=False): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -192,6 +204,10 @@ def transform(self, X, override_return_df=False): X = self.ordinal_encoder.transform(X) + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X = self.helmert_coding(X, mapping=self.mapping) if self.drop_invariant: @@ -204,14 +220,33 @@ def transform(self, X, override_return_df=False): return X.values @staticmethod - def fit_helmert_coding(values): + def fit_helmert_coding(col, values, handle_missing, handle_unknown): + if handle_missing == 'value': + values = values[values > 0] + + values_to_encode = values.get_values() + if len(values) < 2: - return pd.DataFrame() + return pd.DataFrame(index=values_to_encode) + + if handle_unknown == 'indicator': + values_to_encode = np.append(values_to_encode, -1) + + helmert_contrast_matrix = Helmert().code_without_intercept(values_to_encode) + df = pd.DataFrame(data=helmert_contrast_matrix.matrix, index=values_to_encode, + columns=[str(col) + '_%d' % (i,) for i in + range(len(helmert_contrast_matrix.column_suffixes))]) + + if handle_unknown == 'return_nan': + df.loc[-1] = np.nan + elif handle_unknown == 'value': + df.loc[-1] = np.zeros(len(values_to_encode) - 1) + + if handle_missing == 'return_nan': + df.loc[values.loc[np.nan]] = np.nan + elif handle_missing == 'value': + df.loc[-2] = np.zeros(len(values_to_encode) - 1) - helmert_contrast_matrix = Helmert().code_without_intercept(values) - df = 
pd.DataFrame(data=helmert_contrast_matrix.matrix, columns=helmert_contrast_matrix.column_suffixes) - df.index += 1 - df.loc[0] = np.zeros(len(values) - 1) return df @staticmethod @@ -228,19 +263,17 @@ def helmert_coding(X_in, mapping): for switch in mapping: col = switch.get('col') mod = switch.get('mapping') - new_columns = [] - for i in range(len(mod.columns)): - c = mod.columns[i] - new_col = str(col) + '_%d' % (i, ) - X[new_col] = mod[c].loc[X[col]].values - new_columns.append(new_col) + + base_df = mod.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) + old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = new_columns + cols[old_column_index: old_column_index + 1] = mod.columns cols = ['intercept'] + cols - X = X.reindex(columns=cols) - return X + return X.reindex(columns=cols) def get_feature_names(self): """ diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index a8231830..202c3e96 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -26,12 +26,8 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes - unexpected changes in dimension in some cases. + options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the target mean. 
sigma: float adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). sigma gives the standard deviation (spread or "width") of the normal distribution. @@ -74,8 +70,8 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748#143154. """ - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='impute', random_state=None, sigma=None): + def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value', random_state=None, sigma=None): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -84,8 +80,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i self.cols = cols self._dim = None self.mapping = None - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self._mean = None self.random_state = random_state self.sigma = sigma @@ -116,7 +112,7 @@ def fit(self, X, y, **kwargs): if isinstance(y, pd.DataFrame): y = y.iloc[:, 0].astype(float) else: - y = pd.Series(y, name='target', dtype=float) + y = pd.Series(y, name='target', index=X.index) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") @@ -128,6 +124,10 @@ def fit(self, X, y, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + categories = self.fit_leave_one_out( X, y, cols=self.cols @@ -170,6 +170,10 @@ def transform(self, X, y=None, override_return_df=False): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not 
contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -185,7 +189,7 @@ def transform(self, X, y=None, override_return_df=False): if isinstance(y, pd.DataFrame): y = y.iloc[:, 0].astype(float) else: - y = pd.Series(y, name='target', dtype=float) + y = pd.Series(y, name='target', index=X.index) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") @@ -193,9 +197,7 @@ def transform(self, X, y=None, override_return_df=False): return X X = self.transform_leave_one_out( X, y, - mapping=self.mapping, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + mapping=self.mapping ) if self.drop_invariant: @@ -223,9 +225,24 @@ def fit_leave_one_out(self, X_in, y, cols=None): cols = X.columns.values self._mean = y.mean() - return {col: y.groupby(X[col]).agg(['sum', 'count']) for col in cols} - def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'): + return {col: self.fit_column_map(X[col], y) for col in cols} + + def fit_column_map(self, series, y): + category = pd.Categorical(series) + + categories = category.categories + codes = category.codes.copy() + + codes[codes == -1] = len(categories) + categories = np.append(categories, np.nan) + + return_map = pd.Series(dict([(code, category) for code, category in enumerate(categories)])) + + result = y.groupby(codes).agg(['sum', 'count']) + return result.rename(return_map) + + def transform_leave_one_out(self, X_in, y, mapping=None): """ Leave one out encoding uses a single column of floats to represent the means of the target variables. 
""" @@ -235,6 +252,16 @@ def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, ha for col, colmap in mapping.items(): level_notunique = colmap['count'] > 1 + + unique_train = colmap.index + unseen_values = pd.Series([x for x in X[col].unique() if x not in unique_train]) + + is_nan = X[col].isnull() + is_unknown_value = X[col].isin(unseen_values.dropna()) + + if self.handle_unknown == 'error' and is_unknown_value.any(): + raise ValueError('Columns to be encoded can not contain new values') + if y is None: # Replace level with its mean target; if level occurs only once, use global mean level_means = (colmap['sum'] / colmap['count']).where(level_notunique, self._mean) X[col] = X[col].map(level_means) @@ -245,12 +272,15 @@ def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, ha # The 'where' fills in singleton levels (count = 1 -> div by 0) with the global mean X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notnull(), self._mean) - if impute_missing: - if handle_unknown == 'impute': - X[col].fillna(self._mean, inplace=True) - elif handle_unknown == 'error': - if X[col].isnull().any(): - raise ValueError('Unexpected categories found in column %s' % col) + if self.handle_unknown == 'value': + X.loc[is_unknown_value, col] = self._mean + elif self.handle_unknown == 'return_nan': + X.loc[is_unknown_value, col] = np.nan + + if self.handle_missing == 'value': + X.loc[is_nan & unseen_values.isnull().any(), col] = self._mean + elif self.handle_missing == 'return_nan': + X.loc[is_nan, col] = np.nan if self.sigma is not None and y is not None: X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0]) diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index 1e1fedb8..bc64c5c5 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -1,7 +1,7 @@ """One-hot or dummy coding""" import numpy as np import pandas as pd -import copy +import warnings from 
sklearn.base import BaseEstimator, TransformerMixin from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util @@ -23,11 +23,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in the dimension in some cases. use_cat_names: bool if True, category values will be included in the encoded column names. Since this can result into duplicate column names, duplicates are suffixed with '#' symbol until a unique name is generated. 
@@ -41,7 +39,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): >>> bunch = load_boston() >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names) - >>> enc = OneHotEncoder(cols=['CHAS', 'RAD']).fit(X, y) + >>> enc = OneHotEncoder(cols=['CHAS', 'RAD'], handle_unknown='indicator').fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) @@ -86,8 +84,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): """ - - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute', use_cat_names=False): + def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, + handle_missing='value', handle_unknown='value', use_cat_names=False): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -96,8 +94,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i self.cols = cols self.ordinal_encoder = None self._dim = None - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.use_cat_names = use_cat_names self.feature_names = None @@ -136,13 +134,15 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) - # Indicate no transformation has been applied yet + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) self.mapping = self.generate_mapping() @@ -169,13 +169,18 @@ def generate_mapping(self): for switch in self.ordinal_encoder.mapping: col = switch.get('col') - column_mapping = switch.get('mapping').copy(deep=True) + values = 
switch.get('mapping').copy(deep=True) + + if self.handle_missing == 'value': + values = values[values > 0] + + if len(values) == 0: + continue - if self.handle_unknown == 'impute': - column_mapping = column_mapping.append(pd.Series(data=[-1], index=['-1'])) + index = [] + new_columns = [] - col_mappings = [] - for cat_name, class_ in column_mapping.iteritems(): + for cat_name, class_ in values.iteritems(): if self.use_cat_names: n_col_name = str(col) + '_%s' % (cat_name,) found_count = found_column_counts.get(n_col_name, 0) @@ -183,9 +188,34 @@ def generate_mapping(self): n_col_name += '#' * found_count else: n_col_name = str(col) + '_%s' % (class_,) - col_mappings.append({'new_col_name': n_col_name, 'val': class_}) - mapping.append({'col': col, 'mapping': col_mappings}) + index.append(class_) + new_columns.append(n_col_name) + + if self.handle_unknown == 'indicator': + n_col_name = str(col) + '_%s' % (-1,) + if self.use_cat_names: + found_count = found_column_counts.get(n_col_name, 0) + found_column_counts[n_col_name] = found_count + 1 + n_col_name += '#' * found_count + new_columns.append(n_col_name) + index.append(-1) + + base_matrix = np.eye(N=len(index), dtype=np.int) + base_df = pd.DataFrame(data=base_matrix, columns=new_columns, index=index) + + if self.handle_unknown == 'value': + base_df.loc[-1] = 0 + elif self.handle_unknown == 'return_nan': + base_df.loc[-1] = np.nan + + if self.handle_missing == 'return_nan': + base_df.loc[values.loc[np.nan]] = np.nan + elif self.handle_missing == 'value': + base_df.loc[-2] = 0 + + mapping.append({'col': col, 'mapping': base_df}) + return mapping def transform(self, X, override_return_df=False): @@ -204,6 +234,10 @@ def transform(self, X, override_return_df=False): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError( 'Must train encoder before it can be used to transform data.') @@ 
-221,7 +255,11 @@ def transform(self, X, override_return_df=False): X = self.ordinal_encoder.transform(X) - X = self.get_dummies(X, mapping=self.mapping) + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + + X = self.get_dummies(X) if self.drop_invariant: for col in self.drop_cols: @@ -268,20 +306,20 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'impute': - for col in self.cols: - if any(X[col] == -1): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category -1 when encode %s"%(col,)) - for switch in self.ordinal_encoder.mapping: column_mapping = switch.get('mapping') inverse = pd.Series(data=column_mapping.index, index=column_mapping.get_values()) X[switch.get('col')] = X[switch.get('col')].map(inverse).astype(switch.get('data_type')) + if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': + for col in self.cols: + if X[switch.get('col')].isnull().any(): + warnings.warn("inverse_transform is not supported because transform impute " + "the unknown category nan when encode %s" % (col,)) + return X if self.return_df else X.values - def get_dummies(self, X_in, mapping): + def get_dummies(self, X_in): """ Convert numerical variable into dummy variables @@ -300,19 +338,20 @@ def get_dummies(self, X_in, mapping): cols = X.columns.values.tolist() - for switch in mapping: + for switch in self.mapping: col = switch.get('col') mod = switch.get('mapping') - new_columns = [] - for column_mapping in mod: - new_col_name = column_mapping['new_col_name'] - val = column_mapping['val'] - X[new_col_name] = (X[col] == val).astype(int) - new_columns.append(new_col_name) + + base_df = mod.reindex(X[col]) + base_df = base_df.set_index(X.index) + X = pd.concat([base_df, X], axis=1) + old_column_index = 
cols.index(col) - cols[old_column_index: old_column_index + 1] = new_columns + cols[old_column_index: old_column_index + 1] = mod.columns + + X = X.reindex(columns=cols) - return X.reindex(columns=cols) + return X def reverse_dummies(self, X, mapping): """ @@ -338,9 +377,11 @@ def reverse_dummies(self, X, mapping): cols.append(col) X[col] = 0 - for column_mapping in mod: - existing_col = column_mapping.get('new_col_name') - val = column_mapping.get('val') + positive_indexes = mod.index[mod.index > 0] + for i in range(positive_indexes.shape[0]): + existing_col = mod.columns[i] + val = positive_indexes[i] + X.loc[X[existing_col] == 1, col] = val mapped_columns.append(existing_col) diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index b811051b..c2d2e75b 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -4,6 +4,7 @@ import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin import category_encoders.utils as util +import warnings __author__ = 'willmcginnis' @@ -81,16 +82,16 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='impute'): + def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.cols = cols self.mapping = mapping - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self._dim = None self.feature_names = None @@ -129,12 +130,16 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + _, categories = 
self.ordinal_encoding( X, mapping=self.mapping, cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + handle_missing=self.handle_missing ) self.mapping = categories @@ -175,6 +180,10 @@ def transform(self, X, override_return_df=False): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError( 'Must train encoder before it can be used to transform data.') @@ -193,8 +202,8 @@ def transform(self, X, override_return_df=False): X, mapping=self.mapping, cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + handle_missing=self.handle_missing ) if self.drop_invariant: @@ -208,7 +217,9 @@ def transform(self, X, override_return_df=False): def inverse_transform(self, X_in): """ - Perform the inverse transformation to encoded data. + Perform the inverse transformation to encoded data. Will attempt best case reconstruction, which means + it will return nan for handle_missing and handle_unknown settings that break the bijection. We issue + warnings when some of those cases occur. 
Parameters ---------- @@ -239,11 +250,17 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'impute': + if self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category -1 when encode %s" % (col,)) + warnings.warn("inverse_transform is not supported because transform impute " + "the unknown category -1 when encode %s" % (col,)) + + if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': + for col in self.cols: + if X[col].isnull().any(): + warnings.warn("inverse_transform is not supported because transform impute " + "the unknown category nan when encode %s" % (col,)) for switch in self.mapping: column_mapping = switch.get('mapping') @@ -253,13 +270,15 @@ def inverse_transform(self, X_in): return X if self.return_df else X.values @staticmethod - def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_unknown='impute'): + def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', handle_missing='value'): """ Ordinal encoding uses a single column of integers to represent the classes. An optional mapping dict can be passed in, in this case we use the knowledge that there is some true order to the classes themselves. Otherwise, the classes are assumed to have no true order and integers are selected at random. 
""" + return_nan_series = pd.Series(data=[np.nan], index=[-2]) + X = X_in.copy(deep=True) if cols is None: @@ -276,34 +295,37 @@ def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_ except ValueError as e: X[column] = X[column].astype(float) - if impute_missing: - if handle_unknown == 'impute': - X[column].fillna(0, inplace=True) - elif handle_unknown == 'error': - missing = X[column].isnull() - if any(missing): - raise ValueError('Unexpected categories found in column %s' % column) + if handle_unknown == 'value': + X[column].fillna(-1, inplace=True) + elif handle_unknown == 'error': + missing = X[column].isnull() + if any(missing): + raise ValueError('Unexpected categories found in column %s' % column) + + if handle_missing == 'return_nan': + X[column] = X[column].map(return_nan_series).where(X[column] == -2, X[column]) else: mapping_out = [] for col in cols: + nan_identity = np.nan + if util.is_category(X[col].dtype): categories = X[col].cat.categories else: - categories = [x for x in pd.unique( - X[col].values) if x is not None] + categories = X[col].unique() - index = [] - values = [] + index = pd.Series(categories).fillna(nan_identity).unique() - for i in range(len(categories)): - index.append(categories[i]) - values.append(i + 1) + data = pd.Series(index=index, data=range(1, len(index) + 1)) - mapping = pd.Series(data=values, index=index) + if handle_missing == 'value' and ~data.index.isnull().any(): + data.loc[nan_identity] = -2 + elif handle_missing == 'return_nan': + data.loc[nan_identity] = -2 - mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, ) + mapping_out.append({'col': col, 'mapping': data, 'data_type': X[col].dtype}, ) return X, mapping_out diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index bb5bd070..7b3b8316 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -24,12 +24,14 @@ class PolynomialEncoder(BaseEstimator, 
TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in the dimension in some cases. + handle_missing: str + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'value'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has missing values. This can cause + unexpected changes in dimension in some cases. 
Example ------- @@ -81,14 +83,15 @@ class PolynomialEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.mapping = mapping - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.cols = cols self.ordinal_encoder = None self._dim = None @@ -126,12 +129,16 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -139,8 +146,9 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: - values = switch.get('mapping').get_values() - column_mapping = self.fit_polynomial_coding(values) + values = switch.get('mapping') + col = switch.get('col') + column_mapping = self.fit_polynomial_coding(col, values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out @@ -178,6 +186,10 @@ def transform(self, X, override_return_df=False): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it 
can be used to transform data.') @@ -193,6 +205,10 @@ def transform(self, X, override_return_df=False): X = self.ordinal_encoder.transform(X) + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X = self.polynomial_coding(X, self.mapping) if self.drop_invariant: @@ -205,14 +221,32 @@ def transform(self, X, override_return_df=False): return X.values @staticmethod - def fit_polynomial_coding(values): + def fit_polynomial_coding(col, values, handle_missing, handle_unknown): + if handle_missing == 'value': + values = values[values > 0] + + values_to_encode = values.get_values() + if len(values) < 2: - return pd.DataFrame() + return pd.DataFrame(index=values_to_encode) + + if handle_unknown == 'indicator': + values_to_encode = np.append(values_to_encode, -1) + + polynomial_contrast_matrix = Poly().code_without_intercept(values_to_encode) + df = pd.DataFrame(data=polynomial_contrast_matrix.matrix, index=values_to_encode, + columns=[str(col) + '_%d' % (i, ) for i in range(len(polynomial_contrast_matrix.column_suffixes))]) + + if handle_unknown == 'return_nan': + df.loc[-1] = np.nan + elif handle_unknown == 'value': + df.loc[-1] = np.zeros(len(values_to_encode) - 1) + + if handle_missing == 'return_nan': + df.loc[values.loc[np.nan]] = np.nan + elif handle_missing == 'value': + df.loc[-2] = np.zeros(len(values_to_encode) - 1) - polynomial_contrast_matrix = Poly().code_without_intercept(values) - df = pd.DataFrame(data=polynomial_contrast_matrix.matrix, columns=polynomial_contrast_matrix.column_suffixes) - df.index += 1 - df.loc[0] = np.zeros(len(values) - 1) return df @staticmethod @@ -229,19 +263,17 @@ def polynomial_coding(X_in, mapping): for switch in mapping: col = switch.get('col') mod = switch.get('mapping') - new_columns = [] - for i in range(len(mod.columns)): - c = mod.columns[i] - new_col = str(col) + '_%d' % (i, ) - X[new_col] = mod[c].loc[X[col]].values - 
new_columns.append(new_col) + + base_df = mod.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) + old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = new_columns + cols[old_column_index: old_column_index + 1] = mod.columns cols = ['intercept'] + cols - X = X.reindex(columns=cols) - return X + return X.reindex(columns=cols) def get_feature_names(self): """ diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index f8d25226..d207ee67 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -24,12 +24,14 @@ class SumEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in the dimension in some cases. + handle_missing: str + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'value'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has missing values. This can cause + unexpected changes in dimension in some cases. 
Example ------- @@ -81,14 +83,15 @@ class SumEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.mapping = mapping - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing=handle_missing self.cols = cols self.ordinal_encoder = None self._dim = None @@ -126,12 +129,16 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -139,8 +146,9 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: - values = switch.get('mapping').tolist() - column_mapping = self.fit_sum_coding(values) + values = switch.get('mapping') + col = switch.get('col') + column_mapping = self.fit_sum_coding(col, values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out @@ -179,6 +187,10 @@ def transform(self, X, override_return_df=False): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform 
data.') @@ -194,6 +206,10 @@ def transform(self, X, override_return_df=False): X = self.ordinal_encoder.transform(X) + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X = self.sum_coding(X, mapping=self.mapping) if self.drop_invariant: @@ -206,14 +222,32 @@ def transform(self, X, override_return_df=False): return X.values @staticmethod - def fit_sum_coding(values): + def fit_sum_coding(col, values, handle_missing, handle_unknown): + if handle_missing == 'value': + values = values[values > 0] + + values_to_encode = values.get_values() + if len(values) < 2: - return pd.DataFrame() + return pd.DataFrame(index=values_to_encode) + + if handle_unknown == 'indicator': + values_to_encode = np.append(values_to_encode, -1) + + sum_contrast_matrix = Sum().code_without_intercept(values_to_encode.tolist()) + df = pd.DataFrame(data=sum_contrast_matrix.matrix, index=values_to_encode, + columns=[str(col) + '_%d' % (i, ) for i in range(len(sum_contrast_matrix.column_suffixes))]) + + if handle_unknown == 'return_nan': + df.loc[-1] = np.nan + elif handle_unknown == 'value': + df.loc[-1] = np.zeros(len(values_to_encode) - 1) + + if handle_missing == 'return_nan': + df.loc[values.loc[np.nan]] = np.nan + elif handle_missing == 'value': + df.loc[-2] = np.zeros(len(values_to_encode) - 1) - sum_contrast_matrix = Sum().code_without_intercept(values) - df = pd.DataFrame(data=sum_contrast_matrix.matrix, columns=sum_contrast_matrix.column_suffixes) - df.index += 1 - df.loc[0] = np.zeros(len(values) - 1) return df @staticmethod @@ -230,19 +264,17 @@ def sum_coding(X_in, mapping): for switch in mapping: col = switch.get('col') mod = switch.get('mapping') - new_columns = [] - for i in range(len(mod.columns)): - c = mod.columns[i] - new_col = str(col) + '_%d' % (i, ) - X[new_col] = mod[c].loc[X[col]].values - new_columns.append(new_col) + + base_df = mod.loc[X[col]] + base_df.set_index(X.index, 
inplace=True) + X = pd.concat([base_df, X], axis=1) + old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = new_columns + cols[old_column_index: old_column_index + 1] = mod.columns cols = ['intercept'] + cols - X = X.reindex(columns=cols) - return X + return X.reindex(columns=cols) def get_feature_names(self): """ diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index 5c36c871..80c812d4 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -2,14 +2,15 @@ import numpy as np import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin +from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util __author__ = 'chappers' class TargetEncoder(BaseEstimator, TransformerMixin): - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='impute', min_samples_leaf=1, smoothing=1.0): + def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', + handle_unknown='value', min_samples_leaf=1, smoothing=1.0): """Target encoding for categorical features. For the case of categorical target: features are replaced with a blend of posterior probability of the target given particular categorical value and prior probability of the target over all the training data. @@ -27,12 +28,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. 
Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause - unexpected changes in the dimension in some cases. + options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the target mean. min_samples_leaf: int minimum samples to take category average into account. smoothing: float @@ -81,12 +78,13 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i self.drop_cols = [] self.verbose = verbose self.cols = cols + self.ordinal_encoder = None self.min_samples_leaf = min_samples_leaf self.smoothing = float(smoothing) # Make smoothing a float so that python 2 does not treat as integer division self._dim = None self.mapping = None - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing=handle_missing self._mean = None self.feature_names = None @@ -110,7 +108,7 @@ def fit(self, X, y, **kwargs): if isinstance(y, pd.DataFrame): y = y.iloc[:, 0] else: - y = pd.Series(y, name='target') + y = pd.Series(y, name='target', index=X.index) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") @@ -122,21 +120,26 @@ def fit(self, X, y, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) - _, self.mapping = self.target_encode( - X, y, - mapping=None, + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + + self.ordinal_encoder = OrdinalEncoder( + verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown, - smoothing_in=self.smoothing, - min_samples_leaf=self.min_samples_leaf + handle_unknown='value', + handle_missing='value' ) - + self.ordinal_encoder = self.ordinal_encoder.fit(X) + X_ordinal = self.ordinal_encoder.transform(X) + self.mapping = self.fit_target_encoding(X_ordinal, y) + 
X_temp = self.transform(X, override_return_df=True) self.feature_names = list(X_temp.columns) if self.drop_invariant: self.drop_cols = [] + X_temp = self.transform(X) generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: @@ -148,6 +151,35 @@ def fit(self, X, y, **kwargs): return self + def fit_target_encoding(self, X, y): + mapping = {} + + for switch in self.ordinal_encoder.category_mapping: + col = switch.get('col') + values = switch.get('mapping') + + prior = self._mean = y.mean() + + stats = y.groupby(X[col]).agg(['count', 'mean']) + + smoove = 1 / (1 + np.exp(-(stats['count'] - self.min_samples_leaf) / self.smoothing)) + smoothing = prior * (1 - smoove) + stats['mean'] * smoove + smoothing[stats['count'] == 1] = prior + + if self.handle_unknown == 'return_nan': + smoothing.loc[-1] = np.nan + elif self.handle_unknown == 'value': + smoothing.loc[-1] = prior + + if self.handle_missing == 'return_nan': + smoothing.loc[values.loc[np.nan]] = np.nan + elif self.handle_missing == 'value': + smoothing.loc[-2] = prior + + mapping[col] = smoothing + + return mapping + def transform(self, X, y=None, override_return_df=False): """Perform the transformation to new categorical data. Parameters @@ -155,13 +187,17 @@ def transform(self, X, y=None, override_return_df=False): X : array-like, shape = [n_samples, n_features] y : array-like, shape = [n_samples] when transform by leave one out None, when transform without target info (such as transform test set) - + Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. 
""" + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -183,15 +219,14 @@ def transform(self, X, y=None, override_return_df=False): if not self.cols: return X - X, _ = self.target_encode( - X, y, - mapping=self.mapping, - cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown, - min_samples_leaf=self.min_samples_leaf, - smoothing_in=self.smoothing - ) + + X = self.ordinal_encoder.transform(X) + + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Unexpected categories found in dataframe') + + X = self.target_encode(X) if self.drop_invariant: for col in self.drop_cols: @@ -211,31 +246,13 @@ def fit_transform(self, X, y=None, **fit_params): """ return self.fit(X, y, **fit_params).transform(X, y) - def target_encode(self, X_in, y, mapping=None, cols=None, impute_missing=True, handle_unknown='impute', min_samples_leaf=1, smoothing_in=1.0): + def target_encode(self, X_in): X = X_in.copy(deep=True) - if cols is None: - cols = X.columns.values - - if mapping is not None: - for col in cols: - X[col] = X[col].map(mapping[col]) - if impute_missing: - if handle_unknown == 'impute': - X[col].fillna(self._mean, inplace=True) - elif handle_unknown == 'error': - if X[col].isnull().any(): - raise ValueError('Unexpected categories found in column %s' % col) - else: - mapping = {} - prior = self._mean = y.mean() - for col in cols: - stats = y.groupby(X[col]).agg(['count', 'mean']) - smoove = 1 / (1 + np.exp(-(stats['count'] - min_samples_leaf) / smoothing_in)) - smoothing = prior * (1 - smoove) + stats['mean'] * smoove - smoothing[stats['count'] == 1] = prior - mapping[col] = smoothing - - return X, mapping + + for col in self.cols: + X[col] = X[col].map(self.mapping[col]) + + return X def 
get_feature_names(self): """ diff --git a/category_encoders/tests/test_backward_difference.py b/category_encoders/tests/test_backward_difference.py index a3fb7cef..23f21961 100644 --- a/category_encoders/tests/test_backward_difference.py +++ b/category_encoders/tests/test_backward_difference.py @@ -1,6 +1,6 @@ import pandas as pd from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ - +import numpy as np import category_encoders as encoders @@ -10,7 +10,7 @@ def test_backwards_difference_encoder_preserve_dimension_1(self): train = ['A', 'B', 'C'] test = ['A', 'D', 'E'] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -23,7 +23,7 @@ def test_backwards_difference_encoder_preserve_dimension_2(self): train = ['A', 'B', 'C'] test = ['B', 'D', 'E'] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -36,7 +36,7 @@ def test_backwards_difference_encoder_preserve_dimension_3(self): train = ['A', 'B', 'C'] test = ['A', 'B', 'C', None] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -50,7 +50,7 @@ def test_backwards_difference_encoder_preserve_dimension_4(self): train = ['A', 'B', 'C'] test = ['D', 'B', 'C', None] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -63,7 +63,7 @@ def test_backwards_difference_encoder_preserve_dimension_4(self): def test_backwards_difference_encoder_2cols(self): train = [['A', 'A'], ['B', 'B'], ['C', 'C']] - encoder = 
encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) obtained = encoder.transform(train) @@ -80,9 +80,66 @@ def test_backwards_difference_encoder_2StringCols_ExpectCorrectOrder(self): }, columns=['col1', 'col2', 'col3', 'col4']) expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) columns = encoder.transform(train).columns.values self.assertItemsEqual(expected_columns, columns) + + def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): + train = ['A', 'B', np.nan] + + encoder = encoders.BackwardDifferenceEncoder(handle_missing='indicator') + result = encoder.fit_transform(train) + + expected = [[1, -2 / 3.0, -1 / 3.0], + [1, 1 / 3.0, -1 / 3.0], + [1, 1 / 3.0, 2 / 3.0]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.BackwardDifferenceEncoder(handle_missing='indicator') + result = encoder.fit_transform(train) + + expected = [[1, -2 / 3.0, -1 / 3.0], + [1, 1 / 3.0, -1 / 3.0]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = ['A', 'B'] + test = ['A', 'B', np.nan] + + encoder = encoders.BackwardDifferenceEncoder(handle_missing='indicator') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, -2 / 3.0, -1 / 3.0], + [1, 1 / 3.0, -1 / 3.0], + [1, 1 / 3.0, 2 / 3.0]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + train = ['A', 'B'] + test = ['A', 'B', 'C'] + + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='indicator') + 
encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, -2 / 3.0, -1 / 3.0], + [1, 1 / 3.0, -1 / 3.0], + [1, 1 / 3.0, 2 / 3.0]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='indicator') + result = encoder.fit_transform(train) + + expected = [[1, -2 / 3.0, -1 / 3.0], + [1, 1 / 3.0, -1 / 3.0]] + self.assertEqual(result.values.tolist(), expected) diff --git a/category_encoders/tests/test_basen.py b/category_encoders/tests/test_basen.py new file mode 100644 index 00000000..abca9a10 --- /dev/null +++ b/category_encoders/tests/test_basen.py @@ -0,0 +1,143 @@ +import pandas as pd +from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ +import numpy as np +import category_encoders as encoders +import warnings + + +class TestBaseNEncoder(TestCase): + + def test_fit_transform_have_base_2_expect_Correct_Encoding(self): + train = pd.Series(['a', 'b', 'c', 'd']) + + result = encoders.BaseNEncoder(base=2).fit_transform(train) + + self.assertEqual(4, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) + + def test_inverse_transform_HaveData_ExpectResultReturned(self): + train = pd.Series(list('abcd')).to_frame('letter') + + enc = encoders.BaseNEncoder(base=2) + result = enc.fit_transform(train) + inversed_result = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, inversed_result) + + def test_HaveIndicatorAndNanValue_ExpectNewColumn(self): + train = pd.Series(['a', 'b', 'c', np.nan]) + + result = encoders.BaseNEncoder(handle_missing='indicator', base=2).fit_transform(train) + + self.assertEqual(4, result.shape[0]) + self.assertListEqual([0, 0, 1], 
result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) + + def test_HandleMissingIndicator_HaveNoNan_ExpectThirdColumn(self): + train = pd.Series(['a', 'b', 'c']) + + result = encoders.BaseNEncoder(handle_missing='indicator', base=2).fit_transform(train) + + self.assertEqual(3, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = pd.Series(['a', 'b', 'c']) + test = pd.Series(['a', 'b', 'c', np.nan]) + + encoder = encoders.BaseNEncoder(handle_missing='indicator') + encoder.fit(train) + result = encoder.transform(test) + + self.assertEqual(4, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) + + def test_HandleUnknown_HaveUnknown_ExpectIndicatorInTest(self): + train = ['A', 'B', 'C'] + test = ['A', 'B', 'C', 'D'] + + encoder = encoders.BaseNEncoder(handle_unknown='indicator') + encoder.fit(train) + result = encoder.transform(test) + + self.assertEqual(4, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) + + def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.BaseNEncoder(handle_unknown='indicator') + result = encoder.fit_transform(train) + + self.assertEqual(2, result.shape[0]) + 
self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.BaseNEncoder(handle_missing='value', handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.BaseNEncoder(handle_missing='return_nan', handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.BaseNEncoder(handle_missing='return_nan', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + + with warnings.catch_warnings(record=True) as w: + enc.inverse_transform(result) + + self.assertEqual(1, len(w)) + self.assertEqual('inverse_transform is not supported because transform impute ' + 'the unknown category nan when encode city', str(w[0].message)) + + def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.BaseNEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveHandleMissingValueAndHandleUnknownReturnNan_ExpectBestInverse(self): + train = 
pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) + expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) + + enc = encoders.BaseNEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(expected, original) diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py index 5c6db408..8b61b25c 100644 --- a/category_encoders/tests/test_encoders.py +++ b/category_encoders/tests/test_encoders.py @@ -84,7 +84,7 @@ def test_impact_encoders(self): tu.verify_numeric(enc.transform(X_t, y_t)) # when we run transform(X, y) and there is a new value in X, something is wrong and we raise an error - enc = getattr(encoders, encoder_name)(impute_missing=True, handle_unknown='error', cols=['extra']) + enc = getattr(encoders, encoder_name)(handle_unknown='error', cols=['extra']) enc.fit(X, y) self.assertRaises(ValueError, enc.transform, (X_t, y_t)) @@ -128,6 +128,81 @@ def test_handle_unknown_error(self): with self.assertRaises(ValueError): _ = enc.transform(X_t) + def test_handle_missing_error(self): + non_null = pd.DataFrame({'city': ['chicago', 'los angeles'], 'color': ['red', np.nan]}) # only 'city' column is going to be transformed + has_null = pd.DataFrame({'city': ['chicago', np.nan], 'color': ['red', np.nan]}) + y = pd.Series([1, 0]) + + for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + with self.subTest(encoder_name=encoder_name): + + enc = getattr(encoders, encoder_name)(handle_missing='error', cols='city') + with self.assertRaises(ValueError): + enc.fit(has_null, y) + + enc.fit(non_null, y) # we raise an error only if a missing value is in one of the transformed columns + with self.assertRaises(ValueError): + enc.transform(has_null) + + def 
test_handle_unknown_return_nan(self): + train = pd.DataFrame({'city': ['chicago', 'los angeles']}) + test = pd.DataFrame({'city': ['chicago', 'denver']}) + y = pd.Series([1, 0]) + + for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + with self.subTest(encoder_name=encoder_name): + + enc = getattr(encoders, encoder_name)(handle_unknown='return_nan') + enc.fit(train, y) + result = enc.transform(test).iloc[1, :] + + if len(result) == 1: + self.assertTrue(result.isnull().all()) + else: + self.assertTrue(result[1:].isnull().all()) + + def test_handle_missing_return_nan_train(self): + X = pd.DataFrame({'city': ['chicago', 'los angeles', None]}) + y = pd.Series([1, 0, 1]) + + for encoder_name in ( set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + with self.subTest(encoder_name=encoder_name): + enc = getattr(encoders, encoder_name)(handle_missing='return_nan') + result = enc.fit_transform(X, y).iloc[2, :] + + if len(result) == 1: + self.assertTrue(result.isnull().all()) + else: + self.assertTrue(result[1:].isnull().all()) + + def test_handle_missing_return_nan_test(self): + X = pd.DataFrame({'city': ['chicago', 'los angeles', 'chicago']}) + X_t = pd.DataFrame({'city': ['chicago', 'los angeles', None]}) + y = pd.Series([1, 0, 1]) + + for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + with self.subTest(encoder_name=encoder_name): + enc = getattr(encoders, encoder_name)(handle_missing='return_nan') + result = enc.fit(X, y).transform(X_t).iloc[2, :] + + if len(result) == 1: + self.assertTrue(result.isnull().all()) + else: + self.assertTrue(result[1:].isnull().all()) + + def test_handle_unknown_value(self): + train = pd.DataFrame({'city': ['chicago', 'los angeles']}) + test = pd.DataFrame({'city': ['chicago', 'denver']}) + y = pd.Series([1, 0]) + + for encoder_name in 
(set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + with self.subTest(encoder_name=encoder_name): + + enc = getattr(encoders, encoder_name)(handle_unknown='value') + enc.fit(train, y) + result = enc.transform(test) + self.assertFalse(result.iloc[1, :].isnull().all()) + def test_sklearn_compliance(self): for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): @@ -157,11 +232,6 @@ def test_inverse_transform(self): enc.fit(X) tu.verify_inverse_transform(X_t, enc.inverse_transform(enc.transform(X_t))) - # when a new value is encountered, do not raise an exception - enc = getattr(encoders, encoder_name)(verbose=1, cols=cols) - enc.fit(X, y) - _ = enc.inverse_transform(enc.transform(X_t_extra)) - def test_types(self): X = pd.DataFrame({ 'Int': [1, 2, 1, 2], @@ -193,7 +263,7 @@ def test_preserve_column_order(self): for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): - print(encoder_name) + encoder = getattr(encoders, encoder_name)() result = encoder.fit_transform(binary_cat_example, binary_cat_example['target']) columns = result.columns.values diff --git a/category_encoders/tests/test_helmert.py b/category_encoders/tests/test_helmert.py index cb3fb52a..c62447fd 100644 --- a/category_encoders/tests/test_helmert.py +++ b/category_encoders/tests/test_helmert.py @@ -1,6 +1,6 @@ import pandas as pd from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ - +import numpy as np import category_encoders as encoders @@ -10,7 +10,7 @@ def test_helmert_preserve_dimension_1(self): train = ['A', 'B', 'C'] test = ['A', 'D', 'E'] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -23,7 +23,7 @@ def test_helmert_preserve_dimension_2(self): train = ['A', 'B', 'C'] test = ['B', 'D', 'E'] - encoder = encoders.HelmertEncoder() + 
encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -36,7 +36,7 @@ def test_helmert_preserve_dimension_3(self): train = ['A', 'B', 'C'] test = ['A', 'B', 'C', None] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -50,7 +50,7 @@ def test_helmert_preserve_dimension_4(self): train = ['A', 'B', 'C'] test = ['D', 'B', 'C', None] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -63,7 +63,7 @@ def test_helmert_preserve_dimension_4(self): def test_helmert_2cols(self): train = [['A', 'A'], ['B', 'B'], ['C', 'C']] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) obtained = encoder.transform(train) @@ -80,9 +80,66 @@ def test_helmert_2StringCols_ExpectCorrectOrder(self): }, columns=['col1', 'col2', 'col3', 'col4']) expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) columns = encoder.transform(train).columns.values self.assertItemsEqual(expected_columns, columns) + + def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): + train = ['A', 'B', np.nan] + + encoder = encoders.HelmertEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [[1, -1, -1], + [1, 1, -1], + [1, 0, 2]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.HelmertEncoder(handle_missing='indicator', 
handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [[1, -1, -1], + [1, 1, -1]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = ['A', 'B'] + test = ['A', 'B', np.nan] + + encoder = encoders.HelmertEncoder(handle_missing='indicator', handle_unknown='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, -1, -1], + [1, 1, -1], + [1, 0, 2]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + train = ['A', 'B'] + test = ['A', 'B', 'C'] + + encoder = encoders.HelmertEncoder(handle_unknown='indicator', handle_missing='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, -1, -1], + [1, 1, -1], + [1, 0, 2]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveOnlyKnown_ExpectExtraColumn(self): + train = ['A', 'B'] + + encoder = encoders.HelmertEncoder(handle_unknown='indicator', handle_missing='value') + result = encoder.fit_transform(train) + + expected = [[1, -1, -1], + [1, 1, -1]] + self.assertEqual(result.values.tolist(), expected) diff --git a/category_encoders/tests/test_leave_one_out.py b/category_encoders/tests/test_leave_one_out.py index 12e2f5af..4eb254fb 100644 --- a/category_encoders/tests/test_leave_one_out.py +++ b/category_encoders/tests/test_leave_one_out.py @@ -47,16 +47,72 @@ def test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self): mapping = encoder.mapping self.assertEqual(1, len(mapping)) self.assertIn('col_b', mapping) # the model should have the updated mapping - expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2']) - pd.testing.assert_frame_equal(expected, mapping['col_b'], check_like=True) + expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2'], columns=['sum', 'count']) + 
np.testing.assert_equal(expected.values, mapping['col_b'].values) def test_leave_one_out_unique(self): X = pd.DataFrame(data=['1', '2', '2', '2', '3'], columns=['col']) y = np.array([1, 0, 1, 0, 1]) - encoder = encoders.LeaveOneOutEncoder(impute_missing=False) + encoder = encoders.LeaveOneOutEncoder(handle_unknown='value') result = encoder.fit(X, y).transform(X, y) self.assertFalse(result.isnull().any().any(), 'There should not be any missing value') expected = pd.DataFrame(data=[y.mean(), 0.5, 0, 0.5, y.mean()], columns=['col']) pd.testing.assert_frame_equal(expected, result) + + def test_HandleMissingIsValueAndNanInTrain_ExpectAtValueSet(self): + df = pd.DataFrame({ + 'color': [np.nan, np.nan, np.nan, "b", "b", "b"], + 'outcome': [2, 2, 0, 1, 0, 1]}) + + X = df.drop('outcome', axis=1) + y = df.drop('color', axis=1) + + ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value') + obtained = ce_leave.fit_transform(X, y['outcome']) + + self.assertEqual([1, 1, 2, 0.5, 1.0, 0.5], list(obtained['color'])) + + def test_HandleMissingIsValueAndNanInTest_ExpectMean(self): + df = pd.DataFrame({ + 'color': ["a", "a", "a", "b", "b", "b"], + 'outcome': [1.6, 0, 0, 1, 0, 1]}) + + train = df.drop('outcome', axis=1) + target = df.drop('color', axis=1) + test = pd.Series([np.nan, 'b'], name='color') + test_target = pd.Series([0, 0]) + + ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value') + ce_leave.fit(train, target['outcome']) + obtained = ce_leave.transform(test, test_target) + + self.assertEqual([.6, 1.0], list(obtained['color'])) + + def test_HandleMissingIsValueAndNanInTestAndNoTestTarget_ExpectMean(self): + df = pd.DataFrame({ + 'color': ["a", "a", "a", "b", "b", "b"], + 'outcome': [1, 0, 0, 1, 0, 1]}) + + train = df.drop('outcome', axis=1) + target = df.drop('color', axis=1) + test = pd.Series([np.nan, 'b'], name='color') + + ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value') + ce_leave.fit(train, 
target['outcome']) + obtained = ce_leave.transform(test) + + self.assertEqual([.5, 2/3.0], list(obtained['color'])) + + def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self): + train = pd.Series(["a", "a", "a", "b", "b", "b"], name='color') + target = pd.Series([1.6, 0, 0, 1, 0, 1], name='target') + test = pd.Series(['b', 'c'], name='color') + test_target = pd.Series([0, 0]) + + ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_unknown='value') + ce_leave.fit(train, target) + obtained = ce_leave.transform(test, test_target) + + self.assertEqual([1.0, .6], list(obtained['color'])) diff --git a/category_encoders/tests/test_one_hot.py b/category_encoders/tests/test_one_hot.py index af663295..e7c248b6 100644 --- a/category_encoders/tests/test_one_hot.py +++ b/category_encoders/tests/test_one_hot.py @@ -1,7 +1,7 @@ import pandas as pd from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ import numpy as np - +import warnings import category_encoders.tests.test_utils as tu import category_encoders as encoders @@ -26,28 +26,29 @@ def test_one_hot(self): enc.transform(X_t[X_t['extra'] != 'A']).shape[1], 'We have to get the same count of columns') - enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True) + enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='indicator') enc.fit(X) out = enc.transform(X_t) self.assertIn('extra_-1', out.columns.values) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='ignore') + enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='return_nan') enc.fit(X) out = enc.transform(X_t) self.assertEqual(len([x for x in out.columns.values if str(x).startswith('extra_')]), 3) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='error') + enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='error') # The exception is already raised in fit() 
because transform() is called there to get # feature_names right. + enc.fit(X) with self.assertRaises(ValueError): - enc.fit(X_t) + enc.transform(X_t) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='ignore', use_cat_names=True) + enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='return_nan', use_cat_names=True) enc.fit(X) out = enc.transform(X_t) self.assertIn('extra_A', out.columns.values) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, use_cat_names=True) + enc = encoders.OneHotEncoder(verbose=1, return_df=True, use_cat_names=True, handle_unknown='indicator') enc.fit(X) out = enc.transform(X_t) self.assertIn('extra_-1', out.columns.values) @@ -55,7 +56,6 @@ def test_one_hot(self): # test inverse_transform X_i = tu.create_dataset(n_rows=100, has_none=False) X_i_t = tu.create_dataset(n_rows=50, has_none=False) - X_i_t_extra = tu.create_dataset(n_rows=50, extras=True, has_none=False) cols = ['underscore', 'none', 'extra', 321, 'categorical'] enc = encoders.OneHotEncoder(verbose=1, use_cat_names=True, cols=cols) @@ -64,7 +64,7 @@ def test_one_hot(self): tu.verify_inverse_transform(X_i_t, obtained) def test_fit_transform_HaveMissingValuesAndUseCatNames_ExpectCorrectValue(self): - encoder = encoders.OneHotEncoder(cols=[0], use_cat_names=True) + encoder = encoders.OneHotEncoder(cols=[0], use_cat_names=True, handle_unknown='indicator') result = encoder.fit_transform([[-1]]) @@ -89,10 +89,171 @@ def test_inverse_transform_HaveNoCatNames_ExpectCorrectInverseTransform(self): assert value.equals(inverse_transformed) def test_fit_transform_HaveColumnAppearTwice_ExpectColumnsDeduped(self): - encoder = encoders.OneHotEncoder(cols=['match', 'match_box'], use_cat_names=True) - value = pd.DataFrame({'match': pd.Series('box_-1'), 'match_box': pd.Series(-1)}) + encoder = encoders.OneHotEncoder(cols=['match', 'match_box'], use_cat_names=True, handle_unknown='indicator') + value = pd.DataFrame({'match': pd.Series('box_-1'), 
'match_box': pd.Series('-1')}) result = encoder.fit_transform(value) columns = result.columns.tolist() self.assertSetEqual({'match_box_-1', 'match_-1', 'match_box_-1#', 'match_box_-1##'}, set(columns)) + + def test_fit_transform_HaveHandleUnknownValueAndUnseenValues_ExpectAllZeroes(self): + train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) + test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) + expected_result = pd.DataFrame({'city_1': [1, 0], + 'city_2': [0, 0]}, + columns=['city_1', 'city_2']) + + enc = encoders.OneHotEncoder(handle_unknown='value') + result = enc.fit(train).transform(test) + + pd.testing.assert_frame_equal(expected_result, result) + + def test_fit_transform_HaveHandleUnknownValueAndSeenValues_ExpectMappingUsed(self): + train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) + expected_result = pd.DataFrame({'city_1': [1, 0], + 'city_2': [0, 1]}, + columns=['city_1', 'city_2']) + + enc = encoders.OneHotEncoder(handle_unknown='value') + result = enc.fit(train).transform(train) + + pd.testing.assert_frame_equal(expected_result, result) + + def test_fit_transform_HaveHandleUnknownIndicatorAndNoMissingValue_ExpectExtraColumn(self): + train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) + expected_result = pd.DataFrame({'city_1': [1, 0], + 'city_2': [0, 1], + 'city_-1': [0, 0]}, + columns=['city_1', 'city_2', 'city_-1']) + + enc = encoders.OneHotEncoder(handle_unknown='indicator') + result = enc.fit(train).transform(train) + + pd.testing.assert_frame_equal(expected_result, result) + + def test_fit_transform_HaveHandleUnknownIndicatorAndMissingValue_ExpectValueSet(self): + train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) + test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) + expected_result = pd.DataFrame({'city_1': [1, 0], + 'city_2': [0, 0], + 'city_-1': [0, 1]}, + columns=['city_1', 'city_2', 'city_-1']) + + enc = encoders.OneHotEncoder(handle_unknown='indicator') + result = enc.fit(train).transform(test) + + 
pd.testing.assert_frame_equal(expected_result, result) + + def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): + train = ['A', 'B', np.nan] + + encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [[1, 0, 0], + [0, 1, 0], + [0, 0, 1]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [[1, 0, 0], + [0, 1, 0]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = ['A', 'B'] + test = ['A', 'B', np.nan] + + encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, 0, 0], + [0, 1, 0], + [0, 0, 1]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + train = ['A', 'B'] + test = ['A', 'B', 'C'] + + encoder = encoders.OneHotEncoder(handle_unknown='indicator', handle_missing='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, 0, 0], + [0, 1, 0], + [0, 0, 1]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.OneHotEncoder(handle_unknown='indicator', handle_missing='value') + result = encoder.fit_transform(train) + + expected = [[1, 0, 0], + [0, 1, 0]] + self.assertEqual(result.values.tolist(), expected) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OneHotEncoder(handle_missing='value', handle_unknown='value') + 
result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OneHotEncoder(handle_missing='return_nan', handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.OneHotEncoder(handle_missing='return_nan', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + + with warnings.catch_warnings(record=True) as w: + enc.inverse_transform(result) + + self.assertEqual(1, len(w)) + self.assertEqual('inverse_transform is not supported because transform impute ' + 'the unknown category nan when encode city', str(w[0].message)) + + def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.OneHotEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveHandleMissingValueAndHandleUnknownReturnNan_ExpectBestInverse(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) + expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) + + enc = encoders.OneHotEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + 
pd.testing.assert_frame_equal(expected, original) diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py index e6fa0143..eba8d5d6 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -2,7 +2,7 @@ from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ import category_encoders.tests.test_utils as tu import numpy as np - +import warnings import category_encoders as encoders @@ -20,22 +20,22 @@ class TestOrdinalEncoder(TestCase): def test_ordinal(self): - enc = encoders.OrdinalEncoder(verbose=1, return_df=True, impute_missing=True) + enc = encoders.OrdinalEncoder(verbose=1, return_df=True) enc.fit(X) out = enc.transform(X_t) self.assertEqual(len(set(out['extra'].values)), 4) - self.assertIn(0, set(out['extra'].values)) + self.assertIn(-1, set(out['extra'].values)) self.assertFalse(enc.mapping is None) self.assertTrue(len(enc.mapping) > 0) - enc = encoders.OrdinalEncoder(verbose=1, mapping=enc.mapping, return_df=True, impute_missing=True) + enc = encoders.OrdinalEncoder(verbose=1, mapping=enc.mapping, return_df=True) enc.fit(X) out = enc.transform(X_t) self.assertEqual(len(set(out['extra'].values)), 4) - self.assertIn(0, set(out['extra'].values)) + self.assertIn(-1, set(out['extra'].values)) self.assertTrue(len(enc.mapping) > 0) - enc = encoders.OrdinalEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='ignore') + enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='return_nan') enc.fit(X) out = enc.transform(X_t) out_cats = [x for x in set(out['extra'].values) if np.isfinite(x)] @@ -44,20 +44,19 @@ def test_ordinal(self): def test_ordinal_dist(self): data = np.array([ - ['apple', None], - ['peach', 'lemon'] 
self.assertEqual(a.values[1, 1], 1) - - encoder = encoders.OrdinalEncoder(impute_missing=False) - encoder.fit(data) - a = encoder.transform(data) - self.assertTrue(np.isnan(a.values[0, 1])) - self.assertEqual(a.values[1, 1], 1) + encoder = encoders.OrdinalEncoder() + result = encoder.fit_transform(data) + self.assertEqual(2, len(result[0].unique()), "We expect two unique values in the column") + self.assertEqual(2, len(result[1].unique()), "We expect two unique values in the column") + self.assertFalse(np.isnan(result.values[1, 1])) + + encoder = encoders.OrdinalEncoder(handle_missing='return_nan') + result = encoder.fit_transform(data) + self.assertEqual(2, len(result[0].unique()), "We expect two unique values in the column") + self.assertEqual(2, len(result[1].unique()), "We expect two unique values in the column") def test_pandas_categorical(self): X = pd.DataFrame({ @@ -73,3 +72,130 @@ def test_pandas_categorical(self): self.assertEqual(3, out['Categorical'][1]) self.assertEqual(1, out['Categorical'][2]) self.assertEqual(2, out['Categorical'][3]) + + def test_handle_missing_have_nan_fit_time_expect_as_category(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='value') + out = enc.fit_transform(train) + + self.assertListEqual([1, 2], out['city'].tolist()) + + def test_handle_missing_have_nan_transform_time_expect_negative_2(self): + train = pd.DataFrame({'city': ['chicago', 'st louis']}) + test = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='value') + enc.fit(train) + out = enc.transform(test) + + self.assertListEqual([1, -2], out['city'].tolist()) + + def test_handle_unknown_have_new_value_expect_negative_1(self): + train = pd.DataFrame({'city': ['chicago', 'st louis']}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + expected = [1.0, -1.0] + + enc = encoders.OrdinalEncoder(handle_missing='return_nan') + enc.fit(train) + result = 
enc.transform(test)['city'].tolist() + + self.assertEqual(expected, result) + + def test_HaveNegativeOneInTrain_ExpectCodedAsOne(self): + train = pd.DataFrame({'city': [-1]}) + expected = [1] + + enc = encoders.OrdinalEncoder(cols=['city']) + result = enc.fit_transform(train)['city'].tolist() + + self.assertEqual(expected, result) + + def test_HaveNaNInTrain_ExpectCodedAsOne(self): + train = pd.DataFrame({'city': [np.nan]}) + expected = [1] + + enc = encoders.OrdinalEncoder(cols=['city']) + result = enc.fit_transform(train)['city'].tolist() + + self.assertEqual(expected, result) + + def test_HaveNoneAndNan_ExpectCodesAsOne(self): + train = pd.DataFrame({'city': [np.nan, None]}) + expected = [1, 1] + + enc = encoders.OrdinalEncoder(cols=['city']) + result = enc.fit_transform(train)['city'].tolist() + + self.assertEqual(expected, result) + + def test_inverse_transform_HaveUnknown_ExpectWarning(self): + train = pd.DataFrame({'city': ['chicago', 'st louis']}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.OrdinalEncoder(handle_missing='value', handle_unknown='value') + enc.fit(train) + result = enc.transform(test) + + with warnings.catch_warnings(record=True) as w: + enc.inverse_transform(result) + + self.assertEqual(1, len(w)) + self.assertEqual('inverse_transform is not supported because transform impute ' + 'the unknown category -1 when encode city', str(w[0].message)) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='value', handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = 
encoders.OrdinalEncoder(handle_missing='return_nan', handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.OrdinalEncoder(handle_missing='return_nan', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + + with warnings.catch_warnings(record=True) as w: + enc.inverse_transform(result) + + self.assertEqual(1, len(w)) + self.assertEqual('inverse_transform is not supported because transform impute ' + 'the unknown category nan when encode city', str(w[0].message)) + + def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.OrdinalEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveHandleMissingValueAndHandleUnknownReturnNan_ExpectBestInverse(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) + expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(expected, original) diff --git a/category_encoders/tests/test_polynomial.py b/category_encoders/tests/test_polynomial.py index 3690072c..8a1af859 100644 --- a/category_encoders/tests/test_polynomial.py +++ b/category_encoders/tests/test_polynomial.py @@ -1,6 +1,6 @@ 
import pandas as pd from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ - +import numpy as np import category_encoders as encoders from category_encoders.tests.test_utils import deep_round @@ -16,7 +16,7 @@ def test_polynomial_encoder_preserve_dimension_1(self): train = ['A', 'B', 'C'] test = ['A', 'D', 'E'] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -29,7 +29,7 @@ def test_polynomial_encoder_preserve_dimension_2(self): train = ['A', 'B', 'C'] test = ['B', 'D', 'E'] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -42,7 +42,7 @@ def test_polynomial_encoder_preserve_dimension_3(self): train = ['A', 'B', 'C'] test = ['A', 'B', 'C', None] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -56,7 +56,7 @@ def test_polynomial_encoder_preserve_dimension_4(self): train = ['A', 'B', 'C'] test = ['D', 'B', 'C', None] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -69,7 +69,7 @@ def test_polynomial_encoder_preserve_dimension_4(self): def test_polynomial_encoder_2cols(self): train = [['A', 'A'], ['B', 'B'], ['C', 'C']] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) obtained = encoder.transform(train) @@ -86,9 +86,66 @@ def test_polynomial_encoder_2StringCols_ExpectCorrectOrder(self): }, columns=['col1', 'col2', 'col3', 'col4']) expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 
'col4_1'] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) columns = encoder.transform(train).columns.values self.assertItemsEqual(expected_columns, columns) + + def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): + train = ['A', 'B', np.nan] + + encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(deep_round(result.values.tolist()), deep_round(expected)) + + def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding] + self.assertEqual(deep_round(result.values.tolist()), deep_round(expected)) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = ['A', 'B'] + test = ['A', 'B', np.nan] + + encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(deep_round(result.values.tolist()), deep_round(expected)) + + def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + train = ['A', 'B'] + test = ['A', 'B', 'C'] + + encoder = encoders.PolynomialEncoder(handle_unknown='indicator') + encoder.fit(train) + result = encoder.transform(test) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(deep_round(result.values.tolist()), deep_round(expected)) + + def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.PolynomialEncoder(handle_unknown='indicator') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding] + 
self.assertEqual(deep_round(result.values.tolist()), deep_round(expected)) diff --git a/category_encoders/tests/test_sum_coding.py b/category_encoders/tests/test_sum_coding.py index caa7f206..900e5956 100644 --- a/category_encoders/tests/test_sum_coding.py +++ b/category_encoders/tests/test_sum_coding.py @@ -1,6 +1,6 @@ import pandas as pd from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ - +import numpy as np import category_encoders as encoders a_encoding = [1, 1, 0] @@ -14,7 +14,7 @@ def test_sum_encoder_preserve_dimension_1(self): train = ['A', 'B', 'C'] test = ['A', 'D', 'E'] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -27,7 +27,7 @@ def test_sum_encoder_preserve_dimension_2(self): train = ['A', 'B', 'C'] test = ['B', 'D', 'E'] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -40,7 +40,7 @@ def test_sum_encoder_preserve_dimension_3(self): train = ['A', 'B', 'C'] test = ['A', 'B', 'C', None] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -54,7 +54,7 @@ def test_sum_encoder_preserve_dimension_4(self): train = ['A', 'B', 'C'] test = ['D', 'B', 'C', None] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -67,7 +67,7 @@ def test_sum_encoder_preserve_dimension_4(self): def test_sum_encoder_2cols(self): train = [['A', 'A'], ['B', 'B'], ['C', 'C']] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) obtained = encoder.transform(train) @@ -84,9 +84,67 @@ def 
test_sum_encoder_2StringCols_ExpectCorrectOrder(self): }, columns=['col1', 'col2', 'col3', 'col4']) expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) columns = encoder.transform(train).columns.values self.assertItemsEqual(expected_columns, columns) + + def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): + train = ['A', 'B', np.nan] + + encoder = encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = ['A', 'B'] + test = ['A', 'B', np.nan] + + encoder = encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + train = ['A', 'B'] + test = ['A', 'B', 'C'] + + encoder = encoders.SumEncoder(handle_unknown='indicator', handle_missing='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.SumEncoder(handle_unknown='indicator', 
handle_missing='value') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding] + self.assertEqual(result.values.tolist(), expected) + diff --git a/category_encoders/tests/test_target_encoder.py b/category_encoders/tests/test_target_encoder.py index bbd206dc..c6fc5b2e 100644 --- a/category_encoders/tests/test_target_encoder.py +++ b/category_encoders/tests/test_target_encoder.py @@ -34,9 +34,11 @@ def test_target_encoder_fit_HaveConstructorSetSmoothingAndMinSamplesLeaf_ExpectU encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f) encoder.fit(binary_cat_example, binary_cat_example['target']) trend_mapping = encoder.mapping['Trend'] - self.assertAlmostEqual(0.4125, trend_mapping['DOWN'], delta=1e-4) - self.assertEqual(0.5, trend_mapping['FLAT']) - self.assertAlmostEqual(0.5874, trend_mapping['UP'], delta=1e-4) + ordinal_mapping = encoder.ordinal_encoder.category_mapping[0]['mapping'] + + self.assertAlmostEqual(0.4125, trend_mapping[ordinal_mapping.loc['DOWN']], delta=1e-4) + self.assertEqual(0.5, trend_mapping[ordinal_mapping.loc['FLAT']]) + self.assertAlmostEqual(0.5874, trend_mapping[ordinal_mapping.loc['UP']], delta=1e-4) def test_target_encoder_fit_transform_HaveConstructorSetSmoothingAndMinSamplesLeaf_ExpectCorrectValueInResult(self): k = 2 @@ -67,8 +69,49 @@ def test_target_encoder_fit_transform_HaveCategoricalColumn_ExpectCorrectValueIn self.assertAlmostEqual(0.4125, values[2], delta=1e-4) self.assertEqual(0.5, values[3]) + def test_target_encoder_fit_transform_HaveNanValue_ExpectCorrectValueInResult(self): + k = 2 + f = 10 + binary_cat_example = pd.DataFrame( + {'Trend': pd.Series([np.nan, np.nan, 'DOWN', 'FLAT', 'DOWN', np.nan, 'DOWN', 'FLAT', 'FLAT', 'FLAT']), + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f) + result = encoder.fit_transform(binary_cat_example, binary_cat_example['target']) + values = result['Trend'].values + 
self.assertAlmostEqual(0.5874, values[0], delta=1e-4) + self.assertAlmostEqual(0.5874, values[1], delta=1e-4) + self.assertAlmostEqual(0.4125, values[2], delta=1e-4) + self.assertEqual(0.5, values[3]) + def test_target_encoder_noncontiguous_index(self): data = pd.DataFrame({'x': ['a', 'b', np.nan, 'd', 'e'], 'y': range(5)}).dropna() result = encoders.TargetEncoder(cols=['x']).fit_transform(data[['x']], data['y']) self.assertTrue(np.allclose(result, 2.0)) + def test_HandleMissingIsValueAndNanInTest_ExpectMean(self): + df = pd.DataFrame({ + 'color': ["a", "a", "a", "b", "b", "b"], + 'outcome': [1.6, 0, 0, 1, 0, 1]}) + + train = df.drop('outcome', axis=1) + target = df.drop('color', axis=1) + test = pd.Series([np.nan, 'b'], name='color') + test_target = pd.Series([0, 0]) + + enc = encoders.TargetEncoder(cols=['color'], handle_missing='value') + enc.fit(train, target['outcome']) + obtained = enc.transform(test, test_target) + + self.assertEqual(.6, list(obtained['color'])[0]) + + def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self): + train = pd.Series(["a", "a", "a", "b", "b", "b"], name='color') + target = pd.Series([1.6, 0, 0, 1, 0, 1], name='target') + test = pd.Series(['c', 'b'], name='color') + test_target = pd.Series([0, 0]) + + enc = encoders.TargetEncoder(cols=['color'], handle_unknown='value') + enc.fit(train, target) + obtained = enc.transform(test, test_target) + + self.assertEqual(.6, list(obtained['color'])[0]) diff --git a/category_encoders/tests/test_woe.py b/category_encoders/tests/test_woe.py index c5de98ea..5873c0ad 100644 --- a/category_encoders/tests/test_woe.py +++ b/category_encoders/tests/test_woe.py @@ -95,7 +95,7 @@ def test_woe(self): enc.fit(X_balanced, y_missing) # impute missing - enc = encoders.WOEEncoder(impute_missing=False) + enc = encoders.WOEEncoder(handle_missing='return_nan') enc.fit(X, np_y) X1 = enc.transform(X_t) tu.verify_numeric(X1) @@ -108,3 +108,47 @@ def test_woe(self): self.assertTrue(X1.isnull().values.any()) 
self.assertEqual(len(list(X_t)), len(list(X2)), 'The count of attributes must not change') self.assertEqual(len(X_t), len(X2), 'The count of rows must not change') + + def test_HaveArrays_ExpectCalculatedProperly(self): + X = ['a', 'a', 'b', 'b'] + y = [1, 0, 0, 0] + enc = encoders.WOEEncoder() + + result = enc.fit_transform(X, y) + + expected = pd.Series([0.5108256237659906, .5108256237659906, -0.587786664902119, -0.587786664902119], name=0) + pd.testing.assert_series_equal(expected, result[0]) + + def test_HandleMissingValue_HaveMissingInTrain_ExpectEncoded(self): + X = ['a', 'a', np.nan, np.nan] + y = [1, 0, 0, 0] + enc = encoders.WOEEncoder(handle_missing='value') + + result = enc.fit_transform(X, y) + + expected = pd.Series([0.5108256237659906, .5108256237659906, -0.587786664902119, -0.587786664902119], name=0) + pd.testing.assert_series_equal(expected, result[0]) + + def test_HandleMissingValue_HaveMissingInTest_ExpectEncodedWithZero(self): + X = ['a', 'a', 'b', 'b'] + y = [1, 0, 0, 0] + test = ['a', np.nan] + enc = encoders.WOEEncoder(handle_missing='value') + + enc.fit(X, y) + result = enc.transform(test) + + expected = pd.Series([0.5108256237659906, 0], name=0) + pd.testing.assert_series_equal(expected, result[0]) + + def test_HandleUnknownValue_HaveUnknown_ExpectEncodedWithZero(self): + X = ['a', 'a', 'b', 'b'] + y = [1, 0, 0, 0] + test = ['a', 'c'] + enc = encoders.WOEEncoder(handle_unknown='value') + + enc.fit(X, y) + result = enc.transform(test) + + expected = pd.Series([0.5108256237659906, 0], name=0) + pd.testing.assert_series_equal(expected, result[0]) diff --git a/category_encoders/utils.py b/category_encoders/utils.py index 2805ecb0..4fbcd601 100644 --- a/category_encoders/utils.py +++ b/category_encoders/utils.py @@ -46,7 +46,7 @@ def convert_input(X): """ if not isinstance(X, pd.DataFrame): if isinstance(X, list): - X = pd.DataFrame(np.array(X)) + X = pd.DataFrame(X) elif isinstance(X, (np.generic, np.ndarray)): X = pd.DataFrame(X) elif 
isinstance(X, csr_matrix): diff --git a/category_encoders/woe.py b/category_encoders/woe.py index 1d09e1b2..4614a21f 100644 --- a/category_encoders/woe.py +++ b/category_encoders/woe.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin +from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util from sklearn.utils.random import check_random_state @@ -21,10 +22,8 @@ class WOEEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'ignore', 'error' and 'impute', defaults to 'impute', which will assume WOE=0. + options are 'return_nan', 'error' and 'value', defaults to 'value', which will assume WOE=0. randomized: bool, adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). 
sigma: float @@ -73,17 +72,18 @@ class WOEEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='impute', random_state=None, randomized=False, sigma=0.05, regularization=1.0): + def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, regularization=1.0): self.verbose = verbose self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.cols = cols + self.ordinal_encoder = None self._dim = None self.mapping = None - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self._sum = None self._count = None self.random_state = random_state @@ -118,7 +118,7 @@ def fit(self, X, y, **kwargs): if isinstance(y, pd.DataFrame): y = y.iloc[:,0] else: - y = pd.Series(y, name='target') + y = pd.Series(y, name='target', index=X.index) # The lengths must be equal if X.shape[0] != y.shape[0]: @@ -143,8 +143,21 @@ def fit(self, X, y, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + + self.ordinal_encoder = OrdinalEncoder( + verbose=self.verbose, + cols=self.cols, + handle_unknown='value', + handle_missing='value' + ) + self.ordinal_encoder = self.ordinal_encoder.fit(X) + X_ordinal = self.ordinal_encoder.transform(X) + # Training - self.mapping = self._train(X, y, cols=self.cols) + self.mapping = self._train(X_ordinal, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() @@ -183,6 +196,10 @@ def transform(self, X, y=None, override_return_df=False): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be 
encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -198,7 +215,7 @@ def transform(self, X, y=None, override_return_df=False): if isinstance(y, pd.DataFrame): y = y.iloc[:, 0] else: - y = pd.Series(y, name='target') + y = pd.Series(y, name='target', index=X.index) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") @@ -208,6 +225,12 @@ def transform(self, X, y=None, override_return_df=False): # Do not modify the input argument X = X.copy(deep=True) + X = self.ordinal_encoder.transform(X) + + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Unexpected categories found in dataframe') + # Loop over columns and replace nominal values with WOE X = self._score(X, y) @@ -231,15 +254,17 @@ def fit_transform(self, X, y=None, **fit_params): """ return self.fit(X, y, **fit_params).transform(X, y) - def _train(self, X, y, cols=None): + def _train(self, X, y): # Initialize the output mapping = {} # Calculate global statistics - self._sum = y.sum() + self._sum = y.sum() self._count = y.count() - for col in cols: + for switch in self.ordinal_encoder.category_mapping: + col = switch.get('col') + values = switch.get('mapping') # Calculate sum and count of the target for each unique value in the feature col stats = y.groupby(X[col]).agg(['sum', 'count']) # Count of x_{i,+} and x_i @@ -253,6 +278,16 @@ def _train(self, X, y, cols=None): # Ignore unique values. This helps to prevent overfitting on id-like columns. 
woe[stats['count'] == 1] = 0 + if self.handle_unknown == 'return_nan': + woe.loc[-1] = np.nan + elif self.handle_unknown == 'value': + woe.loc[-1] = 0 + + if self.handle_missing == 'return_nan': + woe.loc[values.loc[np.nan]] = np.nan + elif self.handle_missing == 'value': + woe.loc[-2] = 0 + # Store WOE for transform() function mapping[col] = woe @@ -263,14 +298,6 @@ def _score(self, X, y): # Score the column X[col] = X[col].map(self.mapping[col]) - # Replace missing values only in the computed columns - if self.impute_missing: - if self.handle_unknown == 'impute': - X[col].fillna(0, inplace=True) - elif self.handle_unknown == 'error': - if X[col].isnull().any(): - raise ValueError('Unexpected categories found in column %s' % col) - # Randomization is meaningful only for training data -> we do it only if y is present if self.randomized and y is not None: random_state_generator = check_random_state(self.random_state) diff --git a/requirements.txt b/requirements.txt index 28f4c0c1..89e38a18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,5 @@ numpy>=1.11.1 scikit-learn>=0.17.1 scipy>=0.17.0 statsmodels>=0.6.1 -pandas>=0.20.1 +pandas>=0.21.1 patsy>=0.4.1 \ No newline at end of file diff --git a/setup.py b/setup.py index 960b707b..b32cbd29 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ 'scikit-learn>=0.17.1', 'scipy>=0.17.0', 'statsmodels>=0.6.1', - 'pandas>=0.20.1', + 'pandas>=0.21.1', 'patsy>=0.4.1', ], author_email='will@pedalwrencher.com'