Skip to content

Commit

Permalink
update column_names for math_features
Browse files Browse the repository at this point in the history
  • Loading branch information
Eden Wu committed Aug 26, 2024
1 parent 059d71d commit f1af9ff
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 14 deletions.
44 changes: 32 additions & 12 deletions alpha_automl/builtin_primitives/math_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,49 +4,69 @@
from feature_engine.creation import MathFeatures

class MathFeaturesSum(BasePrimitive):
    """Primitive that applies feature_engine's ``MathFeatures`` with ``func='sum'``.

    The wrapped transformer selects its variables by column name, so inputs
    that are not DataFrames are first wrapped in a DataFrame labelled with
    ``column_names``.
    """

    def __init__(self, numeric_columns, column_names):
        """Configure the wrapped ``MathFeatures`` transformer.

        Args:
            numeric_columns: Names of the columns handed to ``MathFeatures``
                as ``variables``.
            column_names: Column labels used when a non-DataFrame input has
                to be wrapped in a DataFrame.
        """
        # Parameters are stored unmodified under their own names.
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='sum')

    def _as_frame(self, X):
        """Return ``X`` unchanged if it is a DataFrame, else wrap it using ``column_names``."""
        if isinstance(X, pd.DataFrame):
            return X
        return pd.DataFrame(X, columns=self.column_names)

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X`` and return ``self``.

        ``y`` is accepted but not used.
        """
        self.math_features.fit(self._as_frame(X))
        return self

    def transform(self, X):
        """Return the result of the wrapped transformer's ``transform`` on ``X``."""
        return self.math_features.transform(self._as_frame(X))

class MathFeaturesMean(BasePrimitive):
    """Primitive that applies feature_engine's ``MathFeatures`` with ``func='mean'``.

    The wrapped transformer selects its variables by column name, so inputs
    that are not DataFrames are first wrapped in a DataFrame labelled with
    ``column_names``.
    """

    def __init__(self, numeric_columns, column_names):
        """Configure the wrapped ``MathFeatures`` transformer.

        Args:
            numeric_columns: Names of the columns handed to ``MathFeatures``
                as ``variables``.
            column_names: Column labels used when a non-DataFrame input has
                to be wrapped in a DataFrame.
        """
        # Parameters are stored unmodified under their own names.
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='mean')

    def _as_frame(self, X):
        """Return ``X`` unchanged if it is a DataFrame, else wrap it using ``column_names``."""
        if isinstance(X, pd.DataFrame):
            return X
        return pd.DataFrame(X, columns=self.column_names)

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X`` and return ``self``.

        ``y`` is accepted but not used.
        """
        self.math_features.fit(self._as_frame(X))
        return self

    def transform(self, X):
        """Return the result of the wrapped transformer's ``transform`` on ``X``."""
        return self.math_features.transform(self._as_frame(X))

class MathFeaturesStd(BasePrimitive):
    """Primitive that applies feature_engine's ``MathFeatures`` with ``func='std'``.

    The wrapped transformer selects its variables by column name, so inputs
    that are not DataFrames are first wrapped in a DataFrame labelled with
    ``column_names``.
    """

    def __init__(self, numeric_columns, column_names):
        """Configure the wrapped ``MathFeatures`` transformer.

        Args:
            numeric_columns: Names of the columns handed to ``MathFeatures``
                as ``variables``.
            column_names: Column labels used when a non-DataFrame input has
                to be wrapped in a DataFrame.
        """
        # Parameters are stored unmodified under their own names.
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='std')

    def _as_frame(self, X):
        """Return ``X`` unchanged if it is a DataFrame, else wrap it using ``column_names``."""
        if isinstance(X, pd.DataFrame):
            return X
        return pd.DataFrame(X, columns=self.column_names)

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X`` and return ``self``.

        ``y`` is accepted but not used.
        """
        self.math_features.fit(self._as_frame(X))
        return self

    def transform(self, X):
        """Return the result of the wrapped transformer's ``transform`` on ``X``."""
        return self.math_features.transform(self._as_frame(X))

class MathFeaturesProd(BasePrimitive):
    """Primitive that applies feature_engine's ``MathFeatures`` with ``func='prod'``.

    The wrapped transformer selects its variables by column name, so inputs
    that are not DataFrames are first wrapped in a DataFrame labelled with
    ``column_names``.
    """

    def __init__(self, numeric_columns, column_names):
        """Configure the wrapped ``MathFeatures`` transformer.

        Args:
            numeric_columns: Names of the columns handed to ``MathFeatures``
                as ``variables``.
            column_names: Column labels used when a non-DataFrame input has
                to be wrapped in a DataFrame.
        """
        # Parameters are stored unmodified under their own names.
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='prod')

    def _as_frame(self, X):
        """Return ``X`` unchanged if it is a DataFrame, else wrap it using ``column_names``."""
        if isinstance(X, pd.DataFrame):
            return X
        return pd.DataFrame(X, columns=self.column_names)

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X`` and return ``self``.

        ``y`` is accepted but not used.
        """
        self.math_features.fit(self._as_frame(X))
        return self

    def transform(self, X):
        """Return the result of the wrapped transformer's ``transform`` on ``X``."""
        return self.math_features.transform(self._as_frame(X))
4 changes: 3 additions & 1 deletion alpha_automl/data_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@


def profile_data(X):
metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'categorical_columns': []}
metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'categorical_columns': [], 'column_names': []}
mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER', DATETIME_COLUMN: 'DATETIME_ENCODER',
TEXT_COLUMN: 'TEXT_ENCODER', IMAGE_COLUMN: 'IMAGE_ENCODER'}

Expand Down Expand Up @@ -46,6 +46,8 @@ def profile_data(X):
metadata['numeric_columns'] = [(index_column, column_name) for index_column, column_name in enumerate(X.columns) if X[column_name].dtype in ['int64', 'float64']]
metadata['categorical_columns'] = [(index_column, column_name) for index_column, column_name in enumerate(X.columns) if X[column_name].dtype in ['object', 'category']]

metadata['column_names'] = list(X.columns)

logger.debug(f'Results of profiling data: non-numeric features = {str(metadata["nonnumeric_columns"].keys())}, '
f'useless columns = {str(metadata["useless_columns"])}, '
f'missing values = {str(metadata["missing_values"])}')
Expand Down
3 changes: 2 additions & 1 deletion alpha_automl/pipeline_synthesis/pipeline_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def make_primitive_objects(self, primitives):
nonnumeric_columns = self.metadata['nonnumeric_columns']
useless_columns = self.metadata['useless_columns']
numeric_columns = self.metadata['numeric_columns']
column_names = self.metadata['column_names']

if len(useless_columns) > 0 and len(nonnumeric_columns) == 0: # Add the transformer to the first step
selector = (COLUMN_SELECTOR_ID, 'drop', [col_index for col_index, _ in useless_columns])
Expand All @@ -108,7 +109,7 @@ def make_primitive_objects(self, primitives):
estimators = extract_estimators(pipeline_primitives, self.all_primitives)
primitive_object = create_object(primitive_name, {'estimators': estimators})
elif "alpha_automl.builtin_primitives.math_features" in primitive_name:
primitive_object = create_object(primitive_name, {'columns': [column_name for _, column_name in numeric_columns]})
primitive_object = create_object(primitive_name, {'numeric_columns': [column_name for _, column_name in numeric_columns], 'column_names': column_names})
elif self.all_primitives[primitive_name]['origin'] == NATIVE_PRIMITIVE: # It's an installed primitive
primitive_object = create_object(primitive_name, EXTRA_PARAMS.get(primitive_name, None))
else:
Expand Down

0 comments on commit f1af9ff

Please sign in to comment.