Commit 048656e

[ADD] feature preprocessors from autosklearn (#378)
* in progress
* add remaining preprocessors
* fix flake and mypy after rebase
* Fix tests and add documentation
* fix tests bug
* fix bug in tests
* fix bug where search space updates were not honoured
* handle check for score func in feature preprocessors
* address comments from shuhei
* apply suggestions from code review
* add documentation for feature preprocessors with percent to int value range
* fix tests
* fix tests
* address comments from shuhei
* fix tests which fail due to scaler
1 parent 1b8e76a commit 048656e

27 files changed: +1742 −115 lines

autoPyTorch/datasets/tabular_dataset.py

+3-1
@@ -89,6 +89,7 @@ def __init__(self,
                          seed=seed, train_transforms=train_transforms,
                          dataset_name=dataset_name,
                          val_transforms=val_transforms)
+        self.issigned = bool(np.any((X.data if self.issparse else X) < 0))
         if self.output_type is not None:
             if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS:
                 self.task_type = TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION]
@@ -127,6 +128,7 @@ def get_required_dataset_info(self) -> Dict[str, BaseDatasetPropertiesType]:
         info.update({
             'numerical_columns': self.numerical_columns,
             'categorical_columns': self.categorical_columns,
-            'task_type': self.task_type
+            'task_type': self.task_type,
+            'issigned': self.issigned
         })
         return info
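
A minimal sketch (not part of the commit) of what the new `issigned` flag computes: whether any entry of the training data is negative, reading only the stored values through `.data` in the sparse case. The toy matrices here are illustrative assumptions:

import numpy as np
from scipy import sparse

X_dense = np.array([[1.0, -2.0], [3.0, 4.0]])
X_sparse = sparse.csr_matrix(X_dense)  # same values, sparse storage

issparse = sparse.issparse(X_sparse)
# Mirrors the added line above: check the stored entries for sparse input,
# the full array for dense input.
issigned = bool(np.any((X_sparse.data if issparse else X_sparse) < 0))
print(issigned)  # True, because -2.0 is present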

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py

+7-1
@@ -78,7 +78,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
         else:
             X_train = X['backend'].load_datamanager().train_tensors[0]
 
-        self.preprocessor.fit(X_train)
+        if 'y_train' in X:
+            y_train = subsampler(X['y_train'], X['train_indices'])
+        else:
+            y_train = X['backend'].load_datamanager().train_tensors[1]
+
+        self.preprocessor.fit(X_train, y=y_train)
+
         return self
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
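
The diff above starts passing the targets to the column transformer because the supervised feature preprocessors added in this commit (SelectFromModel wrapped around tree ensembles) cannot be fitted without y. A hedged, sklearn-only sketch of that requirement; the synthetic data and column indices are assumptions for illustration:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

rng = np.random.RandomState(0)
X_train = rng.rand(50, 4)
y_train = rng.randint(0, 2, size=50)

preprocessor = ColumnTransformer(
    [('numerical', SelectFromModel(ExtraTreesClassifier(n_estimators=10)), [0, 1, 2, 3])]
)
# SelectFromModel needs y at fit time; ColumnTransformer forwards it to its
# transformers, which is why fit() above now receives y=y_train.
preprocessor.fit(X_train, y=y_train)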

New file: ExtraTreesPreprocessorClassification
@@ -0,0 +1,172 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformFloatHyperparameter,
    UniformIntegerHyperparameter,
)

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
    utils import NoneType_
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, check_none


CRITERION_CHOICES = ("gini", "entropy")


class ExtraTreesPreprocessorClassification(autoPyTorchFeaturePreprocessingComponent):
    """
    Select features based on importance weights calculated using extra trees
    """
    def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
                 criterion: str = "gini", max_features: float = 0.5,
                 max_depth: Union[int, NoneType_] = 5, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, min_weight_fraction_leaf: float = 0,
                 max_leaf_nodes: Union[int, NoneType_] = "none",
                 min_impurity_decrease: float = 0, oob_score: bool = False,
                 verbose: int = 0,
                 random_state: Optional[np.random.RandomState] = None):
        self.bootstrap = bootstrap
        self.n_estimators = n_estimators
        if criterion not in CRITERION_CHOICES:
            raise ValueError(f"`criterion` of {self.__class__.__name__} "
                             f"must be in {CRITERION_CHOICES}, but got: {criterion}")
        self.criterion = criterion
        self.max_features = max_features
        self.min_impurity_decrease = min_impurity_decrease
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.oob_score = oob_score
        self.verbose = verbose

        super().__init__(random_state=random_state)

    def get_components_kwargs(self) -> Dict[str, Any]:
        """
        returns keyword arguments required by the feature preprocessor

        Returns:
            Dict[str, Any]: kwargs
        """
        return dict(
            bootstrap=self.bootstrap,
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_features=self.max_features,
            min_impurity_decrease=self.min_impurity_decrease,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score,
            verbose=self.verbose,
            random_state=self.random_state,
        )

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

        if check_none(self.max_leaf_nodes):
            self.max_leaf_nodes = None
        elif isinstance(self.max_leaf_nodes, int):
            self.max_leaf_nodes = int(self.max_leaf_nodes)
        else:
            raise ValueError(f"Expected `max_leaf_nodes` to be either "
                             f"in ('None', 'none', None) or an integer, got {self.max_leaf_nodes}")

        if check_none(self.max_depth):
            self.max_depth = None
        elif isinstance(self.max_depth, int):
            self.max_depth = int(self.max_depth)
        else:
            raise ValueError(f"Expected `max_depth` to be either "
                             f"in ('None', 'none', None) or an integer, got {self.max_depth}")

        # TODO: add class_weights
        estimator = ExtraTreesClassifier(**self.get_components_kwargs())

        self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
                                                         threshold='mean',
                                                         prefit=False)
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap',
                                                                         value_range=(True, False),
                                                                         default_value=True,
                                                                         ),
        n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators',
                                                                            value_range=(10, 100),
                                                                            default_value=10,
                                                                            ),
        max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth',
                                                                         value_range=("none",),
                                                                         default_value="none",
                                                                         ),
        max_features: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_features',
                                                                            value_range=(0, 1),
                                                                            default_value=0.5,
                                                                            ),
        min_impurity_decrease: HyperparameterSearchSpace = HyperparameterSearchSpace(
            hyperparameter='min_impurity_decrease',
            value_range=(0,),
            default_value=0),
        criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion',
                                                                         value_range=CRITERION_CHOICES,
                                                                         default_value="gini",
                                                                         ),
        min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split',
                                                                                 value_range=(2, 20),
                                                                                 default_value=2,
                                                                                 ),
        min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf',
                                                                                value_range=(1, 20),
                                                                                default_value=1,
                                                                                ),
        min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(
            hyperparameter='min_weight_fraction_leaf',
            value_range=(0,),
            default_value=0),
        max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes',
                                                                              value_range=("none",),
                                                                              default_value="none",
                                                                              ),
    ) -> ConfigurationSpace:

        cs = ConfigurationSpace()
        add_hyperparameter(cs, bootstrap, CategoricalHyperparameter)
        add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter)
        add_hyperparameter(cs, max_features, UniformFloatHyperparameter)
        add_hyperparameter(cs, min_impurity_decrease, UniformFloatHyperparameter)
        add_hyperparameter(cs, criterion, CategoricalHyperparameter)
        add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter)
        add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]:
        return {'shortname': 'ETC',
                'name': 'Extra Trees Classifier Preprocessing',
                'handles_sparse': True,
                'handles_regression': False,
                'handles_classification': True
                }
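
A hedged, sklearn-only sketch of the selector this component places under preprocessor['numerical']: features whose importance falls below the mean importance of a fitted ExtraTreesClassifier are dropped. The toy data is an assumption for illustration:

import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

rng = np.random.RandomState(0)
X = rng.rand(200, 8)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)  # only the first two columns are informative

selector = SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=10, random_state=0),
                           threshold='mean',   # keep features at or above mean importance
                           prefit=False)       # the estimator is fitted inside fit()
X_reduced = selector.fit_transform(X, y)
print(X_reduced.shape)  # e.g. (200, 2): columns below mean importance are removed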

New file: ExtraTreesPreprocessorRegression
@@ -0,0 +1,175 @@
from typing import Any, Dict, List, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformFloatHyperparameter,
    UniformIntegerHyperparameter,
)

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
    utils import NoneType_
from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, check_none


CRITERION_CHOICES = ('mse', 'friedman_mse', 'mae')


class ExtraTreesPreprocessorRegression(autoPyTorchFeaturePreprocessingComponent):
    """
    Selects features based on importance weights using extra trees
    """
    def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
                 criterion: str = "mse", max_features: float = 1,
                 max_depth: Union[int, NoneType_] = 5, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, min_weight_fraction_leaf: float = 0,
                 max_leaf_nodes: Union[int, NoneType_] = "none",
                 oob_score: bool = False, verbose: int = 0,
                 random_state: Optional[np.random.RandomState] = None):
        self.bootstrap = bootstrap
        self.n_estimators = n_estimators
        if criterion not in CRITERION_CHOICES:
            raise ValueError(f"`criterion` of {self.__class__.__name__} "
                             f"must be in {CRITERION_CHOICES}, but got: {criterion}")
        self.criterion = criterion
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.oob_score = oob_score
        self.verbose = verbose

        super().__init__(random_state=random_state)

        self.add_fit_requirements([
            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])

    def get_components_kwargs(self) -> Dict[str, Any]:
        """
        returns keyword arguments required by the feature preprocessor

        Returns:
            Dict[str, Any]: kwargs
        """
        return dict(
            bootstrap=self.bootstrap,
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_features=self.max_features,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score,
            verbose=self.verbose,
            random_state=self.random_state,
        )

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

        self.check_requirements(X, y)

        if check_none(self.max_leaf_nodes):
            self.max_leaf_nodes = None
        elif isinstance(self.max_leaf_nodes, int):
            self.max_leaf_nodes = int(self.max_leaf_nodes)
        else:
            raise ValueError(f"Expected `max_leaf_nodes` to be either "
                             f"in ('None', 'none', None) or an integer, got {self.max_leaf_nodes}")

        if check_none(self.max_depth):
            self.max_depth = None
        elif isinstance(self.max_depth, int):
            self.max_depth = int(self.max_depth)
        else:
            raise ValueError(f"Expected `max_depth` to be either "
                             f"in ('None', 'none', None) or an integer, got {self.max_depth}")

        num_features = len(X['dataset_properties']['numerical_columns'])
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(num_features / 2), max_features))

        # TODO: add class_weights
        estimator = ExtraTreesRegressor(**self.get_components_kwargs())

        self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
                                                         threshold='mean',
                                                         prefit=False)
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap',
                                                                         value_range=(True, False),
                                                                         default_value=True,
                                                                         ),
        n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators',
                                                                            value_range=(100,),
                                                                            default_value=100,
                                                                            ),
        max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth',
                                                                         value_range=("none",),
                                                                         default_value="none",
                                                                         ),
        max_features: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_features',
                                                                            value_range=(0.1, 1),
                                                                            default_value=1,
                                                                            ),
        criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion',
                                                                         value_range=CRITERION_CHOICES,
                                                                         default_value="mse",
                                                                         ),
        min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split',
                                                                                 value_range=(2, 20),
                                                                                 default_value=2,
                                                                                 ),
        min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf',
                                                                                value_range=(1, 20),
                                                                                default_value=1,
                                                                                ),
        min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(
            hyperparameter='min_weight_fraction_leaf',
            value_range=(0,),
            default_value=0),
        max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes',
                                                                              value_range=("none",),
                                                                              default_value="none",
                                                                              ),
    ) -> ConfigurationSpace:

        cs = ConfigurationSpace()
        add_hyperparameter(cs, bootstrap, CategoricalHyperparameter)
        add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter)
        add_hyperparameter(cs, max_features, UniformFloatHyperparameter)
        add_hyperparameter(cs, criterion, CategoricalHyperparameter)
        add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter)
        add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]:
        return {'shortname': 'ETR',
                'name': 'Extra Trees Regressor Preprocessing',
                'handles_sparse': True,
                'handles_regression': True,
                'handles_classification': False
                }
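
A worked sketch of the max_features clamp in the regression fit() above, assuming 20 numerical columns and the default fraction of 1.0 (both numbers are illustrative):

import numpy as np

num_features = 20
max_features_fraction = 1.0  # corresponds to self.max_features
max_features = int(max_features_fraction * (np.log(num_features) + 1))
# np.log(20) + 1 ≈ 3.996, so int(...) == 3
max_features = max(1, min(int(num_features / 2), max_features))
# capped at half the features (10) and floored at 1, so it stays 3
print(max_features)  # 3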
