
Commit 2601421

[ADD] scalers from autosklearn (#372)

* Add new scalers
* fix flake and mypy
* Apply suggestions from code review
* add robust scaler
* fix documentation
* remove power transformer from feature preprocessing
* fix tests
* check for default in include and exclude

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
1 parent 466bc18 · commit 2601421

File tree: 8 files changed (+363, −52 lines)


autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py (−49)

This file was deleted.

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py (−1)

@@ -72,7 +72,6 @@ def get_hyperparameter_search_space(self,
                 'RandomKitchenSinks',
                 'Nystroem',
                 'PolynomialFeatures',
-                'PowerTransformer',
                 'TruncatedSVD',
             ]
             for default_ in defaults:
autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py (+38, new file)

from typing import Any, Dict, Optional, Union

import numpy as np

from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler


class PowerTransformer(BaseScaler):
    """
    Map data to as close to a Gaussian distribution as possible
    in order to reduce variance and minimize skewness.

    Uses `yeo-johnson` power transform method. Also, data is normalised
    to zero mean and unit variance.
    """
    def __init__(self,
                 random_state: Optional[np.random.RandomState] = None):
        super().__init__()
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnPowerTransformer(method='yeo-johnson', copy=False)
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'PowerTransformer',
            'name': 'PowerTransformer',
            'handles_sparse': False
        }
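For orientation, here is a minimal standalone sketch of the scikit-learn transformer this component wraps; the synthetic skewed data is hypothetical and not part of the commit. With sklearn's default standardize=True, the yeo-johnson output is centred to zero mean and unit variance, matching the docstring above.

import numpy as np
from sklearn.preprocessing import PowerTransformer

# Hypothetical skewed input: exponential draws are strongly right-skewed.
rng = np.random.RandomState(11)
X_num = rng.exponential(scale=2.0, size=(100, 3))

# Same configuration the component uses: yeo-johnson (handles zero and
# negative values, unlike box-cox) plus the default standardization.
pt = PowerTransformer(method='yeo-johnson')
X_t = pt.fit_transform(X_num)
print(X_t.mean(axis=0).round(2))  # approximately [0. 0. 0.]
print(X_t.std(axis=0).round(2))   # approximately [1. 1. 1.]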
autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py (+73, new file)

from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformIntegerHyperparameter
)

import numpy as np

from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


class QuantileTransformer(BaseScaler):
    """
    Transform the features to follow a uniform or a normal distribution
    using quantiles information.

    For more details of each attribute, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
    """
    def __init__(
        self,
        n_quantiles: int = 1000,
        output_distribution: str = "normal",  # Literal["normal", "uniform"]
        random_state: Optional[np.random.RandomState] = None
    ):
        super().__init__()
        self.random_state = random_state
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles,
                                                                    output_distribution=self.output_distribution,
                                                                    copy=False)
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        n_quantiles: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="n_quantiles",
                                                                           value_range=(10, 2000),
                                                                           default_value=1000,
                                                                           ),
        output_distribution: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="output_distribution",
                                                                                   value_range=("uniform", "normal"),
                                                                                   default_value="normal",
                                                                                   )
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        # TODO parametrize like the Random Forest as n_quantiles = n_features^param
        add_hyperparameter(cs, n_quantiles, UniformIntegerHyperparameter)
        add_hyperparameter(cs, output_distribution, CategoricalHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'QuantileTransformer',
            'name': 'QuantileTransformer',
            'handles_sparse': False
        }
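As a rough illustration of what this search space amounts to, the sketch below rebuilds the equivalent space with plain ConfigSpace primitives and samples from it. It assumes the classic ConfigSpace API (add_hyperparameter, sample_configuration) rather than autoPyTorch's HyperparameterSearchSpace helper, so treat it as a sketch, not the repository's code.

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformIntegerHyperparameter,
)

# Equivalent space built directly, without the add_hyperparameter helper.
cs = ConfigurationSpace(seed=11)
cs.add_hyperparameter(UniformIntegerHyperparameter(
    'n_quantiles', lower=10, upper=2000, default_value=1000))
cs.add_hyperparameter(CategoricalHyperparameter(
    'output_distribution', choices=['uniform', 'normal'], default_value='normal'))

# Each sample is one candidate QuantileTransformer configuration.
print(cs.sample_configuration())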
autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py (+73, new file)

from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformFloatHyperparameter,
)

import numpy as np

from sklearn.preprocessing import RobustScaler as SklearnRobustScaler

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter


class RobustScaler(BaseScaler):
    """
    Remove the median and scale features according to the quantile_range to make
    the features robust to outliers.

    For more details of the preprocessor, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
    """
    def __init__(
        self,
        q_min: float = 0.25,
        q_max: float = 0.75,
        random_state: Optional[np.random.RandomState] = None
    ):
        super().__init__()
        self.add_fit_requirements([
            FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)])
        self.random_state = random_state
        self.q_min = q_min
        self.q_max = q_max

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)
        with_centering = bool(not X['dataset_properties']['issparse'])

        self.preprocessor['numerical'] = SklearnRobustScaler(quantile_range=(self.q_min, self.q_max),
                                                             with_centering=with_centering,
                                                             copy=False)

        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        q_min: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_min",
                                                                     value_range=(0.001, 0.3),
                                                                     default_value=0.25),
        q_max: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_max",
                                                                     value_range=(0.7, 0.999),
                                                                     default_value=0.75)
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        add_hyperparameter(cs, q_min, UniformFloatHyperparameter)
        add_hyperparameter(cs, q_max, UniformFloatHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'RobustScaler',
            'name': 'RobustScaler',
            'handles_sparse': True
        }
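A small sketch of the sparse/dense distinction the fit method encodes, using hypothetical data. One caveat worth noting: scikit-learn's quantile_range is expressed in percentiles on a 0-100 scale (default (25.0, 75.0)), so the (q_min, q_max) values around (0.25, 0.75) above select a far narrower range than the usual interquartile range.

import numpy as np
from scipy import sparse
from sklearn.preprocessing import RobustScaler

# Hypothetical data with an outlier in the second column.
X_dense = np.array([[1.0, 200.0],
                    [2.0, 300.0],
                    [3.0, 10000.0]])
X_sparse = sparse.csr_matrix(X_dense)

# Dense input: centering on the per-column median is allowed.
print(RobustScaler(quantile_range=(25.0, 75.0),
                   with_centering=True).fit_transform(X_dense))

# Sparse input: centering would densify the matrix (sklearn raises a
# ValueError), which is why the component sets with_centering = not issparse.
print(RobustScaler(quantile_range=(25.0, 75.0),
                   with_centering=False).fit_transform(X_sparse).toarray())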

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py (+13, −1)

@@ -66,9 +66,21 @@ def get_hyperparameter_search_space(self,
             raise ValueError("no scalers found, please add a scaler")

         if default is None:
-            defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler']
+            defaults = [
+                'StandardScaler',
+                'Normalizer',
+                'MinMaxScaler',
+                'PowerTransformer',
+                'QuantileTransformer',
+                'RobustScaler',
+                'NoScaler'
+            ]
             for default_ in defaults:
                 if default_ in available_scalers:
+                    if include is not None and default_ not in include:
+                        continue
+                    if exclude is not None and default_ in exclude:
+                        continue
                     default = default_
                     break
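The default-selection logic added here is easy to see in isolation. The helper below is a hypothetical distillation (pick_default is not a function in the repository): it walks the preference order and skips any candidate ruled out by include/exclude, which is the behaviour the "check for default in include and exclude" commit bullet refers to.

from typing import List, Optional

def pick_default(available: List[str],
                 include: Optional[List[str]] = None,
                 exclude: Optional[List[str]] = None) -> Optional[str]:
    # Preference order matches the patched defaults list above.
    defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler',
                'PowerTransformer', 'QuantileTransformer', 'RobustScaler',
                'NoScaler']
    for default_ in defaults:
        if default_ not in available:
            continue
        # New in this commit: a default must also survive the user's
        # include/exclude constraints, not merely be available.
        if include is not None and default_ not in include:
            continue
        if exclude is not None and default_ in exclude:
            continue
        return default_
    return None

# StandardScaler is available but excluded, so the next candidate wins.
print(pick_default(['StandardScaler', 'Normalizer', 'NoScaler'],
                   exclude=['StandardScaler']))  # -> 'Normalizer'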

test/test_pipeline/components/preprocessing/test_feature_preprocessor.py (+1, −1)

@@ -20,7 +20,7 @@ def random_state():
     return 11


-@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures', 'PowerTransformer',
+@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures',
                         'Nystroem', 'KernelPCA', 'RandomKitchenSinks'])
 def preprocessor(request):
     return request.param
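For readers unfamiliar with the pattern, here is a minimal sketch of how such a parametrized fixture drives tests (the test body is hypothetical): pytest re-runs every dependent test once per param, so dropping 'PowerTransformer' from the list simply removes it from the feature-preprocessing test matrix now that it lives under scaling.

import pytest

@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures'])
def preprocessor(request):
    # request.param is the current entry of the params list.
    return request.param

def test_preprocessor_is_named(preprocessor):
    # Runs once per fixture param: two invocations for this sketch.
    assert isinstance(preprocessor, str)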
