
Commit 2601421

[ADD] scalers from autosklearn (#372)

* Add new scalers
* fix flake and mypy
* Apply suggestions from code review
* add robust scaler
* fix documentation
* remove power transformer from feature preprocessing
* fix tests
* check for default in include and exclude

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
1 parent 466bc18 · commit 2601421

File tree: 8 files changed (+363, −52 lines)


autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py (−49)

This file was deleted.

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py (−1)

@@ -72,7 +72,6 @@ def get_hyperparameter_search_space(self,
                 'RandomKitchenSinks',
                 'Nystroem',
                 'PolynomialFeatures',
-                'PowerTransformer',
                 'TruncatedSVD',
             ]
             for default_ in defaults:
autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py (+38, new file)

from typing import Any, Dict, Optional, Union

import numpy as np

from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler


class PowerTransformer(BaseScaler):
    """
    Map data to as close to a Gaussian distribution as possible
    in order to reduce variance and minimize skewness.

    Uses `yeo-johnson` power transform method. Also, data is normalised
    to zero mean and unit variance.
    """
    def __init__(self,
                 random_state: Optional[np.random.RandomState] = None):
        super().__init__()
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnPowerTransformer(method='yeo-johnson', copy=False)
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'PowerTransformer',
            'name': 'PowerTransformer',
            'handles_sparse': False
        }
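For orientation, here is a minimal standalone sketch of the scikit-learn transformer this component wraps; the synthetic skewed data is hypothetical and not part of the commit. With sklearn's default standardize=True, the yeo-johnson output is centred to zero mean and unit variance, matching the docstring above.

import numpy as np
from sklearn.preprocessing import PowerTransformer

# Hypothetical skewed input: exponential draws are strongly right-skewed.
rng = np.random.RandomState(11)
X_num = rng.exponential(scale=2.0, size=(100, 3))

# Same configuration the component uses: yeo-johnson (handles zero and
# negative values, unlike box-cox) plus the default standardization.
pt = PowerTransformer(method='yeo-johnson')
X_t = pt.fit_transform(X_num)
print(X_t.mean(axis=0).round(2))  # approximately [0. 0. 0.]
print(X_t.std(axis=0).round(2))   # approximately [1. 1. 1.]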
autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py (+73, new file)

from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformIntegerHyperparameter
)

import numpy as np

from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


class QuantileTransformer(BaseScaler):
    """
    Transform the features to follow a uniform or a normal distribution
    using quantiles information.

    For more details of each attribute, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
    """
    def __init__(
        self,
        n_quantiles: int = 1000,
        output_distribution: str = "normal",  # Literal["normal", "uniform"]
        random_state: Optional[np.random.RandomState] = None
    ):
        super().__init__()
        self.random_state = random_state
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles,
                                                                    output_distribution=self.output_distribution,
                                                                    copy=False)
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        n_quantiles: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="n_quantiles",
                                                                           value_range=(10, 2000),
                                                                           default_value=1000,
                                                                           ),
        output_distribution: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="output_distribution",
                                                                                   value_range=("uniform", "normal"),
                                                                                   default_value="normal",
                                                                                   )
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        # TODO parametrize like the Random Forest as n_quantiles = n_features^param
        add_hyperparameter(cs, n_quantiles, UniformIntegerHyperparameter)
        add_hyperparameter(cs, output_distribution, CategoricalHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'QuantileTransformer',
            'name': 'QuantileTransformer',
            'handles_sparse': False
        }
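As a rough illustration of what this search space amounts to, the sketch below rebuilds the equivalent space with plain ConfigSpace primitives and samples from it. It assumes the classic ConfigSpace API (add_hyperparameter, sample_configuration) rather than autoPyTorch's HyperparameterSearchSpace helper, so treat it as a sketch, not the repository's code.

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformIntegerHyperparameter,
)

# Equivalent space built directly, without the add_hyperparameter helper.
cs = ConfigurationSpace(seed=11)
cs.add_hyperparameter(UniformIntegerHyperparameter(
    'n_quantiles', lower=10, upper=2000, default_value=1000))
cs.add_hyperparameter(CategoricalHyperparameter(
    'output_distribution', choices=['uniform', 'normal'], default_value='normal'))

# Each sample is one candidate QuantileTransformer configuration.
print(cs.sample_configuration())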
autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py (+73, new file)

from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformFloatHyperparameter,
)

import numpy as np

from sklearn.preprocessing import RobustScaler as SklearnRobustScaler

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter


class RobustScaler(BaseScaler):
    """
    Remove the median and scale features according to the quantile_range to make
    the features robust to outliers.

    For more details of the preprocessor, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
    """
    def __init__(
        self,
        q_min: float = 0.25,
        q_max: float = 0.75,
        random_state: Optional[np.random.RandomState] = None
    ):
        super().__init__()
        self.add_fit_requirements([
            FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)])
        self.random_state = random_state
        self.q_min = q_min
        self.q_max = q_max

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)
        with_centering = bool(not X['dataset_properties']['issparse'])

        self.preprocessor['numerical'] = SklearnRobustScaler(quantile_range=(self.q_min, self.q_max),
                                                             with_centering=with_centering,
                                                             copy=False)

        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        q_min: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_min",
                                                                     value_range=(0.001, 0.3),
                                                                     default_value=0.25),
        q_max: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_max",
                                                                     value_range=(0.7, 0.999),
                                                                     default_value=0.75)
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        add_hyperparameter(cs, q_min, UniformFloatHyperparameter)
        add_hyperparameter(cs, q_max, UniformFloatHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'RobustScaler',
            'name': 'RobustScaler',
            'handles_sparse': True
        }
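A small sketch of the sparse/dense distinction the fit method encodes, using hypothetical data. One caveat worth noting: scikit-learn's quantile_range is expressed in percentiles on a 0-100 scale (default (25.0, 75.0)), so the (q_min, q_max) values around (0.25, 0.75) above select a far narrower range than the usual interquartile range.

import numpy as np
from scipy import sparse
from sklearn.preprocessing import RobustScaler

# Hypothetical data with an outlier in the second column.
X_dense = np.array([[1.0, 200.0],
                    [2.0, 300.0],
                    [3.0, 10000.0]])
X_sparse = sparse.csr_matrix(X_dense)

# Dense input: centering on the per-column median is allowed.
print(RobustScaler(quantile_range=(25.0, 75.0),
                   with_centering=True).fit_transform(X_dense))

# Sparse input: centering would densify the matrix (sklearn raises a
# ValueError), which is why the component sets with_centering = not issparse.
print(RobustScaler(quantile_range=(25.0, 75.0),
                   with_centering=False).fit_transform(X_sparse).toarray())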

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py (+13, −1)

@@ -66,9 +66,21 @@ def get_hyperparameter_search_space(self,
             raise ValueError("no scalers found, please add a scaler")

         if default is None:
-            defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler']
+            defaults = [
+                'StandardScaler',
+                'Normalizer',
+                'MinMaxScaler',
+                'PowerTransformer',
+                'QuantileTransformer',
+                'RobustScaler',
+                'NoScaler'
+            ]
             for default_ in defaults:
                 if default_ in available_scalers:
+                    if include is not None and default_ not in include:
+                        continue
+                    if exclude is not None and default_ in exclude:
+                        continue
                     default = default_
                     break
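The default-selection logic added here is easy to see in isolation. The helper below is a hypothetical distillation (pick_default is not a function in the repository): it walks the preference order and skips any candidate ruled out by include/exclude, which is the behaviour the "check for default in include and exclude" commit bullet refers to.

from typing import List, Optional

def pick_default(available: List[str],
                 include: Optional[List[str]] = None,
                 exclude: Optional[List[str]] = None) -> Optional[str]:
    # Preference order matches the patched defaults list above.
    defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler',
                'PowerTransformer', 'QuantileTransformer', 'RobustScaler',
                'NoScaler']
    for default_ in defaults:
        if default_ not in available:
            continue
        # New in this commit: a default must also survive the user's
        # include/exclude constraints, not merely be available.
        if include is not None and default_ not in include:
            continue
        if exclude is not None and default_ in exclude:
            continue
        return default_
    return None

# StandardScaler is available but excluded, so the next candidate wins.
print(pick_default(['StandardScaler', 'Normalizer', 'NoScaler'],
                   exclude=['StandardScaler']))  # -> 'Normalizer'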

test/test_pipeline/components/preprocessing/test_feature_preprocessor.py (+1, −1)

@@ -20,7 +20,7 @@ def random_state():
     return 11


-@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures', 'PowerTransformer',
+@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures',
                         'Nystroem', 'KernelPCA', 'RandomKitchenSinks'])
 def preprocessor(request):
     return request.param
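For readers unfamiliar with the pattern, here is a minimal sketch of how such a parametrized fixture drives tests (the test body is hypothetical): pytest re-runs every dependent test once per param, so dropping 'PowerTransformer' from the list simply removes it from the feature-preprocessing test matrix now that it lives under scaling.

import pytest

@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures'])
def preprocessor(request):
    # request.param is the current entry of the params list.
    return request.param

def test_preprocessor_is_named(preprocessor):
    # Runs once per fixture param: two invocations for this sketch.
    assert isinstance(preprocessor, str)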
