[ADD] variance thresholding (#373)

ravinkohli · nabenabe0928 · web-flow · commit 466bc18b0d2e · 2022-02-09T10:50:58.000+01:00
* add variance thresholding

* fix flake and mypy

* Apply suggestions from code review

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
@@ -0,0 +1,44 @@
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+
+from sklearn.feature_selection import VarianceThreshold as SklearnVarianceThreshold
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
+    autoPyTorchTabularPreprocessingComponent
+
+
+class VarianceThreshold(autoPyTorchTabularPreprocessingComponent):
+    """
+    Removes features that have the same value in the training data (zero variance, ``threshold=0.0``).
+    """
+    def __init__(self, random_state: Optional[np.random.RandomState] = None):
+        super().__init__()  # NOTE(review): random_state is accepted but not stored here -- confirm intended
+
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'VarianceThreshold':
+        # The sklearn estimator is only constructed here; it is not fitted in this method.
+        self.check_requirements(X, y)
+
+        self.preprocessor['numerical'] = SklearnVarianceThreshold(
+            threshold=0.0
+        )
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        if self.preprocessor['numerical'] is None:  # presumably still None before fit() -- verify in base class
+            raise ValueError("cannot call transform on {} without fitting first."
+                             .format(self.__class__.__name__))
+        X.update({'variance_threshold': self.preprocessor})  # stores the whole preprocessor dict, mutating X
+        return X
+
+    @staticmethod
+    def get_properties(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    ) -> Dict[str, Union[str, bool]]:
+
+        return {
+            'shortname': 'Variance Threshold',
+            'name': 'Variance Threshold (constant feature removal)',
+            'handles_sparse': True,
+        }
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py
diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py
@@ -27,6 +27,8 @@
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
 from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice
 from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent
@@ -307,6 +309,7 @@ def _get_pipeline_steps(
 
         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py
@@ -27,6 +27,8 @@
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
 from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice
 from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent
@@ -257,6 +259,7 @@ def _get_pipeline_steps(
 
         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py
@@ -6,6 +6,8 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
 
 
@@ -28,6 +30,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]],
 
         steps.extend([
             ("imputer", SimpleImputer()),
+            ("variance_threshold", VarianceThreshold()),
             ("encoder", EncoderChoice(default_dataset_properties)),
             ("scaler", ScalerChoice(default_dataset_properties)),
             ("tabular_transformer", TabularColumnTransformer()),
diff --git a/test/test_pipeline/components/preprocessing/test_variance_thresholding.py b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
@@ -0,0 +1,49 @@
+import numpy as np
+from numpy.testing import assert_array_equal
+
+
+from sklearn.base import BaseEstimator
+from sklearn.compose import make_column_transformer
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
+
+
+def test_variance_threshold():
+    data = np.array([[1, 2, 1],
+                     [7, 8, 9],
+                     [4, 5, 1],
+                     [11, 12, 1],
+                     [17, 18, 19],
+                     [14, 15, 16]])
+    numerical_columns = [0, 1, 2]
+    train_indices = np.array([0, 2, 3])  # column 2 is constant (value 1) on these training rows
+    test_indices = np.array([1, 4, 5])
+    dataset_properties = {
+        'categorical_columns': [],
+        'numerical_columns': numerical_columns,
+    }
+    X = {
+        'X_train': data[train_indices],
+        'dataset_properties': dataset_properties
+    }
+    component = VarianceThreshold()
+
+    component = component.fit(X)
+    X = component.transform(X)
+    variance_threshold = X['variance_threshold']['numerical']
+
+    # check that the fit dictionary X now carries the preprocessor dict and its sklearn estimator
+    assert isinstance(X['variance_threshold'], dict)
+    assert isinstance(variance_threshold, BaseEstimator)
+
+    # make column transformer with the returned variance threshold estimator and fit it on the training data
+    column_transformer = make_column_transformer((variance_threshold,
+                                                  X['dataset_properties']['numerical_columns']),
+                                                 remainder='passthrough')
+    column_transformer = column_transformer.fit(X['X_train'])
+    transformed = column_transformer.transform(data[test_indices])
+
+    assert_array_equal(transformed, np.array([[7, 8],
+                                              [17, 18],
+                                              [14, 15]]))