[fix] [test] Adapt the modification of targets to scipy.sparse.xxx_matrix

nabenabe0928 · nabenabe0928 · commit 91bcd374ee75 · 2022-02-23T21:14:45.000+09:00
diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
@@ -5,25 +5,13 @@
 
 import pandas as pd
 
-import scipy.sparse
-
 from sklearn.base import BaseEstimator
 
+from autoPyTorch.utils.common import SparseMatrixType
 from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
-SupportedFeatTypes = Union[
-    List,
-    pd.DataFrame,
-    np.ndarray,
-    scipy.sparse.bsr_matrix,
-    scipy.sparse.coo_matrix,
-    scipy.sparse.csc_matrix,
-    scipy.sparse.csr_matrix,
-    scipy.sparse.dia_matrix,
-    scipy.sparse.dok_matrix,
-    scipy.sparse.lil_matrix,
-]
+SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, SparseMatrixType]
 
 
 class BaseFeatureValidator(BaseEstimator):
diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py
@@ -5,26 +5,13 @@
 
 import pandas as pd
 
-import scipy.sparse
-
 from sklearn.base import BaseEstimator
 
+from autoPyTorch.utils.common import SparseMatrixType
 from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
-SupportedTargetTypes = Union[
-    List,
-    pd.Series,
-    pd.DataFrame,
-    np.ndarray,
-    scipy.sparse.bsr_matrix,
-    scipy.sparse.coo_matrix,
-    scipy.sparse.csc_matrix,
-    scipy.sparse.csr_matrix,
-    scipy.sparse.dia_matrix,
-    scipy.sparse.dok_matrix,
-    scipy.sparse.lil_matrix,
-]
+SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, SparseMatrixType]
 
 
 class BaseTargetValidator(BaseEstimator):
diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, cast
+from typing import List, Optional, Union, cast
 
 import numpy as np
 
@@ -14,13 +14,37 @@
 from sklearn.utils.multiclass import type_of_target
 
 from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes
+from autoPyTorch.utils.common import SparseMatrixType
 
 
-def _check_and_to_numpy(y: SupportedTargetTypes) -> np.ndarray:
+ArrayType = Union[np.ndarray, SparseMatrixType]
+
+
+def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType:
     """ sklearn check array will make sure we have the correct numerical features for the array """
     return sklearn.utils.check_array(y, force_all_finite=True, accept_sparse='csr', ensure_2d=False)
 
 
+def _modify_regression_target(y: ArrayType) -> ArrayType:
+    # Regression targets need a fractional part; otherwise `type_of_target` does not report them as continuous.
+    # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952
+    y_min = np.abs(y).min()
+    offset = y_min * 1e-16  # Sufficiently small number
+    if y_min > 1e15:
+        raise ValueError(
+            "The minimum value for the target labels of regression tasks must be smaller than "
+            f"1e15 to avoid errors caused by an overflow, but got {y_min}"
+        )
+
+    # Since all values are integers, we can just add a small deterministic offset
+    if isinstance(y, np.ndarray):
+        y = y.astype(dtype=np.float64) + offset
+    else:
+        y.data = y.data.astype(dtype=np.float64) + offset
+
+    return y
+
+
 class TabularTargetValidator(BaseTargetValidator):
     def _fit(
         self,
@@ -101,7 +125,7 @@ def _fit(
 
     def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray:
         if self.encoder is None:
-            return _check_and_to_numpy(y)
+            return _check_and_to_array(y)
 
         # remove ravel warning from pandas Series
         shape = np.shape(y)
@@ -115,12 +139,9 @@ def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray:
         else:
             y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1)
 
-        return _check_and_to_numpy(y)
+        return _check_and_to_array(y)
 
-    def transform(
-        self,
-        y: SupportedTargetTypes,
-    ) -> np.ndarray:
+    def transform(self, y: SupportedTargetTypes) -> np.ndarray:
         """
         Validates and fit a categorical encoder (if needed) to the features.
         The supported data types are List, numpy arrays and pandas DataFrames.
@@ -146,24 +167,11 @@ def transform(
             y = np.ravel(y)
 
         if not self.is_classification and "continuous" not in type_of_target(y):
-            # Regression targets must have numbers after a decimal point.
-            # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952
-            y_min = np.abs(y).min()
-            offset = y_min * 1e-16  # Sufficiently small number
-            if y_min > 1e15:
-                raise ValueError(
-                    "The minimum value for the target labels of regression tasks must be smaller than "
-                    f"1e15 to avoid errors caused by an overflow, but got {y_min}"
-                )
-
-            y = y.astype(dtype=np.float64) + offset  # Since it is all integer, we can just add a random small number
+            y = _modify_regression_target(y)
 
         return y
 
-    def inverse_transform(
-        self,
-        y: SupportedTargetTypes,
-    ) -> np.ndarray:
+    def inverse_transform(self, y: SupportedTargetTypes) -> np.ndarray:
         """
         Revert any encoding transformation done on a target array
 
@@ -197,10 +205,7 @@ def inverse_transform(
             y = y.astype(self.dtype)
         return y
 
-    def _check_data(
-        self,
-        y: SupportedTargetTypes,
-    ) -> None:
+    def _check_data(self, y: SupportedTargetTypes) -> None:
         """
         Perform dimensionality and data type checks on the targets
 
diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py
@@ -20,6 +20,15 @@
 from torch.utils.data.dataloader import default_collate
 
 HyperparameterValueType = Union[int, str, float]
+SparseMatrixType = Union[
+    scipy.sparse.bsr_matrix,
+    scipy.sparse.coo_matrix,
+    scipy.sparse.csc_matrix,
+    scipy.sparse.csr_matrix,
+    scipy.sparse.dia_matrix,
+    scipy.sparse.dok_matrix,
+    scipy.sparse.lil_matrix,
+]
 
 
 class FitRequirement(NamedTuple):
diff --git a/test/test_data/test_target_validator.py b/test/test_data/test_target_validator.py
@@ -150,17 +150,17 @@ def test_targetvalidator_supported_types_noclassification(input_data_targettest)
     assert validator.encoder is None
 
     if hasattr(input_data_targettest, "iloc"):
-        np.testing.assert_array_equal(
+        assert np.allclose(
             np.ravel(input_data_targettest.to_numpy()),
             np.ravel(transformed_y)
         )
     elif sparse.issparse(input_data_targettest):
-        np.testing.assert_array_equal(
+        assert np.allclose(
             np.ravel(input_data_targettest.todense()),
             np.ravel(transformed_y.todense())
         )
     else:
-        np.testing.assert_array_equal(
+        assert np.allclose(
             np.ravel(np.array(input_data_targettest)),
             np.ravel(transformed_y)
         )