[fix] Address Ravin's comments and loosen the small-offset constants for regression targets (offset factor 1e-16 -> 1e-13, upper limit 1e15 -> 1e12)

nabenabe0928 · nabenabe0928 · commit 4072b6f355c1 · 2022-02-24T01:38:40.000+09:00
diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py
@@ -29,11 +29,11 @@ def _modify_regression_target(y: ArrayType) -> ArrayType:
     # Regression targets must have numbers after a decimal point.
     # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952
     y_min = np.abs(y).min()
-    offset = y_min * 1e-16  # Sufficiently small number
-    if y_min > 1e15:
+    offset = max(y_min, 1e-13) * 1e-13  # Sufficiently small number; the floor keeps it nonzero for y_min == 0
+    if y_min > 1e12:
         raise ValueError(
             "The minimum value for the target labels of regression tasks must be smaller than "
-            f"1e15 to avoid errors caused by an overflow, but got {y_min}"
+            f"1e12 to avoid errors caused by an overflow, but got {y_min}"
         )
 
     # Since it is all integer, we can just add a random small number
diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
@@ -69,7 +69,7 @@ def _get_output_properties(train_tensors: BaseDatasetInputType) -> Tuple[int, st
         target_labels = np.array(train_tensors[1])
 
     output_type: str = type_of_target(target_labels)
-    if STRING_TO_OUTPUT_TYPES.get(output_type, None) in CLASSIFICATION_OUTPUTS:
+    if STRING_TO_OUTPUT_TYPES[output_type] in CLASSIFICATION_OUTPUTS:
         output_dim = len(np.unique(target_labels))
     elif target_labels.ndim > 1:
         output_dim = target_labels.shape[-1]
diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py
@@ -912,23 +912,21 @@ def test_tabular_classification_test_evaluator(openml_id, backend, n_samples):
 )
 def test_task_inference(ans, task_class, backend):
     # Get the data and check that contents of data-manager make sense
-    X = np.random.random((5, 1))
-    y = np.array([0, 1, 2, 3, 4]) + 10 ** 15
-
-    X_train, _, y_train, _ = sklearn.model_selection.train_test_split(X, y, random_state=42)
+    X = np.random.random((6, 1))
+    y = np.array([-10 ** 12, 0, 1, 2, 3, 4], dtype=np.int64) + 10 ** 12
 
     estimator = task_class(
         backend=backend,
         resampling_strategy=HoldoutValTypes.holdout_validation,
         resampling_strategy_args=None,
         seed=42,
     )
-    dataset = estimator.get_dataset(X_train, y_train)
+    dataset = estimator.get_dataset(X, y)
     assert dataset.output_type == ans
 
-    y_train += 1
+    y += 10 ** 12 + 10  # Check if the function catches overflow possibilities
     if ans == 'continuous':
         with pytest.raises(ValueError):  # ValueError due to `Too large value`
-            estimator.get_dataset(X_train, y_train)
+            estimator.get_dataset(X, y)
     else:
-        estimator.get_dataset(X_train, y_train)
+        estimator.get_dataset(X, y)