Skip to content

Commit 2306c45

Browse files
authored
[feat] Add new task inference for APT (#386)
* [fix] Fix the task inference issue mentioned in #352.
  Since sklearn's task inference treats integer-valued targets as a classification task, I modified target_validator so that targets for regression are always cast to float. This workaround is described in the following reference: scikit-learn/scikit-learn#8952
* [fix] [test] Add a small number to the labels for regression and add tests.
  Casting the target labels to float is not sufficient on its own, because sklearn only infers a continuous target when the values have a non-zero fractional part. I therefore added a workaround that adds a fraction close to the smallest representable one to the array, so that sklearn does not mis-infer the task type. I also added tests that check we get the expected results in extreme cases.
* [fix] [test] Adapt the modification of targets to scipy.sparse.xxx_matrix.
* [fix] Address Ravin's comments and loosen the choice of the small number.
1 parent 4a0c773 commit 2306c45

10 files changed

+199
-131
lines changed

autoPyTorch/data/base_feature_validator.py

+10-22
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,13 @@
55

66
import pandas as pd
77

8-
import scipy.sparse
9-
108
from sklearn.base import BaseEstimator
119

10+
from autoPyTorch.utils.common import SparseMatrixType
1211
from autoPyTorch.utils.logging_ import PicklableClientLogger
1312

1413

15-
SUPPORTED_FEAT_TYPES = Union[
16-
List,
17-
pd.DataFrame,
18-
np.ndarray,
19-
scipy.sparse.bsr_matrix,
20-
scipy.sparse.coo_matrix,
21-
scipy.sparse.csc_matrix,
22-
scipy.sparse.csr_matrix,
23-
scipy.sparse.dia_matrix,
24-
scipy.sparse.dok_matrix,
25-
scipy.sparse.lil_matrix,
26-
]
14+
SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, SparseMatrixType]
2715

2816

2917
class BaseFeatureValidator(BaseEstimator):
@@ -68,19 +56,19 @@ def __init__(
6856

6957
def fit(
7058
self,
71-
X_train: SUPPORTED_FEAT_TYPES,
72-
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
59+
X_train: SupportedFeatTypes,
60+
X_test: Optional[SupportedFeatTypes] = None,
7361
) -> BaseEstimator:
7462
"""
7563
Validates and fit a categorical encoder (if needed) to the features.
7664
The supported data types are List, numpy arrays and pandas DataFrames.
7765
CSR sparse data types are also supported
7866
7967
Args:
80-
X_train (SUPPORTED_FEAT_TYPES):
68+
X_train (SupportedFeatTypes):
8169
A set of features that are going to be validated (type and dimensionality
8270
checks) and a encoder fitted in the case the data needs encoding
83-
X_test (Optional[SUPPORTED_FEAT_TYPES]):
71+
X_test (Optional[SupportedFeatTypes]):
8472
A hold out set of data used for checking
8573
"""
8674

@@ -109,11 +97,11 @@ def fit(
10997

11098
def _fit(
11199
self,
112-
X: SUPPORTED_FEAT_TYPES,
100+
X: SupportedFeatTypes,
113101
) -> BaseEstimator:
114102
"""
115103
Args:
116-
X (SUPPORTED_FEAT_TYPES):
104+
X (SupportedFeatTypes):
117105
A set of features that are going to be validated (type and dimensionality
118106
checks) and a encoder fitted in the case the data needs encoding
119107
Returns:
@@ -124,11 +112,11 @@ def _fit(
124112

125113
def transform(
126114
self,
127-
X: SUPPORTED_FEAT_TYPES,
115+
X: SupportedFeatTypes,
128116
) -> np.ndarray:
129117
"""
130118
Args:
131-
X_train (SUPPORTED_FEAT_TYPES):
119+
X_train (SupportedFeatTypes):
132120
A set of features, whose categorical features are going to be
133121
transformed
134122

autoPyTorch/data/base_target_validator.py

+13-26
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,13 @@
55

66
import pandas as pd
77

8-
import scipy.sparse
9-
108
from sklearn.base import BaseEstimator
119

10+
from autoPyTorch.utils.common import SparseMatrixType
1211
from autoPyTorch.utils.logging_ import PicklableClientLogger
1312

1413

15-
SUPPORTED_TARGET_TYPES = Union[
16-
List,
17-
pd.Series,
18-
pd.DataFrame,
19-
np.ndarray,
20-
scipy.sparse.bsr_matrix,
21-
scipy.sparse.coo_matrix,
22-
scipy.sparse.csc_matrix,
23-
scipy.sparse.csr_matrix,
24-
scipy.sparse.dia_matrix,
25-
scipy.sparse.dok_matrix,
26-
scipy.sparse.lil_matrix,
27-
]
14+
SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, SparseMatrixType]
2815

2916

3017
class BaseTargetValidator(BaseEstimator):
@@ -69,17 +56,17 @@ def __init__(self,
6956

7057
def fit(
7158
self,
72-
y_train: SUPPORTED_TARGET_TYPES,
73-
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
59+
y_train: SupportedTargetTypes,
60+
y_test: Optional[SupportedTargetTypes] = None,
7461
) -> BaseEstimator:
7562
"""
7663
Validates and fit a categorical encoder (if needed) to the targets
7764
The supported data types are List, numpy arrays and pandas DataFrames.
7865
7966
Args:
80-
y_train (SUPPORTED_TARGET_TYPES)
67+
y_train (SupportedTargetTypes)
8168
A set of targets set aside for training
82-
y_test (Union[SUPPORTED_TARGET_TYPES])
69+
y_test (Union[SupportedTargetTypes])
8370
A hold out set of data used of the targets. It is also used to fit the
8471
categories of the encoder.
8572
"""
@@ -128,26 +115,26 @@ def fit(
128115

129116
def _fit(
130117
self,
131-
y_train: SUPPORTED_TARGET_TYPES,
132-
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
118+
y_train: SupportedTargetTypes,
119+
y_test: Optional[SupportedTargetTypes] = None,
133120
) -> BaseEstimator:
134121
"""
135122
Args:
136-
y_train (SUPPORTED_TARGET_TYPES)
123+
y_train (SupportedTargetTypes)
137124
The labels of the current task. They are going to be encoded in case
138125
of classification
139-
y_test (Optional[SUPPORTED_TARGET_TYPES])
126+
y_test (Optional[SupportedTargetTypes])
140127
A holdout set of labels
141128
"""
142129
raise NotImplementedError()
143130

144131
def transform(
145132
self,
146-
y: Union[SUPPORTED_TARGET_TYPES],
133+
y: Union[SupportedTargetTypes],
147134
) -> np.ndarray:
148135
"""
149136
Args:
150-
y (SUPPORTED_TARGET_TYPES)
137+
y (SupportedTargetTypes)
151138
A set of targets that are going to be encoded if the current task
152139
is classification
153140
Returns:
@@ -158,7 +145,7 @@ def transform(
158145

159146
def inverse_transform(
160147
self,
161-
y: SUPPORTED_TARGET_TYPES,
148+
y: SupportedTargetTypes,
162149
) -> np.ndarray:
163150
"""
164151
Revert any encoding transformation done on a target array

autoPyTorch/data/base_validator.py

+14-14
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
from sklearn.base import BaseEstimator
88
from sklearn.exceptions import NotFittedError
99

10-
from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES
11-
from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES
10+
from autoPyTorch.data.base_feature_validator import SupportedFeatTypes
11+
from autoPyTorch.data.base_target_validator import SupportedTargetTypes
1212

1313

1414
class BaseInputValidator(BaseEstimator):
@@ -40,10 +40,10 @@ def __init__(
4040

4141
def fit(
4242
self,
43-
X_train: SUPPORTED_FEAT_TYPES,
44-
y_train: SUPPORTED_TARGET_TYPES,
45-
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
46-
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
43+
X_train: SupportedFeatTypes,
44+
y_train: SupportedTargetTypes,
45+
X_test: Optional[SupportedFeatTypes] = None,
46+
y_test: Optional[SupportedTargetTypes] = None,
4747
) -> BaseEstimator:
4848
"""
4949
Validates and fit a categorical encoder (if needed) to the features, and
@@ -59,15 +59,15 @@ def fit(
5959
+ If performing a classification task, the data is going to be encoded
6060
6161
Args:
62-
X_train (SUPPORTED_FEAT_TYPES):
62+
X_train (SupportedFeatTypes):
6363
A set of features that are going to be validated (type and dimensionality
6464
checks). If this data contains categorical columns, an encoder is going to
6565
be instantiated and trained with this data.
66-
y_train (SUPPORTED_TARGET_TYPES):
66+
y_train (SupportedTargetTypes):
6767
A set of targets that are going to be encoded if the task is for classification
68-
X_test (Optional[SUPPORTED_FEAT_TYPES]):
68+
X_test (Optional[SupportedFeatTypes]):
6969
A hold out set of features used for checking
70-
y_test (SUPPORTED_TARGET_TYPES):
70+
y_test (SupportedTargetTypes):
7171
A hold out set of targets used for checking. Additionally, if the current task
7272
is a classification task, this y_test categories are also going to be used to
7373
fit a pre-processing encoding (to prevent errors on unseen classes).
@@ -96,16 +96,16 @@ def fit(
9696

9797
def transform(
9898
self,
99-
X: SUPPORTED_FEAT_TYPES,
100-
y: Optional[SUPPORTED_TARGET_TYPES] = None,
99+
X: SupportedFeatTypes,
100+
y: Optional[SupportedTargetTypes] = None,
101101
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
102102
"""
103103
Transform the given target or features to a numpy array
104104
105105
Args:
106-
X (SUPPORTED_FEAT_TYPES):
106+
X (SupportedFeatTypes):
107107
A set of features to transform
108-
y (Optional[SUPPORTED_TARGET_TYPES]):
108+
y (Optional[SupportedTargetTypes]):
109109
A set of targets to transform
110110
111111
Returns:

autoPyTorch/data/tabular_feature_validator.py

+11-11
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from sklearn.impute import SimpleImputer
1717
from sklearn.pipeline import make_pipeline
1818

19-
from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES
19+
from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
2020

2121

2222
def _create_column_transformer(
@@ -117,15 +117,15 @@ def _comparator(cmp1: str, cmp2: str) -> int:
117117

118118
def _fit(
119119
self,
120-
X: SUPPORTED_FEAT_TYPES,
120+
X: SupportedFeatTypes,
121121
) -> BaseEstimator:
122122
"""
123123
In case input data is a pandas DataFrame, this utility encodes the user provided
124124
features (from categorical for example) to a numerical value that further stages
125125
will be able to use
126126
127127
Args:
128-
X (SUPPORTED_FEAT_TYPES):
128+
X (SupportedFeatTypes):
129129
A set of features that are going to be validated (type and dimensionality
130130
checks) and an encoder fitted in the case the data needs encoding
131131
@@ -204,14 +204,14 @@ def _fit(
204204

205205
def transform(
206206
self,
207-
X: SUPPORTED_FEAT_TYPES,
207+
X: SupportedFeatTypes,
208208
) -> np.ndarray:
209209
"""
210210
Validates and fit a categorical encoder (if needed) to the features.
211211
The supported data types are List, numpy arrays and pandas DataFrames.
212212
213213
Args:
214-
X_train (SUPPORTED_FEAT_TYPES):
214+
X_train (SupportedFeatTypes):
215215
A set of features, whose categorical features are going to be
216216
transformed
217217
@@ -276,13 +276,13 @@ def transform(
276276

277277
def _check_data(
278278
self,
279-
X: SUPPORTED_FEAT_TYPES,
279+
X: SupportedFeatTypes,
280280
) -> None:
281281
"""
282282
Feature dimensionality and data type checks
283283
284284
Args:
285-
X (SUPPORTED_FEAT_TYPES):
285+
X (SupportedFeatTypes):
286286
A set of features that are going to be validated (type and dimensionality
287287
checks) and an encoder fitted in the case the data needs encoding
288288
"""
@@ -429,19 +429,19 @@ def _get_columns_to_encode(
429429

430430
def list_to_dataframe(
431431
self,
432-
X_train: SUPPORTED_FEAT_TYPES,
433-
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
432+
X_train: SupportedFeatTypes,
433+
X_test: Optional[SupportedFeatTypes] = None,
434434
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
435435
"""
436436
Converts a list to a pandas DataFrame. In this process, column types are inferred.
437437
438438
If test data is provided, we proactively match it to train data
439439
440440
Args:
441-
X_train (SUPPORTED_FEAT_TYPES):
441+
X_train (SupportedFeatTypes):
442442
A set of features that are going to be validated (type and dimensionality
443443
checks) and a encoder fitted in the case the data needs encoding
444-
X_test (Optional[SUPPORTED_FEAT_TYPES]):
444+
X_test (Optional[SupportedFeatTypes]):
445445
A hold out set of data used for checking
446446
447447
Returns:

0 commit comments

Comments
 (0)