From d5bdfd93584882e88febe3f6cfb480e841b3f006 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 14 Sep 2021 09:28:09 +0200 Subject: [PATCH 1/9] [doc] Add module doc-string and TODOs to base_task.py --- autoPyTorch/api/base_task.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 94add94bd..8687b77cd 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1,3 +1,18 @@ +"""Base class for tasks to solve +* The shared components among all the tasks +* This module provides the optimization given a pipeline +* This module plays a role of communicating with + distributed clients + +TODO: + * Separate the training procedure by another class and encapsulate it + * Separate _do_dummy_prediction and refactor it + * Separate _do_traditional_prediction and refactor it + * Refactor _search + * Reduce unimportant instance variables + * Use private variables and public variables by _ +""" + import copy import json import logging.handlers From 13899df6928dfe2e7c19eb5e85a2ec7ad6a76047 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 14 Sep 2021 10:31:24 +0200 Subject: [PATCH 2/9] [doc] Add module doc-string and TODOs to base_feature_validator.py --- autoPyTorch/data/base_feature_validator.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 2ef02ceba..6955dff8b 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -1,3 +1,15 @@ +"""Base class for the feature validator given a task +* A wrapper class of the sklearn.base.BaseEstimator +* The feature validator for each task inherits this class +* Check if the provided feature can be processed in AutoPytorch + +TODO: + * SUPPORTED_FEAT_TYPES --> Enumerator + * Describe the shape of X + * typing. 
--> + * logging.Logger --> Logger +""" + import logging import typing From f8957453396e3a88a2a5202ed5da0440e72e4a3a Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 14 Sep 2021 10:41:24 +0200 Subject: [PATCH 3/9] [doc] Add module doc-string and TODOs to base_target_validator.py --- autoPyTorch/data/base_target_validator.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index 44e73d42a..e3018e839 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -1,3 +1,19 @@ +"""Base class for the target (or label) validator given a task +* A wrapper class of the sklearn.base.BaseEstimator +* The target validator for each task inherits this class +* Check if the provided targets (or labels) are compatible in both + training and test + +TODO: + * SUPPORTED_FEAT_TYPES --> Enumerator + * Describe the shape of y + * typing. --> + * logging.Logger --> Logger + * Rename classes_ --> get_classes + * Check the return of classes_ + * is_single_column_target --> is_target_scalar +""" + import logging import typing @@ -31,12 +47,13 @@ class BaseTargetValidator(BaseEstimator): """ A class to pre-process targets. It validates the data provided during fit (to make sure it matches AutoPyTorch expectation) as well as encoding the targets in case of classification + Attributes: is_classification (bool): A bool that indicates if the validator should operate in classification mode. During classification, the targets are encoded. 
encoder (typing.Optional[BaseEstimator]): - Host a encoder object if the data requires transformation (for example, + Host an encoder object if the data requires transformation (for example, if provided a categorical column in a pandas DataFrame) enc_columns (typing.List[str]) List of columns that where encoded @@ -175,7 +192,7 @@ def classes_(self) -> np.ndarray: Complies with scikit learn classes_ attribute, which consist of a ndarray of shape (n_classes,) where n_classes are the number of classes seen while fitting - a encoder to the targets. + an encoder to the targets. Returns: classes_: np.ndarray The unique classes seen during encoding of a classifier From f3282d9cf63ec37b8d98dc3c234755a394c8721f Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 14 Sep 2021 10:47:43 +0200 Subject: [PATCH 4/9] [doc] Add module doc-string and TODOs to base_validator.py --- autoPyTorch/data/base_validator.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py index 7528d56ab..ff782c526 100644 --- a/autoPyTorch/data/base_validator.py +++ b/autoPyTorch/data/base_validator.py @@ -1,3 +1,16 @@ +"""Base class for the input validator given a task +* A wrapper class of the sklearn.base.BaseEstimator +* The input validator for each task inherits this class +* Check if the provided data are compatible with AutoPytorch implementation +* Manage both target_ and feature_validator in this class + +TODO: + * typing. 
--> + * logging.Logger --> Logger + * Inherit feature_validator and target_validator from a child class + via super().__init__() +""" + # -*- encoding: utf-8 -*- import logging.handlers import typing From 8f864e6897f746187b48931a9284c496b90abfb4 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 14 Sep 2021 11:08:10 +0200 Subject: [PATCH 5/9] [doc] Add module doc-string and TODOs to base_dataset.py --- autoPyTorch/constants.py | 7 +++++++ autoPyTorch/datasets/base_dataset.py | 16 ++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 652a546b9..de77f440d 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -1,3 +1,10 @@ +"""Constant variables in AutoPyTorch + +TODO: + * Make everything enumerators + * Avoid the usage of integers +""" + TABULAR_CLASSIFICATION = 1 IMAGE_CLASSIFICATION = 2 TABULAR_REGRESSION = 3 diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 15a6dedf9..8393140e0 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -1,3 +1,19 @@ +"""Base class of the provided dataset +* Provide data validation splits based on types of data +* Provide API to return training and validation splits +* Store the properties of the dataset which are required + in AutoPyTorch implementation + +TODO: + * Address: https://github.com/automl/Auto-PyTorch/pull/108/ + * Make BaseDatasetPropertiesType more informative + * Use private variables and public variables properly + * Consider a more memory-efficient way to store splits + ==> Memory consumption will be very high for huge datasets + * Check the usage of validation and test because cross validation + only uses the training dataset +""" + import os import uuid from abc import ABCMeta From f5765ba509b25689f19e16bef360dd4bb1985aad Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Wed, 15 Sep 2021 09:56:43 +0200 Subject: [PATCH 6/9] [doc] Add module
doc-string and TODOs to resampling_strategy.py --- autoPyTorch/datasets/resampling_strategy.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index ac96c934a..0df54866c 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -1,3 +1,19 @@ +"""Functions for resampling strategy or cross validation +* Each function is used in BaseDataset to provide dataset splits + +TODO: + * DEFAULT_RESAMPLING_PARAMETERS --> keyword arguments + * documentation strings + * Make shuffle and stratified arguments rather than + independent methods + * Force the instantiation of each splitting method + ==> instance variables tell you what kind of splitting + * Delete protocol and enumerator because we do not need them + once we make them classes that require instantiation + * resampling_strategy --> splitting_fn + * resampling_strategy_args --> splitting_params +""" + from enum import IntEnum from typing import Any, Dict, List, Optional, Tuple, Union From 151055ee5eab8c3898f78abea37eb7edb3203d1e Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Wed, 15 Sep 2021 13:18:49 +0200 Subject: [PATCH 7/9] [doc] Add module doc-string and TODOs to ensemble/ except ensemble_builder.py --- autoPyTorch/ensemble/abstract_ensemble.py | 8 +++++++ autoPyTorch/ensemble/ensemble_selection.py | 23 +++++++++++++++++++++ autoPyTorch/ensemble/singlebest_ensemble.py | 12 +++++++++++ 3 files changed, 43 insertions(+) diff --git a/autoPyTorch/ensemble/abstract_ensemble.py b/autoPyTorch/ensemble/abstract_ensemble.py index 072b6d260..6c22d5ced 100644 --- a/autoPyTorch/ensemble/abstract_ensemble.py +++ b/autoPyTorch/ensemble/abstract_ensemble.py @@ -1,3 +1,11 @@ +"""The abstract class of ensemble classes +* Provide methods that must be overridden by the child class + +TODO: + * Add `raise NotImplementedError` + * model_identifiers --> List[] +""" + from abc
import ABCMeta, abstractmethod from typing import Any, Dict, List, Tuple, Union diff --git a/autoPyTorch/ensemble/ensemble_selection.py b/autoPyTorch/ensemble/ensemble_selection.py index b8f379e55..607533651 100644 --- a/autoPyTorch/ensemble/ensemble_selection.py +++ b/autoPyTorch/ensemble/ensemble_selection.py @@ -1,3 +1,26 @@ +"""The title of the module description # noqa +* Describe at the beginning of the source code. +* Describe before the package imports + +TODO: + * Add the following + References: + Title: Ensemble Selection from Libraries of Models + Authors: Rich Caruana et al. + URL: https://www.cs.cornell.edu/~alexn/papers/shotgun.icml04.revised.rev2.pdf + + * `A copy of self` --> check if it is really true + * Change `_` to `_` + * get_models_with_weights --> looks sorted in descending order of weights + * soft voting ==> explanation + References: + Title: Consensus Based Ensembles of Soft Clusterings + Authors: Kunal Punera and Joydeep Ghosh + URL: https://www.researchgate.net/profile/Joydeep-Ghosh-8/publication/221188694_Consensus_Based_Ensembles_of_Soft_Clusterings/links/02e7e521fe367e06c3000000/Consensus-Based-Ensembles-of-Soft-Clusterings.pdf + * _calculate_weights ==> what about np.sum(weights) > 1??
+ * Refactor _fit() and add the shape of predictions +""" + from collections import Counter from typing import Any, Dict, List, Tuple, Union diff --git a/autoPyTorch/ensemble/singlebest_ensemble.py b/autoPyTorch/ensemble/singlebest_ensemble.py index 881ae5fd2..78c8cd5d7 100644 --- a/autoPyTorch/ensemble/singlebest_ensemble.py +++ b/autoPyTorch/ensemble/singlebest_ensemble.py @@ -1,3 +1,15 @@ +"""Backup solution class for the crashed search +* Provide the best configuration instead of an ensemble + with multiple models + +TODO: + * Change `_` to `_` + * Add more `raise ` since this class is supposed + to be used in very specific situations + * Check the contexts where this class is called because + self.weights_ and self.indices_ are not clear enough +""" + import os from typing import Any, Dict, List, Tuple, Union From 65bb6724eb24dd11e0d8ac05d6262ba5e01c7e3f Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Thu, 16 Sep 2021 09:54:13 +0200 Subject: [PATCH 8/9] [doc] Add module doc-string and TODOs to ensemble_builder.py --- autoPyTorch/ensemble/ensemble_builder.py | 27 ++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index a22d413f7..163eb97df 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -1,3 +1,30 @@ +"""The module that enables building an ensemble +* EnsembleBuilderManager serves as a central system that + submits an EnsembleBuilder to dask +* EnsembleBuilder builds an ensemble using pynisher + so that we can easily limit the memory usage and runtime +* EnsembleBuilder builds an ensemble using the configurations + that are observed in HPO + +TODO: + * Unused arguments in EnsembleBuilderManager.__call__ + * Remove the argument `unit_test` and separate methods + with patch.object(, '', side_effect=MemoryError): + inst = (arguments) + inst.() <== MemoryError + + * Remove unneeded comments + * Make
precision in a better way (enum, np.int32 ...) + * Separate `raise Error` methods in EnsembleBuilder + + run + + main + + compute_loss_per_model + + get_n_best_preds + * Separate more general function from EnsembleBuilder + + get_disk_consumption + + _read_np_fn +""" + # -*- encoding: utf-8 -*- import glob import gzip From aeedc2f8eed1f1c37754e5bfc5f059b84f78d346 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Mon, 27 Sep 2021 20:05:34 +0900 Subject: [PATCH 9/9] [doc/WIP] Add module doc string and ToDos in autoPytorch/evaluation/abstract_evaluator.py --- autoPyTorch/evaluation/abstract_evaluator.py | 36 ++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 0ba588276..893f20a14 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -1,3 +1,39 @@ +"""This module provides model estimator pipelines +This module has the following pipelines: + - MyTraditionalTabularClassificationPipeline + Wrapper class for traditional ML classification methods + such as CatBoost, RandomForest + - MyTraditionalTabularRegressionPipeline + Wrapper class for traditional ML regression methods + such as RandomForest + - DummyClassificationPipeline + Wrapper class for dummy classifier in sklearn + - DummyRegressionPipeline + Wrapper class for dummy regressor in sklearn + - AbstractEvaluator + The interface for the pipeline evaluators + to optimize via SMAC + +Note: Dummy model is an estimator using a very simple rule + and this is used for the minimum baseline for each task. 
+ https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html # noqa: W291 + https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html # noqa: W291 + +TODO: + * Describe the definition of sample_weight + * import autoPyTorch.pipeline.xxx as shorter names + * Describe the shape of returns in predict and predict_proba + * Improve the documentation of additional_run_info + * Change get_pipeline_representation --> __repr__ + * delete self.random_state, self.init_params, self.config, + self.dataset_properties, + (because they are not used) + * [named_step](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) # noqa: W291 + * The typing of config in DummyXXXPipeline + * Add enumerator for additional_run_info + * Rename fit_and_suppress_warnings +""" + import logging.handlers import time import warnings