automl · nabenabe0928 · Sep 14, 2021 · Sep 14, 2021 · Sep 14, 2021 · Sep 14, 2021
diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
@@ -1,3 +1,18 @@
+"""Base class for tasks to solve
+* The shared components among all the tasks
+* This module provides the optimization given a pipeline
+* This module plays the role of communicating with
+  distributed clients
+
+TODO:
+    * Separate the training procedure by another class and encapsulate it
+    * Separate _do_dummy_prediction and refactor it
+    * Separate _do_traditional_prediction and refactor it
+    * Refactor _search
+    * Reduce unimportant instance variables
+    * Distinguish private variables from public ones via the _<var name> convention
+"""
+
 import copy
 import json
 import logging.handlers

diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py
@@ -1,3 +1,10 @@
+"""Constant variables in AutoPytorch
+
+TODO:
+    * Make everything enumerators
+    * Avoid the usage of integers
+"""
+
 TABULAR_CLASSIFICATION = 1
 IMAGE_CLASSIFICATION = 2
 TABULAR_REGRESSION = 3

diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
@@ -1,3 +1,15 @@
+"""Base class for the feature validator given a task
+* A wrapper class of the sklearn.base.BaseEstimator
+* The feature validator for each task inherits this class
+* Check if the provided train and test features can be processed in AutoPytorch
+
+TODO:
+    * SUPPORTED_FEAT_TYPES --> Enumerator
+    * Describe the shape of X
+    * typing.<type> --> <type>
+    * logging.Logger --> Logger
+"""
+
 import logging
 import typing
 

diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py
@@ -1,3 +1,19 @@
+"""Base class for the target (or label) validator given a task
+* A wrapper class of the sklearn.base.BaseEstimator
+* The target validator for each task inherits this class
+* Check if the provided targets (or labels) are compatible in both
+  training and test
+
+TODO:
+    * SUPPORTED_FEAT_TYPES --> Enumerator
+    * Describe the shape of y
+    * typing.<type> --> <type>
+    * logging.Logger --> Logger
+    * Rename classes_ --> get_classes
+    * Check the return of classes_
+    * is_single_column_target --> is_target_scalar
+"""
+
 import logging
 import typing
 
@@ -31,12 +47,13 @@ class BaseTargetValidator(BaseEstimator):
     """
     A class to pre-process targets. It validates the data provided during fit (to make sure
     it matches AutoPyTorch expectation) as well as encoding the targets in case of classification
+
     Attributes:
         is_classification (bool):
             A bool that indicates if the validator should operate in classification mode.
             During classification, the targets are encoded.
         encoder (typing.Optional[BaseEstimator]):
-            Host a encoder object if the data requires transformation (for example,
+            Host an encoder object if the data requires transformation (for example,
             if provided a categorical column in a pandas DataFrame)
         enc_columns (typing.List[str])
             List of columns that where encoded
@@ -175,7 +192,7 @@ def classes_(self) -> np.ndarray:
         Complies with scikit learn classes_ attribute,
         which consist of a ndarray of shape (n_classes,)
         where n_classes are the number of classes seen while fitting
-        a encoder to the targets.
+        an encoder to the targets.
         Returns:
             classes_: np.ndarray
                 The unique classes seen during encoding of a classifier

diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py
@@ -1,3 +1,16 @@
+"""Base class for the input validator given a task
+* A wrapper class of the sklearn.base.BaseEstimator
+* The input validator for each task inherits this class
+* Check if the provided data are compatible with AutoPytorch implementation
+* Manage both target_ and feature_validator in this class
+
+TODO:
+    * typing.<type> --> <type>
+    * logging.Logger --> Logger
+    * Inherit feature_validator and target_validator from a child class
+      via super().__init__()
+"""
+
 # -*- encoding: utf-8 -*-
 import logging.handlers
 import typing

diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
@@ -1,3 +1,19 @@
+"""Base class of the provided dataset
+* Provide data validation splits based on the type of data and the user's choice, for example, holdout.
+* Provide API to return training and validation splits
+* Store the properties of the dataset which are required
+  in AutoPytorch implementation
+
+TODO:
+    * Address: https://github.com/automl/Auto-PyTorch/pull/108/
+    * Make BaseDatasetPropertiesType more informative
+    * Use private variables and public variables properly
+    * Consider more memory-efficient way to store splits
+        ==> It will consume too much memory for huge datasets
+    * Check the usage of validation and test because cross validation
+      only uses the training dataset
+"""
+
 import os
 import uuid
 from abc import ABCMeta

diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py
@@ -1,3 +1,19 @@
+"""Functions for resampling strategy or cross validation
+* Each function is used in BaseDataset to provide dataset splits
+
+TODO:
+    * DEFAULT_RESAMPLING_PARAMETERS --> keyword arguments
+    * documentation strings
+    * Make shuffle and stratified arguments rather than
+      independent methods
+    * Force the instantiation of each splitting method
+        ==> instance variables tell you what kind of splitting
+    * Delete protocol and enumerator because we do not need them
+      once the splitting methods become classes that require instantiation
+    * resampling_strategy --> splitting_fn
+    * resampling_strategy_args --> splitting_params
+"""
+
 from enum import IntEnum
 from typing import Any, Dict, List, Optional, Tuple, Union
 

diff --git a/autoPyTorch/ensemble/abstract_ensemble.py b/autoPyTorch/ensemble/abstract_ensemble.py
@@ -1,3 +1,11 @@
+"""The abstract class of ensemble classes
+* Provide methods that must be overridden by the child class
+
+TODO:
+    * Add `raise NotImplementedError`
+    * model_identifiers --> List[<NamedTuple with an appropriate name>]
+"""
+
 from abc import ABCMeta, abstractmethod
 from typing import Any, Dict, List, Tuple, Union
 

diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py
@@ -1,3 +1,30 @@
+"""The module that builds an ensemble
+* EnsembleBuilderManager serves as a central system that
+  submits an EnsembleBuilder to dask
+* EnsembleBuilder builds an ensemble using pynisher
+  so that we can easily suppress the memory usage and runtime
+* EnsembleBuilder builds an ensemble using the configurations
+  that are observed in HPO
+
+TODO:
+    * Unused arguments in EnsembleBuilderManager.__call__
+    * Remove the argument `unit_test` and separate methods
+    with patch.object(<class name>, '<method name>', side_effect=MemoryError):
+        inst = <class name>(arguments)
+        inst.<method name>()  <== MemoryError
+
+    * Remove unneeded comments
+    * Make precision in a better way (enum, np.int32 ...)
+    * Separate `raise Error` methods in EnsembleBuilder
+        + run
+        + main
+        + compute_loss_per_model
+        + get_n_best_preds
+    * Separate more general function from EnsembleBuilder
+        + get_disk_consumption
+        + _read_np_fn
+"""
+
 # -*- encoding: utf-8 -*-
 import glob
 import gzip

diff --git a/autoPyTorch/ensemble/ensemble_selection.py b/autoPyTorch/ensemble/ensemble_selection.py
@@ -1,3 +1,26 @@
+"""Ensemble selection over a library of models  # noqa
+* Build an ensemble by assigning weights to the candidate models
+* See the references in the TODO below for the underlying methods
+
+TODO:
+    * Add the following
+    References:
+        Title: Ensemble Selection from Libraries of Models
+        Authors: Rich Caruana et al.
+        URL: https://www.cs.cornell.edu/~alexn/papers/shotgun.icml04.revised.rev2.pdf
+
+    * `A copy of self` --> check if it is really true
+    * Change `<variable>_` to `_<variable>`
+    * get_models_with_weights --> seems to sort by weights in descending order
+    * soft voting ==> explanation
+    References:
+        Title: Consensus Based Ensembles of Soft Clusterings
+        Authors: Kunal Punera and Joydeep Ghosh
+        URL: https://www.researchgate.net/profile/Joydeep-Ghosh-8/publication/221188694_Consensus_Based_Ensembles_of_Soft_Clusterings/links/02e7e521fe367e06c3000000/Consensus-Based-Ensembles-of-Soft-Clusterings.pdf
+    * _calculate_weights ==> what about np.sum(weights) > 1??
+    * Refactor _fit() and add the shape of predictions
+"""
+
 from collections import Counter
 from typing import Any, Dict, List, Tuple, Union
 

diff --git a/autoPyTorch/ensemble/singlebest_ensemble.py b/autoPyTorch/ensemble/singlebest_ensemble.py
@@ -1,3 +1,15 @@
+"""Backup solution class for a crashed search
+* Provide the best configuration instead of an ensemble
+  with multiple models
+
+TODO:
+    * Change `<variable>_` to `_<variable>`
+    * Add more `raise <Error>` since this class is supposed
+      to be used in very specific situations
+    * Check the contexts where this class is called because
+      self.weights_ and self.indices_ are not clear enough
+"""
+
 import os
 from typing import Any, Dict, List, Tuple, Union
 

diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -1,3 +1,39 @@
+"""This module provides model estimator pipelines
+This module has the following pipelines:
+    - MyTraditionalTabularClassificationPipeline
+        Wrapper class for traditional ML classification methods
+        such as CatBoost, RandomForest
+    - MyTraditionalTabularRegressionPipeline
+        Wrapper class for traditional ML regression methods
+        such as RandomForest
+    - DummyClassificationPipeline
+        Wrapper class for dummy classifier in sklearn
+    - DummyRegressionPipeline
+        Wrapper class for dummy regressor in sklearn
+    - AbstractEvaluator
+        The interface for the pipeline evaluators
+        to optimize via SMAC
+
+Note: Dummy model is an estimator using a very simple rule
+      and this is used for the minimum baseline for each task.
+      https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html  # noqa: W291
+      https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html  # noqa: W291
+
+TODO:
+    * Describe the definition of sample_weight
+    * import autoPyTorch.pipeline.xxx as shorter names
+    * Describe the shape of returns in predict and predict_proba
+    * Improve the documentation of additional_run_info
+    * Change get_pipeline_representation --> __repr__
+    * delete self.random_state, self.init_params, self.config,
+      and self.dataset_properties
+      (because they are not used)
+    * [named_step](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)  # noqa: W291
+    * The typing of config in DummyXXXPipeline
+    * Add enumerator for additional_run_info
+    * Rename fit_and_suppress_warnings
+"""
+
 import logging.handlers
 import time
 import warnings