modAL-python · cosmic-cortex · Nov 1, 2020 · Sep 24, 2020 · Sep 25, 2020 · Sep 28, 2020
diff --git a/README.md b/README.md
@@ -100,12 +100,11 @@ import numpy as np
 X = np.random.choice(np.linspace(0, 20, 10000), size=200, replace=False).reshape(-1, 1)
 y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)
 ```
-For active learning, we shall define a custom query strategy tailored to Gaussian processes. In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance and the instance itself. In our case, the arguments are ```regressor``` and ```X```.
+For active learning, we shall define a custom query strategy tailored to Gaussian processes. In a nutshell, a *query strategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance. In our case, the arguments are ```regressor``` and ```X```.
 ```python
 def GP_regression_std(regressor, X):
     _, std = regressor.predict(X, return_std=True)
-    query_idx = np.argmax(std)
-    return query_idx, X[query_idx]
+    return np.argmax(std)
 ```
 After setting up the query strategy and the data, the active learner can be initialized.
 ```python

diff --git a/docs/source/content/examples/active_regression.ipynb b/docs/source/content/examples/active_regression.ipynb
@@ -70,7 +70,7 @@
    "metadata": {},
    "source": [
     "## Uncertainty measure and query strategy for Gaussian processes\n",
-    "For active learning, we shall define a custom query strategy tailored to Gaussian processes. More information on how to write your custom query strategies can be found at the page [Extending modAL](https://cosmic-cortex.github.io/modAL/Extending-modAL). In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance and the instance itself. In our case, the arguments are ```regressor``` and ```X```."
+    "For active learning, we shall define a custom query strategy tailored to Gaussian processes. More information on how to write your custom query strategies can be found at the page [Extending modAL](https://cosmic-cortex.github.io/modAL/Extending-modAL). In a nutshell, a *query strategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance. In our case, the arguments are ```regressor``` and ```X```."
    ]
   },
   {
@@ -81,8 +81,7 @@
    "source": [
     "def GP_regression_std(regressor, X):\n",
     "    _, std = regressor.predict(X, return_std=True)\n",
-    "    query_idx = np.argmax(std)\n",
-    "    return query_idx, X[query_idx]"
+    "    return np.argmax(std)"
    ]
   },
   {
@@ -234,4 +233,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/docs/source/content/overview/Extending-modAL.ipynb b/docs/source/content/overview/Extending-modAL.ipynb
@@ -27,11 +27,8 @@
     "    # measure the utility of each instance in the pool\n",
     "    utility = utility_measure(classifier, X)\n",
     "\n",
-    "    # select the indices of the instances to be queried\n",
-    "    query_idx = select_instances(utility)\n",
-    "\n",
-    "    # return the indices and the instances\n",
-    "    return query_idx, X[query_idx]"
+    "    # select and return the indices of the instances to be queried\n",
+    "    return select_instances(utility)"
    ]
   },
   {
@@ -213,8 +210,7 @@
     "# classifier uncertainty and classifier margin\n",
     "def custom_query_strategy(classifier, X, n_instances=1):\n",
     "    utility = linear_combination(classifier, X)\n",
-    "    query_idx = multi_argmax(utility, n_instances=n_instances)\n",
-    "    return query_idx, X[query_idx]\n",
+    "    return multi_argmax(utility, n_instances=n_instances)\n",
     "\n",
     "custom_query_learner = ActiveLearner(\n",
     "    estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),\n",
@@ -299,4 +295,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/docs/source/content/overview/modAL-in-a-nutshell.rst b/docs/source/content/overview/modAL-in-a-nutshell.rst
@@ -118,15 +118,13 @@ the *noisy sine* function:
 For active learning, we shall define a custom query strategy tailored to
 Gaussian processes. In a nutshell, a *query stategy* in modAL is a
 function taking (at least) two arguments (an estimator object and a pool
-of examples), outputting the index of the queried instance and the
-instance itself. In our case, the arguments are ``regressor`` and ``X``.
+of examples), outputting the index of the queried instance. In our case, the arguments are ``regressor`` and ``X``.
 
 .. code:: python
 
     def GP_regression_std(regressor, X):
         _, std = regressor.predict(X, return_std=True)
-        query_idx = np.argmax(std)
-        return query_idx, X[query_idx]
+        return np.argmax(std)
 
 After setting up the query strategy and the data, the active learner can
 be initialized.

diff --git a/examples/active_regression.py b/examples/active_regression.py
@@ -12,8 +12,7 @@
 # query strategy for regression
 def GP_regression_std(regressor, X):
     _, std = regressor.predict(X, return_std=True)
-    query_idx = np.argmax(std)
-    return query_idx, X[query_idx]
+    return np.argmax(std)
 
 
 # generating the data

diff --git a/examples/custom_query_strategies.py b/examples/custom_query_strategies.py
@@ -5,18 +5,16 @@
 
 The first two arguments of a query strategy function is always the estimator and the pool
 of instances to be queried from. Additional arguments are accepted as keyword arguments.
-A valid query strategy function always returns a tuple of the indices of the queried
-instances and the instances themselves.
+A valid query strategy function always returns the indices of the queried
+instances.
 
 def custom_query_strategy(classifier, X, a_keyword_argument=42):
     # measure the utility of each instance in the pool
     utility = utility_measure(classifier, X)
 
-    # select the indices of the instances to be queried
-    query_idx = select_instances(utility)
+    # select and return the indices of the instances to be queried
+    return select_instances(utility)
 
-    # return the indices and the instances
-    return query_idx, X[query_idx]
 
 This function can be used in the active learning workflow.
 
@@ -97,8 +95,7 @@ def custom_query_strategy(classifier, X, a_keyword_argument=42):
 # classifier uncertainty and classifier margin
 def custom_query_strategy(classifier, X, n_instances=1):
     utility = linear_combination(classifier, X)
-    query_idx = multi_argmax(utility, n_instances=n_instances)
-    return query_idx, X[query_idx]
+    return multi_argmax(utility, n_instances=n_instances)
 
 custom_query_learner = ActiveLearner(
     estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),

diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py
@@ -62,12 +62,10 @@ def max_entropy(learner, X, n_instances=1, T=100):
     expected_p = np.mean(MC_samples, axis=0)
     acquisition = - np.sum(expected_p * np.log(expected_p + 1e-10), axis=-1)  # [batch size]
     idx = (-acquisition).argsort()[:n_instances]
-    query_idx = random_subset[idx]
-    return query_idx, X[query_idx]
+    return random_subset[idx]
 
 def uniform(learner, X, n_instances=1):
-    query_idx = np.random.choice(range(len(X)), size=n_instances, replace=False)
-    return query_idx, X[query_idx]
+    return np.random.choice(range(len(X)), size=n_instances, replace=False)
 
 """
 Training the ActiveLearner

diff --git a/examples/shape_learning.py b/examples/shape_learning.py
@@ -57,8 +57,7 @@
 
 
 def random_sampling(classsifier, X):
-    query_idx = np.random.randint(len(X))
-    return query_idx, X[query_idx]
+    return np.random.randint(len(X))
 
 
 X_pool = deepcopy(X_full)

diff --git a/modAL/acquisition.py b/modAL/acquisition.py
@@ -104,7 +104,7 @@ def optimizer_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1) -> np.
 
 
 def max_PI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
-           n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+           n_instances: int = 1) -> np.ndarray:
     """
     Maximum PI query strategy. Selects the instance with highest probability of improvement.
 
@@ -118,13 +118,11 @@ def max_PI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
         The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
     """
     pi = optimizer_PI(optimizer, X, tradeoff=tradeoff)
-    query_idx = multi_argmax(pi, n_instances=n_instances)
-
-    return query_idx, X[query_idx]
+    return multi_argmax(pi, n_instances=n_instances)
 
 
 def max_EI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
-           n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+           n_instances: int = 1) -> np.ndarray:
     """
     Maximum EI query strategy. Selects the instance with highest expected improvement.
 
@@ -138,13 +136,11 @@ def max_EI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
         The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
     """
     ei = optimizer_EI(optimizer, X, tradeoff=tradeoff)
-    query_idx = multi_argmax(ei, n_instances=n_instances)
-
-    return query_idx, X[query_idx]
+    return multi_argmax(ei, n_instances=n_instances)
 
 
 def max_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1,
-            n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+            n_instances: int = 1) -> np.ndarray:
     """
     Maximum UCB query strategy. Selects the instance with highest upper confidence bound.
 
@@ -158,6 +154,4 @@ def max_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1,
         The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
     """
     ucb = optimizer_UCB(optimizer, X, beta=beta)
-    query_idx = multi_argmax(ucb, n_instances=n_instances)
-
-    return query_idx, X[query_idx]
+    return multi_argmax(ucb, n_instances=n_instances)
diff --git a/modAL/batch.py b/modAL/batch.py
@@ -114,7 +114,7 @@ def select_instance(
     unlabeled_indices = [i for i in range(n_pool) if mask[i]]
     best_instance_index = unlabeled_indices[best_instance_index_in_unlabeled]
     mask[best_instance_index] = 0
-    return best_instance_index, np.expand_dims(X_pool[best_instance_index], axis=0), mask
+    return best_instance_index, X_pool[[best_instance_index]], mask
 
 
 def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
@@ -142,11 +142,16 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
     """
     # Make a local copy of our classifier's training data.
     # Define our record container and record the best cold start instance in the case of cold start.
+
+    # transform unlabeled data if needed
+    if classifier.on_transformed:
+        unlabeled = classifier.transform_without_estimating(unlabeled)
+
     if classifier.X_training is None:
         best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs)
         instance_index_ranking = [best_coldstart_instance_index]
     elif classifier.X_training.shape[0] > 0:
-        labeled = classifier.X_training[:]
+        labeled = classifier.Xt_training[:] if classifier.on_transformed else classifier.X_training[:]
         instance_index_ranking = []
 
     # The maximum number of records to sample.
@@ -180,7 +185,7 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
                                metric: Union[str, Callable] = 'euclidean',
                                n_jobs: Optional[int] = None,
                                **uncertainty_measure_kwargs
-                               ) -> Tuple[np.ndarray, Union[np.ndarray, sp.csr_matrix]]:
+                               ) -> np.ndarray:
     """
     Batch sampling query strategy. Selects the least sure instances for labelling.
 
@@ -206,6 +211,6 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
         Indices of the instances from `X` chosen to be labelled; records from `X` chosen to be labelled.
     """
     uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs)
-    query_indices = ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
+    return ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
                                  n_instances=n_instances, metric=metric, n_jobs=n_jobs)
-    return query_indices, X[query_indices]
+
diff --git a/modAL/disagreement.py b/modAL/disagreement.py
@@ -104,7 +104,7 @@ def KL_max_disagreement(committee: BaseCommittee, X: modALinput, **predict_proba
 
 def vote_entropy_sampling(committee: BaseCommittee, X: modALinput,
                           n_instances: int = 1, random_tie_break=False,
-                          **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                          **disagreement_measure_kwargs) -> np.ndarray:
     """
     Vote entropy sampling strategy.
 
@@ -124,16 +124,14 @@ def vote_entropy_sampling(committee: BaseCommittee, X: modALinput,
     disagreement = vote_entropy(committee, X, **disagreement_measure_kwargs)
 
     if not random_tie_break:
-        query_idx = multi_argmax(disagreement, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
+        return multi_argmax(disagreement, n_instances=n_instances)
 
-    return query_idx, X[query_idx]
+    return shuffled_argmax(disagreement, n_instances=n_instances)
 
 
 def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput,
                                n_instances: int = 1, random_tie_break=False,
-                               **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                               **disagreement_measure_kwargs) -> np.ndarray:
     """
     Consensus entropy sampling strategy.
 
@@ -153,16 +151,14 @@ def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput,
     disagreement = consensus_entropy(committee, X, **disagreement_measure_kwargs)
 
     if not random_tie_break:
-        query_idx = multi_argmax(disagreement, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
+        return multi_argmax(disagreement, n_instances=n_instances)
 
-    return query_idx, X[query_idx]
+    return shuffled_argmax(disagreement, n_instances=n_instances)
 
 
 def max_disagreement_sampling(committee: BaseCommittee, X: modALinput,
                               n_instances: int = 1, random_tie_break=False,
-                              **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                              **disagreement_measure_kwargs) -> np.ndarray:
     """
     Maximum disagreement sampling strategy.
 
@@ -182,16 +178,14 @@ def max_disagreement_sampling(committee: BaseCommittee, X: modALinput,
     disagreement = KL_max_disagreement(committee, X, **disagreement_measure_kwargs)
 
     if not random_tie_break:
-        query_idx = multi_argmax(disagreement, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
+        return multi_argmax(disagreement, n_instances=n_instances)
 
-    return query_idx, X[query_idx]
+    return shuffled_argmax(disagreement, n_instances=n_instances)
 
 
 def max_std_sampling(regressor: BaseEstimator, X: modALinput,
                      n_instances: int = 1,  random_tie_break=False,
-                     **predict_kwargs) -> Tuple[np.ndarray, modALinput]:
+                     **predict_kwargs) -> np.ndarray:
     """
     Regressor standard deviation sampling strategy.
 
@@ -211,8 +205,6 @@ def max_std_sampling(regressor: BaseEstimator, X: modALinput,
     std = std.reshape(X.shape[0], )
 
     if not random_tie_break:
-        query_idx = multi_argmax(std, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(std, n_instances=n_instances)
+        return multi_argmax(std, n_instances=n_instances)
 
-    return query_idx, X[query_idx]
+    return shuffled_argmax(std, n_instances=n_instances)
diff --git a/modAL/expected_error.py b/modAL/expected_error.py
@@ -10,14 +10,14 @@
 from sklearn.exceptions import NotFittedError
 
 from modAL.models import ActiveLearner
-from modAL.utils.data import modALinput, data_vstack
+from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows, data_shape, add_row
 from modAL.utils.selection import multi_argmax, shuffled_argmax
 from modAL.uncertainty import _proba_uncertainty, _proba_entropy
 
 
 def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary',
                              p_subsample: np.float = 1.0, n_instances: int = 1,
-                             random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
+                             random_tie_break: bool = False) -> np.ndarray:
     """
     Expected error reduction query strategy.
 
@@ -38,31 +38,30 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str =
 
 
     Returns:
-        The indices of the instances from X chosen to be labelled;
-        the instances from X chosen to be labelled.
+        The indices of the instances from X chosen to be labelled.
     """
 
     assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
     assert loss in ['binary', 'log'], 'loss must be \'binary\' or \'log\''
 
-    expected_error = np.zeros(shape=(len(X), ))
+    expected_error = np.zeros(shape=(data_shape(X)[0],))
     possible_labels = np.unique(learner.y_training)
 
     try:
         X_proba = learner.predict_proba(X)
     except NotFittedError:
         # TODO: implement a proper cold-start
-        return 0, X[0]
+        return np.array([0])
 
     cloned_estimator = clone(learner.estimator)
 
-    for x_idx, x in enumerate(X):
+    for x_idx, x in enumerate_data(X):
         # subsample the data if needed
         if np.random.rand() <= p_subsample:
-            X_reduced = np.delete(X, x_idx, axis=0)
+            X_reduced = drop_rows(X, x_idx)
             # estimate the expected error
             for y_idx, y in enumerate(possible_labels):
-                X_new = data_vstack((learner.X_training, np.expand_dims(x, axis=0)))
+                X_new = add_row(learner.X_training, x)
                 y_new = data_vstack((learner.y_training, np.array(y).reshape(1,)))
 
                 cloned_estimator.fit(X_new, y_new)
@@ -78,8 +77,6 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str =
             expected_error[x_idx] = np.inf
 
     if not random_tie_break:
-        query_idx = multi_argmax(-expected_error, n_instances)
-    else:
-        query_idx = shuffled_argmax(-expected_error, n_instances)
+        return multi_argmax(-expected_error, n_instances)
 
-    return query_idx, X[query_idx]
+    return shuffled_argmax(-expected_error, n_instances)