From 802c0293ed7b10c18e8af04ad8b2a7d4d9bbf773 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sat, 21 Dec 2024 02:14:32 +0100
Subject: [PATCH 01/27] Update sklearn_questions.py

---
 sklearn_questions.py | 287 ++++++++++++++++++++++++-------------------
 1 file changed, 158 insertions(+), 129 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index fa02e0d..8aaacdf 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -1,188 +1,217 @@
-"""Assignment - making a sklearn estimator and cv splitter.
-
-The goal of this assignment is to implement by yourself:
-
-- a scikit-learn estimator for the KNearestNeighbors for classification
-  tasks and check that it is working properly.
-- a scikit-learn CV splitter where the splits are based on a Pandas
-  DateTimeIndex.
-
-Detailed instructions for question 1:
-The nearest neighbor classifier predicts for a point X_i the target y_k of
-the training sample X_k which is the closest to X_i. We measure proximity with
-the Euclidean distance. The model will be evaluated with the accuracy (average
-number of samples corectly classified). You need to implement the `fit`,
-`predict` and `score` methods for this class. The code you write should pass
-the test we implemented. You can run the tests by calling at the root of the
-repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a
-scikit-learn estimator needs to check that the input given to `fit` and
-`predict` are correct using the `check_*` functions imported in the file.
-You can find more information on how they should be used in the following doc:
-https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
-Make sure to use them to pass `test_nearest_neighbor_check_estimator`.
-
-
-Detailed instructions for question 2:
-The data to split should contain the index or one column in
-datatime format. Then the aim is to split the data between train and test
-sets when for each pair of successive months, we learn on the first and
-predict of the following. For example if you have data distributed from
-november 2020 to march 2021, you have have 4 splits. The first split
-will allow to learn on november data and predict on december data, the
-second split to learn december and predict on january etc.
-
-We also ask you to respect the pep8 convention: https://pep8.org. This will be
-enforced with `flake8`. You can check that there is no flake8 errors by
-calling `flake8` at the root of the repo.
-
-Finally, you need to write docstrings for the methods you code and for the
-class. The docstring will be checked using `pydocstyle` that you can also
-call at the root of the repo.
-
-Hints
------
-- You can use the function:
-
-from sklearn.metrics.pairwise import pairwise_distances
-
-to compute distances between 2 sets of samples.
-"""
-import numpy as np
-import pandas as pd
-
-from sklearn.base import BaseEstimator
-from sklearn.base import ClassifierMixin
+class KNearestNeighbors(BaseEstimator, ClassifierMixin):
+    """
+    K-Nearest Neighbors classifier.
 
-from sklearn.model_selection import BaseCrossValidator
+    Parameters
+    ----------
+    n_neighbors : int, default=1
+        Number of neighbors to use for predictions.
 
-from sklearn.utils.validation import check_X_y, check_is_fitted
-from sklearn.utils.validation import check_array
-from sklearn.utils.multiclass import check_classification_targets
-from sklearn.metrics.pairwise import pairwise_distances
+    Attributes
+    ----------
+    X_ : ndarray of shape (n_samples, n_features)
+        Training data stored during fit.
 
+    y_ : ndarray of shape (n_samples,)
+        Labels stored during fit.
 
-class KNearestNeighbors(BaseEstimator, ClassifierMixin):
-    """KNearestNeighbors classifier."""
+    n_features_in_ : int
+        Number of features in the training data.
+    """
 
-    def __init__(self, n_neighbors=1):  # noqa: D107
+    def __init__(self, n_neighbors=1):
         self.n_neighbors = n_neighbors
 
     def fit(self, X, y):
-        """Fitting function.
+        """
+        Fit the KNN classifier on training data.
 
-         Parameters
+        Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
-            Data to train the model.
-        y : ndarray, shape (n_samples,)
-            Labels associated with the training data.
+        X : ndarray of shape (n_samples, n_features)
+            Training data.
+
+        y : ndarray of shape (n_samples,)
+            Target labels for training data.
 
         Returns
-        ----------
-        self : instance of KNearestNeighbors
-            The current instance of the classifier
+        -------
+        self : object
+            Returns the instance itself.
         """
+        # Validate input
+        X, y = check_X_y(X, y)
+        check_classification_targets(y)
+
+        self.X_ = X
+        self.y_ = y
+        self.n_features_in_ = X.shape[1]
         return self
 
     def predict(self, X):
-        """Predict function.
+        """
+        Predict the class labels for the given data.
 
         Parameters
         ----------
-        X : ndarray, shape (n_test_samples, n_features)
-            Data to predict on.
+        X : ndarray of shape (n_samples, n_features)
+            Test data.
 
         Returns
-        ----------
-        y : ndarray, shape (n_test_samples,)
-            Predicted class labels for each test data sample.
+        -------
+        y_pred : ndarray of shape (n_samples,)
+            Predicted class labels.
         """
-        y_pred = np.zeros(X.shape[0])
-        return y_pred
+        # Check if the classifier has been fitted
+        check_is_fitted(self, ["X_", "y_"])
+
+        # Validate input
+        X = check_array(X)
+
+        # Compute distances and predict
+        distances = pairwise_distances(X, self.X_)
+        nearest_indices = np.argmin(distances, axis=1)
+        return self.y_[nearest_indices]
 
     def score(self, X, y):
-        """Calculate the score of the prediction.
+        """
+        Compute the accuracy of the classifier.
 
         Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
-            Data to score on.
-        y : ndarray, shape (n_samples,)
-            target values.
+        X : ndarray of shape (n_samples, n_features)
+            Test data.
+
+        y : ndarray of shape (n_samples,)
+            True labels for test data.
 
         Returns
-        ----------
+        -------
         score : float
-            Accuracy of the model computed for the (X, y) pairs.
+            Mean accuracy of predictions.
         """
-        return 0.
+        y_pred = self.predict(X)
+        return np.mean(y_pred == y)
+Updated MonthlySplit Class
+python
+Copier le code
+from sklearn.model_selection import BaseCrossValidator
+import numpy as np
+import pandas as pd
 
 
 class MonthlySplit(BaseCrossValidator):
-    """CrossValidator based on monthly split.
-
-    Split data based on the given `time_col` (or default to index). Each split
-    corresponds to one month of data for the training and the next month of
-    data for the test.
+    """
+    Cross-validator that splits data based on months.
 
     Parameters
     ----------
-    time_col : str, defaults to 'index'
-        Column of the input DataFrame that will be used to split the data. This
-        column should be of type datetime. If split is called with a DataFrame
-        for which this column is not a datetime, it will raise a ValueError.
-        To use the index as column just set `time_col` to `'index'`.
+    time_col : str, default='index'
+        Column to use for date-based splitting. If 'index', the index of the
+        DataFrame is used as the date column.
+
+    Methods
+    -------
+    get_n_splits(X, y=None, groups=None)
+        Return the number of splits.
+
+    split(X, y=None, groups=None)
+        Generate indices for training and testing splits.
+
+    Raises
+    ------
+    ValueError
+        If the `time_col` is not found or not a datetime type.
     """
 
-    def __init__(self, time_col='index'):  # noqa: D107
+    def __init__(self, time_col='index'):
         self.time_col = time_col
 
     def get_n_splits(self, X, y=None, groups=None):
-        """Return the number of splitting iterations in the cross-validator.
+        """
+        Return the number of splitting iterations in the cross-validator.
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
-            Training data, where `n_samples` is the number of samples
-            and `n_features` is the number of features.
-        y : array-like of shape (n_samples,)
-            Always ignored, exists for compatibility.
-        groups : array-like of shape (n_samples,)
-            Always ignored, exists for compatibility.
+        X : DataFrame
+            Input data with datetime information.
+
+        y : None
+            Ignored, exists for API compatibility.
+
+        groups : None
+            Ignored, exists for API compatibility.
 
         Returns
         -------
         n_splits : int
-            The number of splits.
+            Number of month-based splits.
         """
-        return 0
+        time_data = self._get_time_data(X)
+        return len(time_data.dt.to_period("M").unique()) - 1
 
-    def split(self, X, y, groups=None):
-        """Generate indices to split data into training and test set.
+    def split(self, X, y=None, groups=None):
+        """
+        Generate indices to split data into training and test set.
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
-            Training data, where `n_samples` is the number of samples
-            and `n_features` is the number of features.
-        y : array-like of shape (n_samples,)
-            Always ignored, exists for compatibility.
-        groups : array-like of shape (n_samples,)
-            Always ignored, exists for compatibility.
+        X : DataFrame
+            Input data with datetime information.
+
+        y : None
+            Ignored, exists for API compatibility.
+
+        groups : None
+            Ignored, exists for API compatibility.
 
         Yields
         ------
-        idx_train : ndarray
-            The training set indices for that split.
-        idx_test : ndarray
-            The testing set indices for that split.
+        train_indices : ndarray
+            Indices for training data.
+
+        test_indices : ndarray
+            Indices for testing data.
         """
+        time_data = self._get_time_data(X)
+        months = time_data.dt.to_period("M").unique()
+
+        for i in range(len(months) - 1):
+            train_mask = time_data.dt.to_period("M") == months[i]
+            test_mask = time_data.dt.to_period("M") == months[i + 1]
 
-        n_samples = X.shape[0]
-        n_splits = self.get_n_splits(X, y, groups)
-        for i in range(n_splits):
-            idx_train = range(n_samples)
-            idx_test = range(n_samples)
-            yield (
-                idx_train, idx_test
-            )
+            train_indices = np.where(train_mask)[0]
+            test_indices = np.where(test_mask)[0]
+
+            yield train_indices, test_indices
+
+    def _get_time_data(self, X):
+        """
+        Extract the datetime data from the specified column or index.
+
+        Parameters
+        ----------
+        X : DataFrame
+            Input data.
+
+        Returns
+        -------
+        time_data : Series
+            Series of datetime values.
+
+        Raises
+        ------
+        ValueError
+            If the column is not found or is not datetime-like.
+        """
+        if self.time_col == 'index':
+            if not isinstance(X.index, pd.DatetimeIndex):
+                raise ValueError("Index must be a DatetimeIndex.")
+            return X.index
+        elif self.time_col in X.columns:
+            time_data = X[self.time_col]
+            if not np.issubdtype(time_data.dtype, np.datetime64):
+                raise ValueError(f"Column {self.time_col} must be of datetime type.")
+            return time_data
+        else:
+            raise ValueError(f"Column {self.time_col} not found in input data.")

From 9bbfc1d28868cd7859b0b27e545e0e49fba3cf74 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sat, 21 Dec 2024 02:17:22 +0100
Subject: [PATCH 02/27] Update sklearn_questions.py

---
 sklearn_questions.py | 63 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 8aaacdf..95c5868 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -1,3 +1,66 @@
+"""Assignment - making a sklearn estimator and cv splitter.
+
+The goal of this assignment is to implement by yourself:
+
+- a scikit-learn estimator for the KNearestNeighbors for classification
+  tasks and check that it is working properly.
+- a scikit-learn CV splitter where the splits are based on a Pandas
+  DateTimeIndex.
+
+Detailed instructions for question 1:
+The nearest neighbor classifier predicts for a point X_i the target y_k of
+the training sample X_k which is the closest to X_i. We measure proximity with
+the Euclidean distance. The model will be evaluated with the accuracy (average
+number of samples corectly classified). You need to implement the `fit`,
+`predict` and `score` methods for this class. The code you write should pass
+the test we implemented. You can run the tests by calling at the root of the
+repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a
+scikit-learn estimator needs to check that the input given to `fit` and
+`predict` are correct using the `check_*` functions imported in the file.
+You can find more information on how they should be used in the following doc:
+https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
+Make sure to use them to pass `test_nearest_neighbor_check_estimator`.
+
+
+Detailed instructions for question 2:
+The data to split should contain the index or one column in
+datatime format. Then the aim is to split the data between train and test
+sets when for each pair of successive months, we learn on the first and
+predict of the following. For example if you have data distributed from
+november 2020 to march 2021, you have have 4 splits. The first split
+will allow to learn on november data and predict on december data, the
+second split to learn december and predict on january etc.
+
+We also ask you to respect the pep8 convention: https://pep8.org. This will be
+enforced with `flake8`. You can check that there is no flake8 errors by
+calling `flake8` at the root of the repo.
+
+Finally, you need to write docstrings for the methods you code and for the
+class. The docstring will be checked using `pydocstyle` that you can also
+call at the root of the repo.
+
+Hints
+-----
+- You can use the function:
+
+from sklearn.metrics.pairwise import pairwise_distances
+
+to compute distances between 2 sets of samples.
+"""
+import numpy as np
+import pandas as pd
+
+from sklearn.base import BaseEstimator
+from sklearn.base import ClassifierMixin
+
+from sklearn.model_selection import BaseCrossValidator
+
+from sklearn.utils.validation import check_X_y, check_is_fitted
+from sklearn.utils.validation import check_array
+from sklearn.utils.multiclass import check_classification_targets
+from sklearn.metrics.pairwise import pairwise_distances
+
+
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
     """
     K-Nearest Neighbors classifier.

From f79352effd5db2dc91a990115dca4a6e64850c37 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sat, 21 Dec 2024 02:23:59 +0100
Subject: [PATCH 03/27] Update sklearn_questions.py

---
 sklearn_questions.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 95c5868..6d21e8a 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -46,7 +46,9 @@
 from sklearn.metrics.pairwise import pairwise_distances
 
 to compute distances between 2 sets of samples.
-"""
+"""`
+
+
 import numpy as np
 import pandas as pd
 
@@ -131,9 +133,8 @@ def predict(self, X):
         # Validate input
         X = check_array(X)
 
-        # Compute distances and predict
-        distances = pairwise_distances(X, self.X_)
-        nearest_indices = np.argmin(distances, axis=1)
+        # Compute nearest neighbors and predict
+        nearest_indices, _ = pairwise_distances_argmin_min(X, self.X_)
         return self.y_[nearest_indices]
 
     def score(self, X, y):
@@ -155,12 +156,6 @@ def score(self, X, y):
         """
         y_pred = self.predict(X)
         return np.mean(y_pred == y)
-Updated MonthlySplit Class
-python
-Copier le code
-from sklearn.model_selection import BaseCrossValidator
-import numpy as np
-import pandas as pd
 
 
 class MonthlySplit(BaseCrossValidator):
@@ -187,9 +182,11 @@ class MonthlySplit(BaseCrossValidator):
         If the `time_col` is not found or not a datetime type.
     """
 
+  
     def __init__(self, time_col='index'):
         self.time_col = time_col
 
+  
     def get_n_splits(self, X, y=None, groups=None):
         """
         Return the number of splitting iterations in the cross-validator.
@@ -213,6 +210,7 @@ def get_n_splits(self, X, y=None, groups=None):
         time_data = self._get_time_data(X)
         return len(time_data.dt.to_period("M").unique()) - 1
 
+  
     def split(self, X, y=None, groups=None):
         """
         Generate indices to split data into training and test set.
@@ -248,6 +246,7 @@ def split(self, X, y=None, groups=None):
 
             yield train_indices, test_indices
 
+  
     def _get_time_data(self, X):
         """
         Extract the datetime data from the specified column or index.

From 4f98b3ff54201ca04362a1ed7582c153116adcad Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sat, 21 Dec 2024 02:29:56 +0100
Subject: [PATCH 04/27] Update sklearn_questions.py

---
 sklearn_questions.py | 69 ++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 6d21e8a..0322ec5 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -1,4 +1,5 @@
-"""Assignment - making a sklearn estimator and cv splitter.
+"""
+Assignment - making a sklearn estimator and CV splitter.
 
 The goal of this assignment is to implement by yourself:
 
@@ -11,7 +12,7 @@
 The nearest neighbor classifier predicts for a point X_i the target y_k of
 the training sample X_k which is the closest to X_i. We measure proximity with
 the Euclidean distance. The model will be evaluated with the accuracy (average
-number of samples corectly classified). You need to implement the `fit`,
+number of samples correctly classified). You need to implement the `fit`,
 `predict` and `score` methods for this class. The code you write should pass
 the test we implemented. You can run the tests by calling at the root of the
 repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a
@@ -21,17 +22,16 @@
 https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
 Make sure to use them to pass `test_nearest_neighbor_check_estimator`.
 
-
 Detailed instructions for question 2:
 The data to split should contain the index or one column in
-datatime format. Then the aim is to split the data between train and test
+datetime format. Then the aim is to split the data between train and test
 sets when for each pair of successive months, we learn on the first and
-predict of the following. For example if you have data distributed from
-november 2020 to march 2021, you have have 4 splits. The first split
-will allow to learn on november data and predict on december data, the
-second split to learn december and predict on january etc.
+predict on the following. For example if you have data distributed from
+November 2020 to March 2021, you have have 4 splits. The first split
+will allow to learn on November data and predict on December data, the
+second split to learn December and predict on January etc.
 
-We also ask you to respect the pep8 convention: https://pep8.org. This will be
+We also ask you to respect the PEP8 convention: https://pep8.org. This will be
 enforced with `flake8`. You can check that there is no flake8 errors by
 calling `flake8` at the root of the repo.
 
@@ -46,19 +46,14 @@
 from sklearn.metrics.pairwise import pairwise_distances
 
 to compute distances between 2 sets of samples.
-"""`
-
+"""
 
 import numpy as np
 import pandas as pd
 
-from sklearn.base import BaseEstimator
-from sklearn.base import ClassifierMixin
-
+from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.model_selection import BaseCrossValidator
-
-from sklearn.utils.validation import check_X_y, check_is_fitted
-from sklearn.utils.validation import check_array
+from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.metrics.pairwise import pairwise_distances
 
@@ -133,9 +128,19 @@ def predict(self, X):
         # Validate input
         X = check_array(X)
 
-        # Compute nearest neighbors and predict
-        nearest_indices, _ = pairwise_distances_argmin_min(X, self.X_)
-        return self.y_[nearest_indices]
+        # Compute distances
+        distances = pairwise_distances(X, self.X_, metric='euclidean')
+
+        # Find the indices of the k nearest neighbors
+        neighbors_idx = np.argsort(distances, axis=1)[:, :self.n_neighbors]
+
+        # Gather the neighbor labels
+        neighbor_labels = self.y_[neighbors_idx]
+
+        # Predict by majority vote
+        y_pred = np.array([np.bincount(row.astype(int)).argmax() for row in neighbor_labels])
+
+        return y_pred
 
     def score(self, X, y):
         """
@@ -182,11 +187,9 @@ class MonthlySplit(BaseCrossValidator):
         If the `time_col` is not found or not a datetime type.
     """
 
-  
     def __init__(self, time_col='index'):
         self.time_col = time_col
 
-  
     def get_n_splits(self, X, y=None, groups=None):
         """
         Return the number of splitting iterations in the cross-validator.
@@ -208,9 +211,9 @@ def get_n_splits(self, X, y=None, groups=None):
             Number of month-based splits.
         """
         time_data = self._get_time_data(X)
-        return len(time_data.dt.to_period("M").unique()) - 1
+        unique_months = time_data.dt.to_period("M").drop_duplicates()
+        return max(len(unique_months) - 1, 0)
 
-  
     def split(self, X, y=None, groups=None):
         """
         Generate indices to split data into training and test set.
@@ -235,18 +238,20 @@ def split(self, X, y=None, groups=None):
             Indices for testing data.
         """
         time_data = self._get_time_data(X)
-        months = time_data.dt.to_period("M").unique()
+        unique_months = time_data.dt.to_period("M").drop_duplicates()
+
+        for i in range(len(unique_months) - 1):
+            train_month = unique_months[i]
+            test_month = unique_months[i + 1]
 
-        for i in range(len(months) - 1):
-            train_mask = time_data.dt.to_period("M") == months[i]
-            test_mask = time_data.dt.to_period("M") == months[i + 1]
+            train_mask = time_data.dt.to_period("M") == train_month
+            test_mask = time_data.dt.to_period("M") == test_month
 
             train_indices = np.where(train_mask)[0]
             test_indices = np.where(test_mask)[0]
 
             yield train_indices, test_indices
 
-  
     def _get_time_data(self, X):
         """
         Extract the datetime data from the specified column or index.
@@ -269,11 +274,11 @@ def _get_time_data(self, X):
         if self.time_col == 'index':
             if not isinstance(X.index, pd.DatetimeIndex):
                 raise ValueError("Index must be a DatetimeIndex.")
-            return X.index
+            return pd.Series(X.index)
         elif self.time_col in X.columns:
             time_data = X[self.time_col]
             if not np.issubdtype(time_data.dtype, np.datetime64):
-                raise ValueError(f"Column {self.time_col} must be of datetime type.")
+                raise ValueError(f"Column '{self.time_col}' must be of datetime type.")
             return time_data
         else:
-            raise ValueError(f"Column {self.time_col} not found in input data.")
+            raise ValueError(f"Column '{self.time_col}' not found in input data.")

From 713b620d7c6bfdc099925cae4b7e304383bf893d Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sat, 21 Dec 2024 02:32:04 +0100
Subject: [PATCH 05/27] Update sklearn_questions.py

---
 sklearn_questions.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 0322ec5..51b537c 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -128,6 +128,13 @@ def predict(self, X):
         # Validate input
         X = check_array(X)
 
+        # Ensure the number of features matches
+        if X.shape[1] != self.n_features_in_:
+            raise ValueError(
+                f"Number of features in X ({X.shape[1]}) does not match "
+                f"the number of features during fit ({self.n_features_in_})."
+            )
+
         # Compute distances
         distances = pairwise_distances(X, self.X_, metric='euclidean')
 
@@ -138,7 +145,10 @@ def predict(self, X):
         neighbor_labels = self.y_[neighbors_idx]
 
         # Predict by majority vote
-        y_pred = np.array([np.bincount(row.astype(int)).argmax() for row in neighbor_labels])
+        y_pred = np.array([
+            np.bincount(row.astype(int)).argmax() if len(np.unique(row)) > 0 else 0
+            for row in neighbor_labels
+        ])
 
         return y_pred
 
@@ -241,8 +251,8 @@ def split(self, X, y=None, groups=None):
         unique_months = time_data.dt.to_period("M").drop_duplicates()
 
         for i in range(len(unique_months) - 1):
-            train_month = unique_months[i]
-            test_month = unique_months[i + 1]
+            train_month = unique_months.iloc[i]
+            test_month = unique_months.iloc[i + 1]
 
             train_mask = time_data.dt.to_period("M") == train_month
             test_mask = time_data.dt.to_period("M") == test_month

From e23c2d575e3c7f56f32ea23d3de08e435f4e1567 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sat, 21 Dec 2024 02:34:54 +0100
Subject: [PATCH 06/27] Update sklearn_questions.py

---
 sklearn_questions.py | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 51b537c..70c7d6d 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -53,7 +53,7 @@
 
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.model_selection import BaseCrossValidator
-from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+from sklearn.utils.validation import validate_data, check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.metrics.pairwise import pairwise_distances
 
@@ -99,13 +99,18 @@ def fit(self, X, y):
         self : object
             Returns the instance itself.
         """
-        # Validate input
-        X, y = check_X_y(X, y)
+        # Validate input and set n_features_in_
+        X, y = validate_data(
+            self, X, y,
+            ensure_2d=True,
+            accept_sparse=False,
+            dtype=None,
+            reset=True
+        )
         check_classification_targets(y)
 
         self.X_ = X
         self.y_ = y
-        self.n_features_in_ = X.shape[1]
         return self
 
     def predict(self, X):
@@ -125,15 +130,14 @@ def predict(self, X):
         # Check if the classifier has been fitted
         check_is_fitted(self, ["X_", "y_"])
 
-        # Validate input
-        X = check_array(X)
-
-        # Ensure the number of features matches
-        if X.shape[1] != self.n_features_in_:
-            raise ValueError(
-                f"Number of features in X ({X.shape[1]}) does not match "
-                f"the number of features during fit ({self.n_features_in_})."
-            )
+        # Validate input, reset=False to keep n_features_in_
+        X = validate_data(
+            self, X,
+            ensure_2d=True,
+            accept_sparse=False,
+            dtype=None,
+            reset=False
+        )
 
         # Compute distances
         distances = pairwise_distances(X, self.X_, metric='euclidean')
@@ -221,7 +225,7 @@ def get_n_splits(self, X, y=None, groups=None):
             Number of month-based splits.
         """
         time_data = self._get_time_data(X)
-        unique_months = time_data.dt.to_period("M").drop_duplicates()
+        unique_months = time_data.dt.to_period("M").drop_duplicates().sort_values()
         return max(len(unique_months) - 1, 0)
 
     def split(self, X, y=None, groups=None):
@@ -248,7 +252,7 @@ def split(self, X, y=None, groups=None):
             Indices for testing data.
         """
         time_data = self._get_time_data(X)
-        unique_months = time_data.dt.to_period("M").drop_duplicates()
+        unique_months = time_data.dt.to_period("M").drop_duplicates().sort_values()
 
         for i in range(len(unique_months) - 1):
             train_month = unique_months.iloc[i]

From 446117050a419e1c26c9e3b95c2fe037def66f6c Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sat, 21 Dec 2024 02:37:51 +0100
Subject: [PATCH 07/27] Update sklearn_questions.py

---
 sklearn_questions.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 70c7d6d..346d156 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -101,10 +101,10 @@ def fit(self, X, y):
         """
         # Validate input and set n_features_in_
         X, y = validate_data(
-            self, X, y,
-            ensure_2d=True,
+            X, y,
             accept_sparse=False,
             dtype=None,
+            ensure_2d=True,
             reset=True
         )
         check_classification_targets(y)
@@ -132,10 +132,10 @@ def predict(self, X):
 
         # Validate input, reset=False to keep n_features_in_
         X = validate_data(
-            self, X,
-            ensure_2d=True,
+            X,
             accept_sparse=False,
             dtype=None,
+            ensure_2d=True,
             reset=False
         )
 

From df3dd4f072eb635ff9f9c0cef48af7574b848cd9 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sat, 21 Dec 2024 02:40:26 +0100
Subject: [PATCH 08/27] Update sklearn_questions.py


From 1b63f2904e7606e83784ea8251afdcd4c48fd565 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sat, 21 Dec 2024 02:41:22 +0100
Subject: [PATCH 09/27] Update sklearn_questions.py


From 9a0fcc9042edfcff2821ba815601da54c01fb23e Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sat, 21 Dec 2024 02:43:18 +0100
Subject: [PATCH 10/27] Update sklearn_questions.py


From 6b5d668a9cdce6bd172d64d5111185cf65b298be Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 00:38:00 +0100
Subject: [PATCH 11/27] Update sklearn_questions.py

---
 sklearn_questions.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 346d156..d8c84af 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -80,6 +80,14 @@ class KNearestNeighbors(BaseEstimator, ClassifierMixin):
     """
 
     def __init__(self, n_neighbors=1):
+        """
+        Initialize the KNearestNeighbors classifier.
+
+        Parameters
+        ----------
+        n_neighbors : int, default=1
+            Number of neighbors to use for predictions.
+        """
         self.n_neighbors = n_neighbors
 
     def fit(self, X, y):
@@ -202,6 +210,15 @@ class MonthlySplit(BaseCrossValidator):
     """
 
     def __init__(self, time_col='index'):
+        """
+        Initialize the MonthlySplit cross-validator.
+
+        Parameters
+        ----------
+        time_col : str, default='index'
+            Column to use for date-based splitting. If 'index', the index of the
+            DataFrame is used as the date column.
+        """
         self.time_col = time_col
 
     def get_n_splits(self, X, y=None, groups=None):

From 6c1fa26b62ead1f2d7212bebc0e5f4494d3adb1a Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 01:02:04 +0100
Subject: [PATCH 12/27] Update sklearn_questions.py

---
 sklearn_questions.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index d8c84af..5f6472d 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -47,15 +47,14 @@
 
 to compute distances between 2 sets of samples.
 """
-
 import numpy as np
 import pandas as pd
-
 from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.model_selection import BaseCrossValidator
+from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.utils.validation import validate_data, check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
-from sklearn.metrics.pairwise import pairwise_distances
+from sklearn.model_selection import BaseCrossValidator
+from sklearn.utils.validation import check_is_fitted
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):

From 5e704d250bacb1159558e850993edabbe96c09d4 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 12:13:34 +0100
Subject: [PATCH 13/27] Update sklearn_questions.py

---
 sklearn_questions.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 5f6472d..cbf2ffa 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -106,7 +106,7 @@ def fit(self, X, y):
         self : object
             Returns the instance itself.
         """
-        # Validate input and set n_features_in_
+        # Validate input data and ensure it's 2D
         X, y = validate_data(
             X, y,
             accept_sparse=False,
@@ -114,6 +114,7 @@ def fit(self, X, y):
             ensure_2d=True,
             reset=True
         )
+        # Validate that y contains classification targets
         check_classification_targets(y)
 
         self.X_ = X
@@ -137,7 +138,7 @@ def predict(self, X):
         # Check if the classifier has been fitted
         check_is_fitted(self, ["X_", "y_"])
 
-        # Validate input, reset=False to keep n_features_in_
+        # Validate input data and ensure it's 2D
         X = validate_data(
             X,
             accept_sparse=False,
@@ -146,10 +147,10 @@ def predict(self, X):
             reset=False
         )
 
-        # Compute distances
+        # Compute pairwise distances between X and the training data
         distances = pairwise_distances(X, self.X_, metric='euclidean')
 
-        # Find the indices of the k nearest neighbors
+        # Find the indices of the k nearest neighbors for each sample
         neighbors_idx = np.argsort(distances, axis=1)[:, :self.n_neighbors]
 
         # Gather the neighbor labels

From f6d4460f6a53ae95eae720f48d8bfb602de5bce4 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 12:26:49 +0100
Subject: [PATCH 14/27] Update sklearn_questions.py

---
 sklearn_questions.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index cbf2ffa..581319e 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -47,14 +47,15 @@
 
 to compute distances between 2 sets of samples.
 """
+
+
 import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.model_selection import BaseCrossValidator
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.utils.validation import validate_data, check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
-from sklearn.model_selection import BaseCrossValidator
-from sklearn.utils.validation import check_is_fitted
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):

From cdb1590f168c89a9cb4fc8433116eb14ed7c0d47 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 12:38:59 +0100
Subject: [PATCH 15/27] Update sklearn_questions.py

---
 sklearn_questions.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 581319e..0994baf 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -56,6 +56,7 @@
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.utils.validation import validate_data, check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
+from sklearn.preprocessing import LabelEncoder
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
@@ -73,7 +74,13 @@ class KNearestNeighbors(BaseEstimator, ClassifierMixin):
         Training data stored during fit.
 
     y_ : ndarray of shape (n_samples,)
-        Labels stored during fit.
+        Encoded labels stored during fit.
+
+    le_ : LabelEncoder
+        Label encoder fitted on y.
+
+    classes_ : ndarray of shape (n_classes,)
+        Unique class labels.
 
     n_features_in_ : int
         Number of features in the training data.
@@ -107,19 +114,28 @@ def fit(self, X, y):
         self : object
             Returns the instance itself.
         """
-        # Validate input data and ensure it's 2D
+        # Validate input data and ensure X is 2D and y is 1D
         X, y = validate_data(
             X, y,
             accept_sparse=False,
             dtype=None,
             ensure_2d=True,
+            y_numeric=False,
+            multi_output=False,
             reset=True
         )
+
         # Validate that y contains classification targets
         check_classification_targets(y)
 
+        # Encode y to ensure it contains non-negative integers starting from 0
+        self.le_ = LabelEncoder()
+        y_encoded = self.le_.fit_transform(y)
+
         self.X_ = X
-        self.y_ = y
+        self.y_ = y_encoded
+        self.classes_ = self.le_.classes_
+
         return self
 
     def predict(self, X):
@@ -137,7 +153,7 @@ def predict(self, X):
             Predicted class labels.
         """
         # Check if the classifier has been fitted
-        check_is_fitted(self, ["X_", "y_"])
+        check_is_fitted(self, ["X_", "y_", "le_"])
 
         # Validate input data and ensure it's 2D
         X = validate_data(
@@ -158,11 +174,14 @@ def predict(self, X):
         neighbor_labels = self.y_[neighbors_idx]
 
         # Predict by majority vote
-        y_pred = np.array([
-            np.bincount(row.astype(int)).argmax() if len(np.unique(row)) > 0 else 0
+        y_pred_encoded = np.array([
+            np.bincount(row).argmax() if len(np.unique(row)) > 0 else 0
             for row in neighbor_labels
         ])
 
+        # Decode the encoded labels back to original labels
+        y_pred = self.le_.inverse_transform(y_pred_encoded)
+
         return y_pred
 
     def score(self, X, y):

From 789702c00bf348d0c6871e609c8707e60e17aa04 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 12:48:41 +0100
Subject: [PATCH 16/27] Update sklearn_questions.py

---
 sklearn_questions.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 0994baf..9e737b0 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -52,11 +52,13 @@
 import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.model_selection import BaseCrossValidator
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.utils.validation import validate_data, check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import BaseCrossValidator
+from sklearn.utils.estimator_checks import _check_feature_names, get_tags
+from sklearn.utils.metaestimators import available_if
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
@@ -116,11 +118,12 @@ def fit(self, X, y):
         """
         # Validate input data and ensure X is 2D and y is 1D
         X, y = validate_data(
-            X, y,
+            self,
+            X,
+            y,
             accept_sparse=False,
             dtype=None,
             ensure_2d=True,
-            y_numeric=False,
             multi_output=False,
             reset=True
         )
@@ -157,6 +160,7 @@ def predict(self, X):
 
         # Validate input data and ensure it's 2D
         X = validate_data(
+            self,
             X,
             accept_sparse=False,
             dtype=None,

From 2396a38c7a17a0e5ea0147904913ddbfc2476762 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 12:52:43 +0100
Subject: [PATCH 17/27] Update sklearn_questions.py

---
 sklearn_questions.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 9e737b0..a0a98a2 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -49,16 +49,15 @@
 """
 
 
+# sklearn_questions.py
+
 import numpy as np
-import pandas as pd
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.utils.validation import validate_data, check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import BaseCrossValidator
-from sklearn.utils.estimator_checks import _check_feature_names, get_tags
-from sklearn.utils.metaestimators import available_if
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):

From 99d2a53900b66013e957f55b9b1606b000dd3af7 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 13:19:07 +0100
Subject: [PATCH 18/27] Update sklearn_questions.py

---
 sklearn_questions.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index a0a98a2..ffd36a4 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -49,18 +49,17 @@
 """
 
 
-# sklearn_questions.py
-
 import numpy as np
-from sklearn.base import BaseEstimator, ClassifierMixin
+import pandas as pd  # Ensure pandas is imported
+from sklearn.base import BaseEstimator, ClassifierMixin, get_tags
 from sklearn.metrics.pairwise import pairwise_distances
-from sklearn.utils.validation import validate_data, check_is_fitted
+from sklearn.utils.validation import validate_data, check_is_fitted, check_array
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import BaseCrossValidator
 
 
-class KNearestNeighbors(BaseEstimator, ClassifierMixin):
+class KNearestNeighbors(ClassifierMixin, BaseEstimator):
     """
     K-Nearest Neighbors classifier.
 
@@ -164,6 +163,7 @@ def predict(self, X):
             accept_sparse=False,
             dtype=None,
             ensure_2d=True,
+            multi_output=False,
             reset=False
         )
 

From 0e48d48ba00ed11c72800f0a97160958b216df41 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 14:07:34 +0100
Subject: [PATCH 19/27] Update sklearn_questions.py

---
 sklearn_questions.py | 273 ++++++++++++++++++-------------------------
 1 file changed, 115 insertions(+), 158 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index ffd36a4..3c574e9 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -50,289 +50,246 @@
 
 
 import numpy as np
-import pandas as pd  # Ensure pandas is imported
-from sklearn.base import BaseEstimator, ClassifierMixin, get_tags
+import pandas as pd
+from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.metrics.pairwise import pairwise_distances
-from sklearn.utils.validation import validate_data, check_is_fitted, check_array
+from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import BaseCrossValidator
 
 
-class KNearestNeighbors(ClassifierMixin, BaseEstimator):
-    """
-    K-Nearest Neighbors classifier.
+class KNearestNeighbors(BaseEstimator, ClassifierMixin):
+    """Classifier implementing the k-nearest neighbors algorithm.
+
+    This classifier predicts the target of a test point based on the target
+    of its nearest neighbor in the training set, using Euclidean distance.
 
     Parameters
     ----------
     n_neighbors : int, default=1
-        Number of neighbors to use for predictions.
+        Number of neighbors to consider for prediction.
 
     Attributes
     ----------
     X_ : ndarray of shape (n_samples, n_features)
-        Training data stored during fit.
-
+        The input samples.
     y_ : ndarray of shape (n_samples,)
-        Encoded labels stored during fit.
-
-    le_ : LabelEncoder
-        Label encoder fitted on y.
-
+        The target values.
     classes_ : ndarray of shape (n_classes,)
-        Unique class labels.
-
-    n_features_in_ : int
-        Number of features in the training data.
+        The unique classes labels.
     """
 
     def __init__(self, n_neighbors=1):
-        """
-        Initialize the KNearestNeighbors classifier.
+        """Initialize the KNearestNeighbors classifier.
 
         Parameters
         ----------
         n_neighbors : int, default=1
-            Number of neighbors to use for predictions.
+            Number of neighbors to use for prediction.
         """
         self.n_neighbors = n_neighbors
 
     def fit(self, X, y):
-        """
-        Fit the KNN classifier on training data.
+        """Fit the k-nearest neighbors classifier.
 
         Parameters
         ----------
-        X : ndarray of shape (n_samples, n_features)
+        X : array-like of shape (n_samples, n_features)
             Training data.
-
-        y : ndarray of shape (n_samples,)
-            Target labels for training data.
+        y : array-like of shape (n_samples,)
+            Target values.
 
         Returns
         -------
         self : object
             Returns the instance itself.
         """
-        # Validate input data and ensure X is 2D and y is 1D
-        X, y = validate_data(
-            self,
-            X,
-            y,
-            accept_sparse=False,
-            dtype=None,
-            ensure_2d=True,
-            multi_output=False,
-            reset=True
-        )
-
-        # Validate that y contains classification targets
+        # Input validation using sklearn's check functions
+        X, y = check_X_y(X, y)
         check_classification_targets(y)
 
-        # Encode y to ensure it contains non-negative integers starting from 0
+        # Store number of features for predict step validation
+        self.n_features_in_ = X.shape[1]
+
+        # Encode class labels
         self.le_ = LabelEncoder()
-        y_encoded = self.le_.fit_transform(y)
+        y = self.le_.fit_transform(y)
 
         self.X_ = X
-        self.y_ = y_encoded
+        self.y_ = y
         self.classes_ = self.le_.classes_
 
         return self
 
     def predict(self, X):
-        """
-        Predict the class labels for the given data.
+        """Predict class labels for samples in X.
 
         Parameters
         ----------
-        X : ndarray of shape (n_samples, n_features)
-            Test data.
+        X : array-like of shape (n_samples, n_features)
+            The data to predict.
 
         Returns
         -------
         y_pred : ndarray of shape (n_samples,)
-            Predicted class labels.
+            The predicted class labels.
         """
-        # Check if the classifier has been fitted
-        check_is_fitted(self, ["X_", "y_", "le_"])
-
-        # Validate input data and ensure it's 2D
-        X = validate_data(
-            self,
-            X,
-            accept_sparse=False,
-            dtype=None,
-            ensure_2d=True,
-            multi_output=False,
-            reset=False
-        )
+        check_is_fitted(self)
+        X = check_array(X)
 
-        # Compute pairwise distances between X and the training data
-        distances = pairwise_distances(X, self.X_, metric='euclidean')
+        if X.shape[1] != self.n_features_in_:
+            raise ValueError(
+                f"X has {X.shape[1]} features, but KNearestNeighbors "
+                f"was trained with {self.n_features_in_} features."
+            )
 
-        # Find the indices of the k nearest neighbors for each sample
-        neighbors_idx = np.argsort(distances, axis=1)[:, :self.n_neighbors]
+        # Compute distances between test points and training points
+        distances = pairwise_distances(X, self.X_)
 
-        # Gather the neighbor labels
-        neighbor_labels = self.y_[neighbors_idx]
+        # Find indices of k nearest neighbors
+        k_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors]
 
-        # Predict by majority vote
-        y_pred_encoded = np.array([
-            np.bincount(row).argmax() if len(np.unique(row)) > 0 else 0
-            for row in neighbor_labels
-        ])
+        # Get labels of k nearest neighbors
+        k_neighbors_labels = self.y_[k_neighbors]
 
-        # Decode the encoded labels back to original labels
-        y_pred = self.le_.inverse_transform(y_pred_encoded)
+        # Predict by majority voting
+        y_pred = np.apply_along_axis(
+            lambda x: np.bincount(x).argmax(),
+            axis=1,
+            arr=k_neighbors_labels
+        )
 
-        return y_pred
+        return self.le_.inverse_transform(y_pred)
 
     def score(self, X, y):
-        """
-        Compute the accuracy of the classifier.
+        """Return the mean accuracy on the given test data and labels.
 
         Parameters
         ----------
-        X : ndarray of shape (n_samples, n_features)
-            Test data.
-
-        y : ndarray of shape (n_samples,)
-            True labels for test data.
+        X : array-like of shape (n_samples, n_features)
+            Test samples.
+        y : array-like of shape (n_samples,)
+            True labels for X.
 
         Returns
         -------
         score : float
             Mean accuracy of predictions.
         """
-        y_pred = self.predict(X)
-        return np.mean(y_pred == y)
+        return np.mean(self.predict(X) == y)
 
 
 class MonthlySplit(BaseCrossValidator):
-    """
-    Cross-validator that splits data based on months.
+    """Monthly cross-validation splitter.
+
+    Provides train/test indices to split time series data between successive
+    months. For each split, test indices must be higher than before, and thus
+    shuffling in cross validator is inappropriate.
 
     Parameters
     ----------
     time_col : str, default='index'
-        Column to use for date-based splitting. If 'index', the index of the
-        DataFrame is used as the date column.
-
-    Methods
-    -------
-    get_n_splits(X, y=None, groups=None)
-        Return the number of splits.
-
-    split(X, y=None, groups=None)
-        Generate indices for training and testing splits.
-
-    Raises
-    ------
-    ValueError
-        If the `time_col` is not found or not a datetime type.
+        Column name containing datetime values. If 'index', the index is used.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> dates = pd.date_range('2020-01-01', '2020-03-31', freq='D')
+    >>> X = pd.DataFrame({'val': range(len(dates))}, index=dates)
+    >>> cv = MonthlySplit()
+    >>> for train_idx, test_idx in cv.split(X):
+    ...     print(f"TRAIN:", X.index[train_idx].min(), X.index[train_idx].max())
+    ...     print(f"TEST:", X.index[test_idx].min(), X.index[test_idx].max())
     """
 
     def __init__(self, time_col='index'):
-        """
-        Initialize the MonthlySplit cross-validator.
+        """Initialize the monthly splitter.
 
         Parameters
         ----------
         time_col : str, default='index'
-            Column to use for date-based splitting. If 'index', the index of the
-            DataFrame is used as the date column.
+            Column containing datetime values or 'index'.
         """
         self.time_col = time_col
 
-    def get_n_splits(self, X, y=None, groups=None):
-        """
-        Return the number of splitting iterations in the cross-validator.
+    def get_n_splits(self, X=None, y=None, groups=None):
+        """Return the number of splitting iterations.
 
         Parameters
         ----------
-        X : DataFrame
-            Input data with datetime information.
-
-        y : None
-            Ignored, exists for API compatibility.
-
-        groups : None
-            Ignored, exists for API compatibility.
+        X : pd.DataFrame
+            Training data.
+        y : array-like, default=None
+            Always ignored, exists for compatibility.
+        groups : array-like, default=None
+            Always ignored, exists for compatibility.
 
         Returns
         -------
         n_splits : int
-            Number of month-based splits.
+            Returns the number of splitting iterations.
         """
         time_data = self._get_time_data(X)
-        unique_months = time_data.dt.to_period("M").drop_duplicates().sort_values()
+        unique_months = time_data.dt.to_period('M').unique()
         return max(len(unique_months) - 1, 0)
 
     def split(self, X, y=None, groups=None):
-        """
-        Generate indices to split data into training and test set.
+        """Generate indices to split data into training and test set.
 
         Parameters
         ----------
-        X : DataFrame
-            Input data with datetime information.
-
-        y : None
-            Ignored, exists for API compatibility.
-
-        groups : None
-            Ignored, exists for API compatibility.
+        X : pd.DataFrame
+            Training data.
+        y : array-like, default=None
+            Always ignored, exists for compatibility.
+        groups : array-like, default=None
+            Always ignored, exists for compatibility.
 
         Yields
         ------
-        train_indices : ndarray
-            Indices for training data.
-
-        test_indices : ndarray
-            Indices for testing data.
+        train : ndarray
+            Training set indices.
+        test : ndarray
+            Test set indices.
         """
         time_data = self._get_time_data(X)
-        unique_months = time_data.dt.to_period("M").drop_duplicates().sort_values()
+        months = time_data.dt.to_period('M')
+        unique_months = sorted(months.unique())
 
         for i in range(len(unique_months) - 1):
-            train_month = unique_months.iloc[i]
-            test_month = unique_months.iloc[i + 1]
-
-            train_mask = time_data.dt.to_period("M") == train_month
-            test_mask = time_data.dt.to_period("M") == test_month
-
-            train_indices = np.where(train_mask)[0]
-            test_indices = np.where(test_mask)[0]
-
-            yield train_indices, test_indices
+            train_mask = months == unique_months[i]
+            test_mask = months == unique_months[i + 1]
+            yield np.where(train_mask)[0], np.where(test_mask)[0]
 
     def _get_time_data(self, X):
-        """
-        Extract the datetime data from the specified column or index.
+        """Extract datetime data from DataFrame.
 
         Parameters
         ----------
-        X : DataFrame
+        X : pd.DataFrame
             Input data.
 
         Returns
         -------
-        time_data : Series
-            Series of datetime values.
+        pd.Series
+            Series containing datetime values.
 
         Raises
         ------
         ValueError
-            If the column is not found or is not datetime-like.
+            If datetime column is not found or is invalid.
         """
         if self.time_col == 'index':
             if not isinstance(X.index, pd.DatetimeIndex):
-                raise ValueError("Index must be a DatetimeIndex.")
+                raise ValueError("Index must be DatetimeIndex when time_col='index'")
             return pd.Series(X.index)
-        elif self.time_col in X.columns:
-            time_data = X[self.time_col]
-            if not np.issubdtype(time_data.dtype, np.datetime64):
-                raise ValueError(f"Column '{self.time_col}' must be of datetime type.")
-            return time_data
-        else:
-            raise ValueError(f"Column '{self.time_col}' not found in input data.")
+
+        if self.time_col not in X.columns:
+            raise ValueError(f"Column {self.time_col} not found in X")
+
+        time_values = X[self.time_col]
+        if not pd.api.types.is_datetime64_any_dtype(time_values):
+            raise ValueError(f"Column {self.time_col} must be datetime type")
+
+        return time_values

From 7316b0d05a498349d077775076a1e7eb1dbeba28 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 14:13:58 +0100
Subject: [PATCH 20/27] Update sklearn_questions.py

---
 sklearn_questions.py | 111 +++++++++++++++++--------------------------
 1 file changed, 44 insertions(+), 67 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 3c574e9..d09aba1 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -53,22 +53,21 @@
 import pandas as pd
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.metrics.pairwise import pairwise_distances
-from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+from sklearn.utils.validation import (
+    check_X_y, check_array, check_is_fitted, check_consistent_length
+)
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import BaseCrossValidator
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
-    """Classifier implementing the k-nearest neighbors algorithm.
-
-    This classifier predicts the target of a test point based on the target
-    of its nearest neighbor in the training set, using Euclidean distance.
+    """K-nearest neighbors classifier implementation.
 
     Parameters
     ----------
     n_neighbors : int, default=1
-        Number of neighbors to consider for prediction.
+        Number of neighbors to use for classification.
 
     Attributes
     ----------
@@ -78,20 +77,22 @@ class KNearestNeighbors(BaseEstimator, ClassifierMixin):
         The target values.
     classes_ : ndarray of shape (n_classes,)
         The unique classes labels.
+    n_features_in_ : int
+        Number of features seen during fit.
     """
 
     def __init__(self, n_neighbors=1):
-        """Initialize the KNearestNeighbors classifier.
+        """Initialize the classifier.
 
         Parameters
         ----------
         n_neighbors : int, default=1
-            Number of neighbors to use for prediction.
+            Number of neighbors to use.
         """
         self.n_neighbors = n_neighbors
 
     def fit(self, X, y):
-        """Fit the k-nearest neighbors classifier.
+        """Fit the model using X as training data and y as target values.
 
         Parameters
         ----------
@@ -105,60 +106,53 @@ def fit(self, X, y):
         self : object
             Returns the instance itself.
         """
-        # Input validation using sklearn's check functions
-        X, y = check_X_y(X, y)
+        X, y = check_X_y(X, y, ensure_2d=True, allow_nd=False)
         check_classification_targets(y)
 
-        # Store number of features for predict step validation
-        self.n_features_in_ = X.shape[1]
+        if self.n_neighbors < 1:
+            raise ValueError(
+                f"Expected n_neighbors > 0, got {self.n_neighbors}"
+            )
 
-        # Encode class labels
+        self.n_features_in_ = X.shape[1]
         self.le_ = LabelEncoder()
-        y = self.le_.fit_transform(y)
-
-        self.X_ = X
-        self.y_ = y
+        self.y_ = self.le_.fit_transform(y)
         self.classes_ = self.le_.classes_
+        self.X_ = X
 
         return self
 
     def predict(self, X):
-        """Predict class labels for samples in X.
+        """Predict the class labels for the provided data.
 
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
-            The data to predict.
+            Test samples.
 
         Returns
         -------
         y_pred : ndarray of shape (n_samples,)
-            The predicted class labels.
+            Class labels for each data sample.
         """
-        check_is_fitted(self)
-        X = check_array(X)
+        check_is_fitted(self, ['X_', 'y_', 'classes_'])
+        X = check_array(X, ensure_2d=True, allow_nd=False)
 
         if X.shape[1] != self.n_features_in_:
             raise ValueError(
-                f"X has {X.shape[1]} features, but KNearestNeighbors "
-                f"was trained with {self.n_features_in_} features."
+                f"X has {X.shape[1]} features, expected {self.n_features_in_}"
             )
 
-        # Compute distances between test points and training points
         distances = pairwise_distances(X, self.X_)
+        neigh_ind = np.argpartition(
+            distances, min(self.n_neighbors - 1, len(self.y_) - 1), axis=1
+        )[:, :self.n_neighbors]
 
-        # Find indices of k nearest neighbors
-        k_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors]
-
-        # Get labels of k nearest neighbors
-        k_neighbors_labels = self.y_[k_neighbors]
-
-        # Predict by majority voting
-        y_pred = np.apply_along_axis(
-            lambda x: np.bincount(x).argmax(),
-            axis=1,
-            arr=k_neighbors_labels
-        )
+        neigh_labels = self.y_[neigh_ind]
+        y_pred = np.array([
+            np.bincount(labels).argmax()
+            for labels in neigh_labels
+        ])
 
         return self.le_.inverse_transform(y_pred)
 
@@ -175,36 +169,24 @@ def score(self, X, y):
         Returns
         -------
         score : float
-            Mean accuracy of predictions.
+            Mean accuracy of self.predict(X) with respect to y.
         """
+        X = check_array(X, ensure_2d=True, allow_nd=False)
+        check_consistent_length(X, y)
         return np.mean(self.predict(X) == y)
 
 
 class MonthlySplit(BaseCrossValidator):
     """Monthly cross-validation splitter.
 
-    Provides train/test indices to split time series data between successive
-    months. For each split, test indices must be higher than before, and thus
-    shuffling in cross validator is inappropriate.
-
     Parameters
     ----------
     time_col : str, default='index'
-        Column name containing datetime values. If 'index', the index is used.
-
-    Examples
-    --------
-    >>> import pandas as pd
-    >>> dates = pd.date_range('2020-01-01', '2020-03-31', freq='D')
-    >>> X = pd.DataFrame({'val': range(len(dates))}, index=dates)
-    >>> cv = MonthlySplit()
-    >>> for train_idx, test_idx in cv.split(X):
-    ...     print(f"TRAIN:", X.index[train_idx].min(), X.index[train_idx].max())
-    ...     print(f"TEST:", X.index[test_idx].min(), X.index[test_idx].max())
+        Column name containing datetime values. If 'index', uses the index.
     """
 
     def __init__(self, time_col='index'):
-        """Initialize the monthly splitter.
+        """Initialize the splitter.
 
         Parameters
         ----------
@@ -218,7 +200,7 @@ def get_n_splits(self, X=None, y=None, groups=None):
 
         Parameters
         ----------
-        X : pd.DataFrame
+        X : pd.DataFrame, required
             Training data.
         y : array-like, default=None
             Always ignored, exists for compatibility.
@@ -231,8 +213,8 @@ def get_n_splits(self, X=None, y=None, groups=None):
             Returns the number of splitting iterations.
         """
         time_data = self._get_time_data(X)
-        unique_months = time_data.dt.to_period('M').unique()
-        return max(len(unique_months) - 1, 0)
+        n_months = time_data.dt.to_period('M').nunique()
+        return max(0, n_months - 1)
 
     def split(self, X, y=None, groups=None):
         """Generate indices to split data into training and test set.
@@ -258,12 +240,12 @@ def split(self, X, y=None, groups=None):
         unique_months = sorted(months.unique())
 
         for i in range(len(unique_months) - 1):
-            train_mask = months == unique_months[i]
-            test_mask = months == unique_months[i + 1]
-            yield np.where(train_mask)[0], np.where(test_mask)[0]
+            train_idx = np.where(months == unique_months[i])[0]
+            test_idx = np.where(months == unique_months[i + 1])[0]
+            yield train_idx, test_idx
 
     def _get_time_data(self, X):
-        """Extract datetime data from DataFrame.
+        """Get datetime data from DataFrame.
 
         Parameters
         ----------
@@ -274,11 +256,6 @@ def _get_time_data(self, X):
         -------
         pd.Series
             Series containing datetime values.
-
-        Raises
-        ------
-        ValueError
-            If datetime column is not found or is invalid.
         """
         if self.time_col == 'index':
             if not isinstance(X.index, pd.DatetimeIndex):

From b438779f0c9ae01810255642675809f9a8eb565d Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 14:20:03 +0100
Subject: [PATCH 21/27] Update sklearn_questions.py

---
 sklearn_questions.py | 112 +++++++++++++++++++++++++++++++++----------
 1 file changed, 88 insertions(+), 24 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index d09aba1..46bc276 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -54,7 +54,8 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.utils.validation import (
-    check_X_y, check_array, check_is_fitted, check_consistent_length
+    check_X_y, check_array, check_is_fitted, _check_sample_weight,
+    _num_samples
 )
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.preprocessing import LabelEncoder
@@ -79,6 +80,10 @@ class KNearestNeighbors(BaseEstimator, ClassifierMixin):
         The unique classes labels.
     n_features_in_ : int
         Number of features seen during fit.
+    _fit_X : ndarray of shape (n_samples, n_features)
+        Validated training data.
+    _y : ndarray of shape (n_samples,)
+        Validated target values.
     """
 
     def __init__(self, n_neighbors=1):
@@ -96,29 +101,54 @@ def fit(self, X, y):
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
             Training data.
         y : array-like of shape (n_samples,)
             Target values.
 
         Returns
         -------
-        self : object
-            Returns the instance itself.
+        self : KNearestNeighbors
+            The fitted classifier.
         """
-        X, y = check_X_y(X, y, ensure_2d=True, allow_nd=False)
-        check_classification_targets(y)
+        # Input validation
+        X, y = check_X_y(
+            X, y,
+            ensure_2d=True,
+            allow_nd=False,
+            dtype=[np.float64, np.float32],
+            force_all_finite=True
+        )
+
+        # Check that X and y have correct shape
+        if X.shape[0] != y.shape[0]:
+            raise ValueError(
+                f"Found input variables with inconsistent numbers of samples: "
+                f"{[X.shape[0], y.shape[0]]}"
+            )
 
+        # Validate n_neighbors
         if self.n_neighbors < 1:
             raise ValueError(
                 f"Expected n_neighbors > 0, got {self.n_neighbors}"
             )
+        n_samples = _num_samples(X)
+        if self.n_neighbors > n_samples:
+            raise ValueError(
+                f"Expected n_neighbors <= n_samples, got n_neighbors = "
+                f"{self.n_neighbors}, n_samples = {n_samples}"
+            )
 
-        self.n_features_in_ = X.shape[1]
-        self.le_ = LabelEncoder()
-        self.y_ = self.le_.fit_transform(y)
-        self.classes_ = self.le_.classes_
+        check_classification_targets(y)
+
+        self._fit_X = X
         self.X_ = X
+        self.n_features_in_ = X.shape[1]
+
+        # Encode labels
+        self._le = LabelEncoder()
+        self._y = self._le.fit_transform(y)
+        self.classes_ = self._le.classes_
 
         return self
 
@@ -134,27 +164,53 @@ def predict(self, X):
         -------
         y_pred : ndarray of shape (n_samples,)
             Class labels for each data sample.
-        """
-        check_is_fitted(self, ['X_', 'y_', 'classes_'])
-        X = check_array(X, ensure_2d=True, allow_nd=False)
 
+        Raises
+        ------
+        ValueError
+            If the number of features in X doesn't match the training data.
+        """
+        # Check if fit has been called
+        check_is_fitted(
+            self,
+            ["_fit_X", "_y", "n_features_in_", "classes_"]
+        )
+
+        # Input validation
+        X = check_array(
+            X,
+            accept_sparse=False,
+            dtype=np.float64,
+            order="C",
+            ensure_2d=True,
+            force_all_finite=True
+        )
+
+        # Check feature size consistency
         if X.shape[1] != self.n_features_in_:
             raise ValueError(
-                f"X has {X.shape[1]} features, expected {self.n_features_in_}"
+                f"X has {X.shape[1]} features, but this "
+                f"KNearestNeighbors is expecting {self.n_features_in_} features"
             )
 
-        distances = pairwise_distances(X, self.X_)
+        # Compute distances and find nearest neighbors
+        distances = pairwise_distances(X, self._fit_X)
         neigh_ind = np.argpartition(
-            distances, min(self.n_neighbors - 1, len(self.y_) - 1), axis=1
+            distances,
+            min(self.n_neighbors - 1, len(self._y) - 1),
+            axis=1
         )[:, :self.n_neighbors]
 
-        neigh_labels = self.y_[neigh_ind]
-        y_pred = np.array([
-            np.bincount(labels).argmax()
-            for labels in neigh_labels
-        ])
+        # Get labels of nearest neighbors
+        neigh_labels = self._y[neigh_ind]
+
+        # Predict by majority voting
+        y_pred = np.zeros(X.shape[0], dtype=self._y.dtype)
+        for i in range(X.shape[0]):
+            counts = np.bincount(neigh_labels[i])
+            y_pred[i] = counts.argmax()
 
-        return self.le_.inverse_transform(y_pred)
+        return self._le.inverse_transform(y_pred)
 
     def score(self, X, y):
         """Return the mean accuracy on the given test data and labels.
@@ -171,8 +227,16 @@ def score(self, X, y):
         score : float
             Mean accuracy of self.predict(X) with respect to y.
         """
-        X = check_array(X, ensure_2d=True, allow_nd=False)
-        check_consistent_length(X, y)
+        # Check that X and y have correct shape
+        X = check_array(X, accept_sparse=False, ensure_2d=True)
+        y = check_array(y, ensure_2d=False, ensure_min_samples=0)
+
+        if X.shape[0] != y.shape[0]:
+            raise ValueError(
+                f"Found input variables with inconsistent numbers of samples: "
+                f"{[X.shape[0], y.shape[0]]}"
+            )
+
         return np.mean(self.predict(X) == y)
 
 
From b344a4eae1025433328f1852ff189270c2d091c5 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 14:56:14 +0100
Subject: [PATCH 22/27] Update sklearn_questions.py


From 884dfad121c14d5a1f06dc84d76f08a066c08d21 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 15:07:29 +0100
Subject: [PATCH 23/27] Update sklearn_questions.py

---
 sklearn_questions.py | 274 ++++++-------------------------------------
 1 file changed, 33 insertions(+), 241 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 46bc276..73b7be9 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -63,274 +63,66 @@
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
-    """K-nearest neighbors classifier implementation.
-
-    Parameters
-    ----------
-    n_neighbors : int, default=1
-        Number of neighbors to use for classification.
-
-    Attributes
-    ----------
-    X_ : ndarray of shape (n_samples, n_features)
-        The input samples.
-    y_ : ndarray of shape (n_samples,)
-        The target values.
-    classes_ : ndarray of shape (n_classes,)
-        The unique classes labels.
-    n_features_in_ : int
-        Number of features seen during fit.
-    _fit_X : ndarray of shape (n_samples, n_features)
-        Validated training data.
-    _y : ndarray of shape (n_samples,)
-        Validated target values.
-    """
-
     def __init__(self, n_neighbors=1):
-        """Initialize the classifier.
-
-        Parameters
-        ----------
-        n_neighbors : int, default=1
-            Number of neighbors to use.
-        """
         self.n_neighbors = n_neighbors
 
     def fit(self, X, y):
-        """Fit the model using X as training data and y as target values.
-
-        Parameters
-        ----------
-        X : {array-like, sparse matrix} of shape (n_samples, n_features)
-            Training data.
-        y : array-like of shape (n_samples,)
-            Target values.
-
-        Returns
-        -------
-        self : KNearestNeighbors
-            The fitted classifier.
-        """
-        # Input validation
-        X, y = check_X_y(
-            X, y,
-            ensure_2d=True,
-            allow_nd=False,
-            dtype=[np.float64, np.float32],
-            force_all_finite=True
-        )
-
-        # Check that X and y have correct shape
-        if X.shape[0] != y.shape[0]:
-            raise ValueError(
-                f"Found input variables with inconsistent numbers of samples: "
-                f"{[X.shape[0], y.shape[0]]}"
-            )
-
-        # Validate n_neighbors
-        if self.n_neighbors < 1:
-            raise ValueError(
-                f"Expected n_neighbors > 0, got {self.n_neighbors}"
-            )
-        n_samples = _num_samples(X)
-        if self.n_neighbors > n_samples:
-            raise ValueError(
-                f"Expected n_neighbors <= n_samples, got n_neighbors = "
-                f"{self.n_neighbors}, n_samples = {n_samples}"
-            )
-
-        check_classification_targets(y)
-
-        self._fit_X = X
+        X, y = check_X_y(X, y, ensure_min_samples=1)
+        self.classes_ = np.unique(y)
         self.X_ = X
+        self.y_ = y
         self.n_features_in_ = X.shape[1]
-
-        # Encode labels
-        self._le = LabelEncoder()
-        self._y = self._le.fit_transform(y)
-        self.classes_ = self._le.classes_
-
         return self
 
     def predict(self, X):
-        """Predict the class labels for the provided data.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            Test samples.
-
-        Returns
-        -------
-        y_pred : ndarray of shape (n_samples,)
-            Class labels for each data sample.
-
-        Raises
-        ------
-        ValueError
-            If the number of features in X doesn't match the training data.
-        """
-        # Check if fit has been called
-        check_is_fitted(
-            self,
-            ["_fit_X", "_y", "n_features_in_", "classes_"]
-        )
-
-        # Input validation
-        X = check_array(
-            X,
-            accept_sparse=False,
-            dtype=np.float64,
-            order="C",
-            ensure_2d=True,
-            force_all_finite=True
-        )
-
-        # Check feature size consistency
+        check_is_fitted(self)
+        X = check_array(X)
+        
         if X.shape[1] != self.n_features_in_:
-            raise ValueError(
-                f"X has {X.shape[1]} features, but this "
-                f"KNearestNeighbors is expecting {self.n_features_in_} features"
-            )
-
-        # Compute distances and find nearest neighbors
-        distances = pairwise_distances(X, self._fit_X)
-        neigh_ind = np.argpartition(
-            distances,
-            min(self.n_neighbors - 1, len(self._y) - 1),
-            axis=1
-        )[:, :self.n_neighbors]
-
-        # Get labels of nearest neighbors
-        neigh_labels = self._y[neigh_ind]
-
-        # Predict by majority voting
-        y_pred = np.zeros(X.shape[0], dtype=self._y.dtype)
-        for i in range(X.shape[0]):
-            counts = np.bincount(neigh_labels[i])
-            y_pred[i] = counts.argmax()
-
-        return self._le.inverse_transform(y_pred)
-
-    def score(self, X, y):
-        """Return the mean accuracy on the given test data and labels.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            Test samples.
-        y : array-like of shape (n_samples,)
-            True labels for X.
-
-        Returns
-        -------
-        score : float
-            Mean accuracy of self.predict(X) with respect to y.
-        """
-        # Check that X and y have correct shape
-        X = check_array(X, accept_sparse=False, ensure_2d=True)
-        y = check_array(y, ensure_2d=False, ensure_min_samples=0)
-
-        if X.shape[0] != y.shape[0]:
-            raise ValueError(
-                f"Found input variables with inconsistent numbers of samples: "
-                f"{[X.shape[0], y.shape[0]]}"
-            )
-
-        return np.mean(self.predict(X) == y)
-
+            raise ValueError(f"X has {X.shape[1]} features, expected {self.n_features_in_}")
+        
+        distances = pairwise_distances(X, self.X_)
+        nearest_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors]
+        
+        predictions = []
+        for neighbors in nearest_neighbors:
+            neighbor_labels = self.y_[neighbors]
+            most_common = np.bincount(neighbor_labels).argmax()
+            predictions.append(most_common)
+        
+        return np.array(predictions)
 
 class MonthlySplit(BaseCrossValidator):
-    """Monthly cross-validation splitter.
-
-    Parameters
-    ----------
-    time_col : str, default='index'
-        Column name containing datetime values. If 'index', uses the index.
-    """
-
     def __init__(self, time_col='index'):
-        """Initialize the splitter.
-
-        Parameters
-        ----------
-        time_col : str, default='index'
-            Column containing datetime values or 'index'.
-        """
         self.time_col = time_col
 
-    def get_n_splits(self, X=None, y=None, groups=None):
-        """Return the number of splitting iterations.
-
-        Parameters
-        ----------
-        X : pd.DataFrame, required
-            Training data.
-        y : array-like, default=None
-            Always ignored, exists for compatibility.
-        groups : array-like, default=None
-            Always ignored, exists for compatibility.
-
-        Returns
-        -------
-        n_splits : int
-            Returns the number of splitting iterations.
-        """
-        time_data = self._get_time_data(X)
-        n_months = time_data.dt.to_period('M').nunique()
-        return max(0, n_months - 1)
-
     def split(self, X, y=None, groups=None):
-        """Generate indices to split data into training and test set.
-
-        Parameters
-        ----------
-        X : pd.DataFrame
-            Training data.
-        y : array-like, default=None
-            Always ignored, exists for compatibility.
-        groups : array-like, default=None
-            Always ignored, exists for compatibility.
-
-        Yields
-        ------
-        train : ndarray
-            Training set indices.
-        test : ndarray
-            Test set indices.
-        """
-        time_data = self._get_time_data(X)
-        months = time_data.dt.to_period('M')
+        times = self._get_time_data(X)
+        months = times.dt.to_period('M')
         unique_months = sorted(months.unique())
 
         for i in range(len(unique_months) - 1):
-            train_idx = np.where(months == unique_months[i])[0]
-            test_idx = np.where(months == unique_months[i + 1])[0]
-            yield train_idx, test_idx
-
-    def _get_time_data(self, X):
-        """Get datetime data from DataFrame.
+            train_mask = months == unique_months[i]
+            test_mask = months == unique_months[i + 1]
+            yield np.where(train_mask)[0], np.where(test_mask)[0]
 
-        Parameters
-        ----------
-        X : pd.DataFrame
-            Input data.
+    def get_n_splits(self, X=None, y=None, groups=None):
+        if X is None:
+            raise ValueError("X cannot be None")
+        times = self._get_time_data(X)
+        return times.dt.to_period('M').nunique() - 1
 
-        Returns
-        -------
-        pd.Series
-            Series containing datetime values.
-        """
+    def _get_time_data(self, X):
         if self.time_col == 'index':
             if not isinstance(X.index, pd.DatetimeIndex):
                 raise ValueError("Index must be DatetimeIndex when time_col='index'")
             return pd.Series(X.index)
-
+        
         if self.time_col not in X.columns:
-            raise ValueError(f"Column {self.time_col} not found in X")
-
+            raise ValueError(f"Column {self.time_col} not found")
+            
         time_values = X[self.time_col]
         if not pd.api.types.is_datetime64_any_dtype(time_values):
             raise ValueError(f"Column {self.time_col} must be datetime type")
-
+            
         return time_values

From 3de01d6a3be8a124aa022228f7f07161cd2babf3 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 15:16:40 +0100
Subject: [PATCH 24/27] Update sklearn_questions.py

---
 sklearn_questions.py | 48 ++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 73b7be9..148eedb 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -53,12 +53,7 @@
 import pandas as pd
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.metrics.pairwise import pairwise_distances
-from sklearn.utils.validation import (
-    check_X_y, check_array, check_is_fitted, _check_sample_weight,
-    _num_samples
-)
-from sklearn.utils.multiclass import check_classification_targets
-from sklearn.preprocessing import LabelEncoder
+from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.model_selection import BaseCrossValidator
 
 
@@ -67,40 +62,45 @@ def __init__(self, n_neighbors=1):
         self.n_neighbors = n_neighbors
 
     def fit(self, X, y):
-        X, y = check_X_y(X, y, ensure_min_samples=1)
-        self.classes_ = np.unique(y)
+        # Validation stricte des entrées
+        X, y = check_X_y(X, y, ensure_min_samples=2)
+        self._validate_params()
+        
         self.X_ = X
         self.y_ = y
+        self.classes_ = np.unique(y)
         self.n_features_in_ = X.shape[1]
+        
         return self
 
+    def _validate_params(self):
+        if not isinstance(self.n_neighbors, int) or self.n_neighbors < 1:
+            raise ValueError("n_neighbors must be a positive integer")
+
     def predict(self, X):
         check_is_fitted(self)
-        X = check_array(X)
+        X = check_array(X, ensure_2d=True)
         
         if X.shape[1] != self.n_features_in_:
             raise ValueError(f"X has {X.shape[1]} features, expected {self.n_features_in_}")
         
         distances = pairwise_distances(X, self.X_)
-        nearest_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors]
+        neigh_idx = np.argpartition(distances, self.n_neighbors-1, axis=1)[:, :self.n_neighbors]
         
-        predictions = []
-        for neighbors in nearest_neighbors:
-            neighbor_labels = self.y_[neighbors]
-            most_common = np.bincount(neighbor_labels).argmax()
-            predictions.append(most_common)
-        
-        return np.array(predictions)
+        return np.array([
+            np.bincount(self.y_[idx]).argmax()
+            for idx in neigh_idx
+        ])
 
 class MonthlySplit(BaseCrossValidator):
     def __init__(self, time_col='index'):
         self.time_col = time_col
 
     def split(self, X, y=None, groups=None):
-        times = self._get_time_data(X)
-        months = times.dt.to_period('M')
+        time_data = self._get_time_data(X)
+        months = time_data.dt.to_period('M')
         unique_months = sorted(months.unique())
-
+        
         for i in range(len(unique_months) - 1):
             train_mask = months == unique_months[i]
             test_mask = months == unique_months[i + 1]
@@ -109,8 +109,8 @@ def split(self, X, y=None, groups=None):
     def get_n_splits(self, X=None, y=None, groups=None):
         if X is None:
             raise ValueError("X cannot be None")
-        times = self._get_time_data(X)
-        return times.dt.to_period('M').nunique() - 1
+        time_data = self._get_time_data(X)
+        return time_data.dt.to_period('M').nunique() - 1
 
     def _get_time_data(self, X):
         if self.time_col == 'index':
@@ -120,9 +120,9 @@ def _get_time_data(self, X):
         
         if self.time_col not in X.columns:
             raise ValueError(f"Column {self.time_col} not found")
-            
+        
         time_values = X[self.time_col]
         if not pd.api.types.is_datetime64_any_dtype(time_values):
             raise ValueError(f"Column {self.time_col} must be datetime type")
-            
+        
         return time_values

From 8cd1bf5db1eba94a6ac0ab5857ca8ac6aceaaec3 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 15:34:50 +0100
Subject: [PATCH 25/27] Update sklearn_questions.py

---
 sklearn_questions.py | 153 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 134 insertions(+), 19 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 148eedb..d1ca77c 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -49,23 +49,64 @@
 """
 
 
+"""Implementation of KNN classifier and monthly split cross-validator."""
+
+
 import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.metrics.pairwise import pairwise_distances
-from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.model_selection import BaseCrossValidator
+from sklearn.utils.validation import (
+    check_X_y,
+    check_array,
+    check_is_fitted
+)
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
+    """K-nearest neighbors classifier.
+
+    Parameters
+    ----------
+    n_neighbors : int, default=1
+        Number of neighbors to use for classification.
+
+    Attributes
+    ----------
+    X_ : ndarray of shape (n_samples, n_features)
+        Training data.
+    y_ : ndarray of shape (n_samples,)
+        Target values.
+    classes_ : ndarray
+        Unique classes in the training data.
+    n_features_in_ : int
+        Number of features seen during fit.
+    """
+
     def __init__(self, n_neighbors=1):
+        """Initialize the classifier."""
         self.n_neighbors = n_neighbors
 
     def fit(self, X, y):
-        # Validation stricte des entrées
-        X, y = check_X_y(X, y, ensure_min_samples=2)
-        self._validate_params()
+        """Fit the model using X as training data and y as target values.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data.
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        X, y = check_X_y(X, y)
         
+        if self.n_neighbors < 1:
+            raise ValueError(f"n_neighbors must be >= 1. Got {self.n_neighbors}")
+            
         self.X_ = X
         self.y_ = y
         self.classes_ = np.unique(y)
@@ -73,30 +114,71 @@ def fit(self, X, y):
         
         return self
 
-    def _validate_params(self):
-        if not isinstance(self.n_neighbors, int) or self.n_neighbors < 1:
-            raise ValueError("n_neighbors must be a positive integer")
-
     def predict(self, X):
+        """Predict the class labels for the provided data.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Test samples.
+
+        Returns
+        -------
+        y_pred : ndarray of shape (n_samples,)
+            Class labels for each data sample.
+        """
         check_is_fitted(self)
-        X = check_array(X, ensure_2d=True)
+        X = check_array(X)
         
         if X.shape[1] != self.n_features_in_:
-            raise ValueError(f"X has {X.shape[1]} features, expected {self.n_features_in_}")
-        
-        distances = pairwise_distances(X, self.X_)
-        neigh_idx = np.argpartition(distances, self.n_neighbors-1, axis=1)[:, :self.n_neighbors]
+            raise ValueError(
+                f"X has {X.shape[1]} features, but KNearestNeighbors "
+                f"is expecting {self.n_features_in_} features"
+            )
+            
+        distances = ((X[:, np.newaxis, :] - self.X_) ** 2).sum(axis=2)
+        indices = np.argpartition(distances, self.n_neighbors-1)[:, :self.n_neighbors]
         
-        return np.array([
+        predictions = np.array([
             np.bincount(self.y_[idx]).argmax()
-            for idx in neigh_idx
+            for idx in indices
         ])
+        
+        return predictions
+
 
 class MonthlySplit(BaseCrossValidator):
+    """Monthly cross-validation splitter.
+
+    Parameters
+    ----------
+    time_col : str, default='index'
+        Column name containing datetime values.
+    """
+
     def __init__(self, time_col='index'):
+        """Initialize the splitter."""
         self.time_col = time_col
 
     def split(self, X, y=None, groups=None):
+        """Generate indices to split data into training and validation set.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data.
+        y : array-like of shape (n_samples,)
+            Always ignored, exists for compatibility.
+        groups : array-like
+            Always ignored, exists for compatibility.
+
+        Yields
+        ------
+        train : ndarray
+            Training set indices.
+        test : ndarray
+            Test set indices.
+        """
         time_data = self._get_time_data(X)
         months = time_data.dt.to_period('M')
         unique_months = sorted(months.unique())
@@ -107,22 +189,55 @@ def split(self, X, y=None, groups=None):
             yield np.where(train_mask)[0], np.where(test_mask)[0]
 
     def get_n_splits(self, X=None, y=None, groups=None):
+        """Returns the number of splitting iterations.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data.
+        y : array-like of shape (n_samples,)
+            Always ignored, exists for compatibility.
+        groups : array-like
+            Always ignored, exists for compatibility.
+
+        Returns
+        -------
+        n_splits : int
+            Returns the number of splitting iterations.
+        """
         if X is None:
             raise ValueError("X cannot be None")
         time_data = self._get_time_data(X)
         return time_data.dt.to_period('M').nunique() - 1
 
     def _get_time_data(self, X):
+        """Extract datetime data from DataFrame.
+
+        Parameters
+        ----------
+        X : DataFrame
+            Input data.
+
+        Returns
+        -------
+        pd.Series
+            Series containing datetime values.
+
+        Raises
+        ------
+        ValueError
+            If time column is not found or not datetime type.
+        """
         if self.time_col == 'index':
             if not isinstance(X.index, pd.DatetimeIndex):
                 raise ValueError("Index must be DatetimeIndex when time_col='index'")
             return pd.Series(X.index)
-        
+            
         if self.time_col not in X.columns:
             raise ValueError(f"Column {self.time_col} not found")
-        
+            
         time_values = X[self.time_col]
         if not pd.api.types.is_datetime64_any_dtype(time_values):
             raise ValueError(f"Column {self.time_col} must be datetime type")
-        
+            
         return time_values

From de0b3cc15106341c74beb92f750d9202127dc8b3 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 15:51:11 +0100
Subject: [PATCH 26/27] Update sklearn_questions.py

---
 sklearn_questions.py | 208 +++++++++++++++++++------------------------
 1 file changed, 92 insertions(+), 116 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index d1ca77c..991f040 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -56,140 +56,106 @@
 import pandas as pd
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.model_selection import BaseCrossValidator
-from sklearn.utils.validation import (
-    check_X_y,
-    check_array,
-    check_is_fitted
-)
+from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
+from sklearn.utils.multiclass import check_classification_targets
+from sklearn.metrics.pairwise import pairwise_distances
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
-    """K-nearest neighbors classifier.
-
-    Parameters
-    ----------
-    n_neighbors : int, default=1
-        Number of neighbors to use for classification.
-
-    Attributes
-    ----------
-    X_ : ndarray of shape (n_samples, n_features)
-        Training data.
-    y_ : ndarray of shape (n_samples,)
-        Target values.
-    classes_ : ndarray
-        Unique classes in the training data.
-    n_features_in_ : int
-        Number of features seen during fit.
-    """
+    """KNearestNeighbors classifier."""
 
     def __init__(self, n_neighbors=1):
-        """Initialize the classifier."""
+        """Initialize the classifier with the number of neighbors."""
         self.n_neighbors = n_neighbors
 
     def fit(self, X, y):
-        """Fit the model using X as training data and y as target values.
+        """
+        Fit the model using X as training data and y as target values.
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        X : ndarray of shape (n_samples, n_features)
             Training data.
-        y : array-like of shape (n_samples,)
+        y : ndarray of shape (n_samples,)
             Target values.
 
         Returns
         -------
         self : object
-            Returns self.
+            Fitted estimator.
         """
         X, y = check_X_y(X, y)
-        
-        if self.n_neighbors < 1:
-            raise ValueError(f"n_neighbors must be >= 1. Got {self.n_neighbors}")
-            
+        check_classification_targets(y)
         self.X_ = X
         self.y_ = y
         self.classes_ = np.unique(y)
-        self.n_features_in_ = X.shape[1]
-        
+        self.is_fitted_ = True
         return self
 
     def predict(self, X):
-        """Predict the class labels for the provided data.
+        """
+        Predict the class labels for the provided data.
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        X : ndarray of shape (n_samples, n_features)
             Test samples.
 
         Returns
         -------
-        y_pred : ndarray of shape (n_samples,)
-            Class labels for each data sample.
+        y : ndarray of shape (n_samples,)
+            Predicted class labels.
         """
         check_is_fitted(self)
         X = check_array(X)
-        
-        if X.shape[1] != self.n_features_in_:
-            raise ValueError(
-                f"X has {X.shape[1]} features, but KNearestNeighbors "
-                f"is expecting {self.n_features_in_} features"
-            )
-            
-        distances = ((X[:, np.newaxis, :] - self.X_) ** 2).sum(axis=2)
-        indices = np.argpartition(distances, self.n_neighbors-1)[:, :self.n_neighbors]
-        
-        predictions = np.array([
-            np.bincount(self.y_[idx]).argmax()
-            for idx in indices
-        ])
-        
-        return predictions
+        codes = np.searchsorted(self.classes_, self.y_)  # labels -> indices
+        dist = pairwise_distances(X, self.X_)
+        near = np.argsort(dist, axis=1)[:, :self.n_neighbors]
+        return self.classes_[[np.bincount(codes[i]).argmax() for i in near]]
+
+    def score(self, X, y):
+        """Compute the classification accuracy on a labelled test set.
+
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples, n_features)
+            Samples to classify.
+        y : ndarray of shape (n_samples,)
+            Reference labels for X.
+
+        Returns
+        -------
+        score : float
+            Fraction of samples for which self.predict(X) equals y.
+        """
+        predicted = self.predict(X)
+        return np.mean(predicted == y)
 
 
 class MonthlySplit(BaseCrossValidator):
-    """Monthly cross-validation splitter.
+    """
+    Cross-validator based on monthly split.
+
+    Split data based on the given `time_col` (or default to index). Each split
+    corresponds to one month of data for the training and the next month of
+    data for the test.
 
     Parameters
     ----------
     time_col : str, default='index'
-        Column name containing datetime values.
+        Column of the input DataFrame that will be used to split the data. This
+        column should be of type datetime. If split is called with a DataFrame
+        for which this column is not a datetime, it will raise a ValueError.
+        To use the index as column just set `time_col` to `'index'`.
     """
 
     def __init__(self, time_col='index'):
-        """Initialize the splitter."""
+        """Initialize the cross-validator with the time column."""
         self.time_col = time_col
 
-    def split(self, X, y=None, groups=None):
-        """Generate indices to split data into training and validation set.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            Training data.
-        y : array-like of shape (n_samples,)
-            Always ignored, exists for compatibility.
-        groups : array-like
-            Always ignored, exists for compatibility.
-
-        Yields
-        ------
-        train : ndarray
-            Training set indices.
-        test : ndarray
-            Test set indices.
+    def get_n_splits(self, X, y=None, groups=None):
         """
-        time_data = self._get_time_data(X)
-        months = time_data.dt.to_period('M')
-        unique_months = sorted(months.unique())
-        
-        for i in range(len(unique_months) - 1):
-            train_mask = months == unique_months[i]
-            test_mask = months == unique_months[i + 1]
-            yield np.where(train_mask)[0], np.where(test_mask)[0]
-
-    def get_n_splits(self, X=None, y=None, groups=None):
-        """Returns the number of splitting iterations.
+        Return the number of splitting iterations in the cross-validator.
 
         Parameters
         ----------
@@ -197,47 +163,57 @@ def get_n_splits(self, X=None, y=None, groups=None):
             Training data.
         y : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
-        groups : array-like
+        groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Returns
         -------
         n_splits : int
-            Returns the number of splitting iterations.
+            The number of splits.
         """
-        if X is None:
-            raise ValueError("X cannot be None")
-        time_data = self._get_time_data(X)
-        return time_data.dt.to_period('M').nunique() - 1
+        dates = X.index if self.time_col == 'index' else X[self.time_col]
+        # Wrap the values: Series.to_period() would convert the Series index.
+        return max(len(pd.DatetimeIndex(dates).to_period('M').unique()) - 1, 0)
 
-    def _get_time_data(self, X):
-        """Extract datetime data from DataFrame.
+    def split(self, X, y=None, groups=None):
+        """
+        Generate indices to split data into training and test set.
 
         Parameters
         ----------
-        X : DataFrame
-            Input data.
-
-        Returns
-        -------
-        pd.Series
-            Series containing datetime values.
+        X : array-like of shape (n_samples, n_features)
+            Training data.
+        y : array-like of shape (n_samples,)
+            Always ignored, exists for compatibility.
+        groups : array-like of shape (n_samples,)
+            Always ignored, exists for compatibility.
 
-        Raises
+        Yields
         ------
-        ValueError
-            If time column is not found or not datetime type.
+        idx_train : ndarray
+            The training set indices for that split.
+        idx_test : ndarray
+            The testing set indices for that split.
         """
         if self.time_col == 'index':
-            if not isinstance(X.index, pd.DatetimeIndex):
-                raise ValueError("Index must be DatetimeIndex when time_col='index'")
-            return pd.Series(X.index)
-            
-        if self.time_col not in X.columns:
-            raise ValueError(f"Column {self.time_col} not found")
-            
-        time_values = X[self.time_col]
-        if not pd.api.types.is_datetime64_any_dtype(time_values):
-            raise ValueError(f"Column {self.time_col} must be datetime type")
-            
-        return time_values
+            dates = X.index
+        else:
+            if not pd.api.types.is_datetime64_any_dtype(X[self.time_col]):
+                raise ValueError('time_col must be of type datetime')
+            dates = X[self.time_col]
+
+        # Wrap the values in a DatetimeIndex: Series.to_period() converts the
+        # Series *index*, so a time column would be ignored (or raise).
+        periods = pd.DatetimeIndex(pd.to_datetime(dates)).to_period('M')
+        # Sort the months: rows may arrive shuffled, and unique() keeps
+        # first-appearance order, which would pair non-consecutive months.
+        unique_periods = periods.unique().sort_values()
+
+        # Train on one month, test on the one that follows it in the data.
+        for i in range(len(unique_periods) - 1):
+            train_mask = periods == unique_periods[i]
+            test_mask = periods == unique_periods[i + 1]
+            yield (
+                np.where(train_mask)[0],
+                np.where(test_mask)[0]
+            )

From 1be4c2d74c6ea685291f749980df9aac752dac78 Mon Sep 17 00:00:00 2001
From: docloukman <165569528+docloukman@users.noreply.github.com>
Date: Sun, 22 Dec 2024 15:53:13 +0100
Subject: [PATCH 27/27] Update sklearn_questions.py

---
 sklearn_questions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 991f040..e9ecdb6 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -89,6 +89,7 @@ def fit(self, X, y):
         self.X_ = X
         self.y_ = y
         self.classes_ = np.unique(y)
+        self.n_features_in_ = X.shape[1]
         self.is_fitted_ = True
         return self