From 802c0293ed7b10c18e8af04ad8b2a7d4d9bbf773 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sat, 21 Dec 2024 02:14:32 +0100 Subject: [PATCH 01/27] Update sklearn_questions.py --- sklearn_questions.py | 287 ++++++++++++++++++++++++------------------- 1 file changed, 158 insertions(+), 129 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index fa02e0d..8aaacdf 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -1,188 +1,217 @@ -"""Assignment - making a sklearn estimator and cv splitter. - -The goal of this assignment is to implement by yourself: - -- a scikit-learn estimator for the KNearestNeighbors for classification - tasks and check that it is working properly. -- a scikit-learn CV splitter where the splits are based on a Pandas - DateTimeIndex. - -Detailed instructions for question 1: -The nearest neighbor classifier predicts for a point X_i the target y_k of -the training sample X_k which is the closest to X_i. We measure proximity with -the Euclidean distance. The model will be evaluated with the accuracy (average -number of samples corectly classified). You need to implement the `fit`, -`predict` and `score` methods for this class. The code you write should pass -the test we implemented. You can run the tests by calling at the root of the -repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a -scikit-learn estimator needs to check that the input given to `fit` and -`predict` are correct using the `check_*` functions imported in the file. -You can find more information on how they should be used in the following doc: -https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator. -Make sure to use them to pass `test_nearest_neighbor_check_estimator`. - - -Detailed instructions for question 2: -The data to split should contain the index or one column in -datatime format. Then the aim is to split the data between train and test -sets when for each pair of successive months, we learn on the first and -predict of the following. For example if you have data distributed from -november 2020 to march 2021, you have have 4 splits. The first split -will allow to learn on november data and predict on december data, the -second split to learn december and predict on january etc. - -We also ask you to respect the pep8 convention: https://pep8.org. This will be -enforced with `flake8`. You can check that there is no flake8 errors by -calling `flake8` at the root of the repo. - -Finally, you need to write docstrings for the methods you code and for the -class. The docstring will be checked using `pydocstyle` that you can also -call at the root of the repo. - -Hints ------ -- You can use the function: - -from sklearn.metrics.pairwise import pairwise_distances - -to compute distances between 2 sets of samples. -""" -import numpy as np -import pandas as pd - -from sklearn.base import BaseEstimator -from sklearn.base import ClassifierMixin +class KNearestNeighbors(BaseEstimator, ClassifierMixin): + """ + K-Nearest Neighbors classifier. -from sklearn.model_selection import BaseCrossValidator + Parameters + ---------- + n_neighbors : int, default=1 + Number of neighbors to use for predictions. -from sklearn.utils.validation import check_X_y, check_is_fitted -from sklearn.utils.validation import check_array -from sklearn.utils.multiclass import check_classification_targets -from sklearn.metrics.pairwise import pairwise_distances + Attributes + ---------- + X_ : ndarray of shape (n_samples, n_features) + Training data stored during fit. + y_ : ndarray of shape (n_samples,) + Labels stored during fit. -class KNearestNeighbors(BaseEstimator, ClassifierMixin): - """KNearestNeighbors classifier.""" + n_features_in_ : int + Number of features in the training data. + """ - def __init__(self, n_neighbors=1): # noqa: D107 + def __init__(self, n_neighbors=1): self.n_neighbors = n_neighbors def fit(self, X, y): - """Fitting function. + """ + Fit the KNN classifier on training data. - Parameters + Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Data to train the model. - y : ndarray, shape (n_samples,) - Labels associated with the training data. + X : ndarray of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target labels for training data. Returns - ---------- - self : instance of KNearestNeighbors - The current instance of the classifier + ------- + self : object + Returns the instance itself. """ + # Validate input + X, y = check_X_y(X, y) + check_classification_targets(y) + + self.X_ = X + self.y_ = y + self.n_features_in_ = X.shape[1] return self def predict(self, X): - """Predict function. + """ + Predict the class labels for the given data. Parameters ---------- - X : ndarray, shape (n_test_samples, n_features) - Data to predict on. + X : ndarray of shape (n_samples, n_features) + Test data. Returns - ---------- - y : ndarray, shape (n_test_samples,) - Predicted class labels for each test data sample. + ------- + y_pred : ndarray of shape (n_samples,) + Predicted class labels. """ - y_pred = np.zeros(X.shape[0]) - return y_pred + # Check if the classifier has been fitted + check_is_fitted(self, ["X_", "y_"]) + + # Validate input + X = check_array(X) + + # Compute distances and predict + distances = pairwise_distances(X, self.X_) + nearest_indices = np.argmin(distances, axis=1) + return self.y_[nearest_indices] def score(self, X, y): - """Calculate the score of the prediction. + """ + Compute the accuracy of the classifier. Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Data to score on. - y : ndarray, shape (n_samples,) - target values. + X : ndarray of shape (n_samples, n_features) + Test data. + + y : ndarray of shape (n_samples,) + True labels for test data. Returns - ---------- + ------- score : float - Accuracy of the model computed for the (X, y) pairs. + Mean accuracy of predictions. """ - return 0. + y_pred = self.predict(X) + return np.mean(y_pred == y) +Updated MonthlySplit Class +python +Copier le code +from sklearn.model_selection import BaseCrossValidator +import numpy as np +import pandas as pd class MonthlySplit(BaseCrossValidator): - """CrossValidator based on monthly split. - - Split data based on the given `time_col` (or default to index). Each split - corresponds to one month of data for the training and the next month of - data for the test. + """ + Cross-validator that splits data based on months. Parameters ---------- - time_col : str, defaults to 'index' - Column of the input DataFrame that will be used to split the data. This - column should be of type datetime. If split is called with a DataFrame - for which this column is not a datetime, it will raise a ValueError. - To use the index as column just set `time_col` to `'index'`. + time_col : str, default='index' + Column to use for date-based splitting. If 'index', the index of the + DataFrame is used as the date column. + + Methods + ------- + get_n_splits(X, y=None, groups=None) + Return the number of splits. + + split(X, y=None, groups=None) + Generate indices for training and testing splits. + + Raises + ------ + ValueError + If the `time_col` is not found or not a datetime type. """ - def __init__(self, time_col='index'): # noqa: D107 + def __init__(self, time_col='index'): self.time_col = time_col def get_n_splits(self, X, y=None, groups=None): - """Return the number of splitting iterations in the cross-validator. + """ + Return the number of splitting iterations in the cross-validator. Parameters ---------- - X : array-like of shape (n_samples, n_features) - Training data, where `n_samples` is the number of samples - and `n_features` is the number of features. - y : array-like of shape (n_samples,) - Always ignored, exists for compatibility. - groups : array-like of shape (n_samples,) - Always ignored, exists for compatibility. + X : DataFrame + Input data with datetime information. + + y : None + Ignored, exists for API compatibility. + + groups : None + Ignored, exists for API compatibility. Returns ------- n_splits : int - The number of splits. + Number of month-based splits. """ - return 0 + time_data = self._get_time_data(X) + return len(time_data.dt.to_period("M").unique()) - 1 - def split(self, X, y, groups=None): - """Generate indices to split data into training and test set. + def split(self, X, y=None, groups=None): + """ + Generate indices to split data into training and test set. Parameters ---------- - X : array-like of shape (n_samples, n_features) - Training data, where `n_samples` is the number of samples - and `n_features` is the number of features. - y : array-like of shape (n_samples,) - Always ignored, exists for compatibility. - groups : array-like of shape (n_samples,) - Always ignored, exists for compatibility. + X : DataFrame + Input data with datetime information. + + y : None + Ignored, exists for API compatibility. + + groups : None + Ignored, exists for API compatibility. Yields ------ - idx_train : ndarray - The training set indices for that split. - idx_test : ndarray - The testing set indices for that split. + train_indices : ndarray + Indices for training data. + + test_indices : ndarray + Indices for testing data. """ + time_data = self._get_time_data(X) + months = time_data.dt.to_period("M").unique() + + for i in range(len(months) - 1): + train_mask = time_data.dt.to_period("M") == months[i] + test_mask = time_data.dt.to_period("M") == months[i + 1] - n_samples = X.shape[0] - n_splits = self.get_n_splits(X, y, groups) - for i in range(n_splits): - idx_train = range(n_samples) - idx_test = range(n_samples) - yield ( - idx_train, idx_test - ) + train_indices = np.where(train_mask)[0] + test_indices = np.where(test_mask)[0] + + yield train_indices, test_indices + + def _get_time_data(self, X): + """ + Extract the datetime data from the specified column or index. + + Parameters + ---------- + X : DataFrame + Input data. + + Returns + ------- + time_data : Series + Series of datetime values. + + Raises + ------ + ValueError + If the column is not found or is not datetime-like. + """ + if self.time_col == 'index': + if not isinstance(X.index, pd.DatetimeIndex): + raise ValueError("Index must be a DatetimeIndex.") + return X.index + elif self.time_col in X.columns: + time_data = X[self.time_col] + if not np.issubdtype(time_data.dtype, np.datetime64): + raise ValueError(f"Column {self.time_col} must be of datetime type.") + return time_data + else: + raise ValueError(f"Column {self.time_col} not found in input data.") From 9bbfc1d28868cd7859b0b27e545e0e49fba3cf74 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sat, 21 Dec 2024 02:17:22 +0100 Subject: [PATCH 02/27] Update sklearn_questions.py --- sklearn_questions.py | 63 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/sklearn_questions.py b/sklearn_questions.py index 8aaacdf..95c5868 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -1,3 +1,66 @@ +"""Assignment - making a sklearn estimator and cv splitter. + +The goal of this assignment is to implement by yourself: + +- a scikit-learn estimator for the KNearestNeighbors for classification + tasks and check that it is working properly. +- a scikit-learn CV splitter where the splits are based on a Pandas + DateTimeIndex. + +Detailed instructions for question 1: +The nearest neighbor classifier predicts for a point X_i the target y_k of +the training sample X_k which is the closest to X_i. We measure proximity with +the Euclidean distance. The model will be evaluated with the accuracy (average +number of samples corectly classified). You need to implement the `fit`, +`predict` and `score` methods for this class. The code you write should pass +the test we implemented. You can run the tests by calling at the root of the +repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a +scikit-learn estimator needs to check that the input given to `fit` and +`predict` are correct using the `check_*` functions imported in the file. +You can find more information on how they should be used in the following doc: +https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator. +Make sure to use them to pass `test_nearest_neighbor_check_estimator`. + + +Detailed instructions for question 2: +The data to split should contain the index or one column in +datatime format. Then the aim is to split the data between train and test +sets when for each pair of successive months, we learn on the first and +predict of the following. For example if you have data distributed from +november 2020 to march 2021, you have have 4 splits. The first split +will allow to learn on november data and predict on december data, the +second split to learn december and predict on january etc. + +We also ask you to respect the pep8 convention: https://pep8.org. This will be +enforced with `flake8`. You can check that there is no flake8 errors by +calling `flake8` at the root of the repo. + +Finally, you need to write docstrings for the methods you code and for the +class. The docstring will be checked using `pydocstyle` that you can also +call at the root of the repo. + +Hints +----- +- You can use the function: + +from sklearn.metrics.pairwise import pairwise_distances + +to compute distances between 2 sets of samples. +""" +import numpy as np +import pandas as pd + +from sklearn.base import BaseEstimator +from sklearn.base import ClassifierMixin + +from sklearn.model_selection import BaseCrossValidator + +from sklearn.utils.validation import check_X_y, check_is_fitted +from sklearn.utils.validation import check_array +from sklearn.utils.multiclass import check_classification_targets +from sklearn.metrics.pairwise import pairwise_distances + + class KNearestNeighbors(BaseEstimator, ClassifierMixin): """ K-Nearest Neighbors classifier. From f79352effd5db2dc91a990115dca4a6e64850c37 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sat, 21 Dec 2024 02:23:59 +0100 Subject: [PATCH 03/27] Update sklearn_questions.py --- sklearn_questions.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 95c5868..6d21e8a 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -46,7 +46,9 @@ from sklearn.metrics.pairwise import pairwise_distances to compute distances between 2 sets of samples. -""" +"""` + + import numpy as np import pandas as pd @@ -131,9 +133,8 @@ def predict(self, X): # Validate input X = check_array(X) - # Compute distances and predict - distances = pairwise_distances(X, self.X_) - nearest_indices = np.argmin(distances, axis=1) + # Compute nearest neighbors and predict + nearest_indices, _ = pairwise_distances_argmin_min(X, self.X_) return self.y_[nearest_indices] def score(self, X, y): @@ -155,12 +156,6 @@ def score(self, X, y): """ y_pred = self.predict(X) return np.mean(y_pred == y) -Updated MonthlySplit Class -python -Copier le code -from sklearn.model_selection import BaseCrossValidator -import numpy as np -import pandas as pd class MonthlySplit(BaseCrossValidator): @@ -187,9 +182,11 @@ class MonthlySplit(BaseCrossValidator): If the `time_col` is not found or not a datetime type. """ + def __init__(self, time_col='index'): self.time_col = time_col + def get_n_splits(self, X, y=None, groups=None): """ Return the number of splitting iterations in the cross-validator. @@ -213,6 +210,7 @@ def get_n_splits(self, X, y=None, groups=None): time_data = self._get_time_data(X) return len(time_data.dt.to_period("M").unique()) - 1 + def split(self, X, y=None, groups=None): """ Generate indices to split data into training and test set. @@ -248,6 +246,7 @@ def split(self, X, y=None, groups=None): yield train_indices, test_indices + def _get_time_data(self, X): """ Extract the datetime data from the specified column or index. From 4f98b3ff54201ca04362a1ed7582c153116adcad Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sat, 21 Dec 2024 02:29:56 +0100 Subject: [PATCH 04/27] Update sklearn_questions.py --- sklearn_questions.py | 69 ++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 6d21e8a..0322ec5 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -1,4 +1,5 @@ -"""Assignment - making a sklearn estimator and cv splitter. +""" +Assignment - making a sklearn estimator and CV splitter. The goal of this assignment is to implement by yourself: @@ -11,7 +12,7 @@ The nearest neighbor classifier predicts for a point X_i the target y_k of the training sample X_k which is the closest to X_i. We measure proximity with the Euclidean distance. The model will be evaluated with the accuracy (average -number of samples corectly classified). You need to implement the `fit`, +number of samples correctly classified). You need to implement the `fit`, `predict` and `score` methods for this class. The code you write should pass the test we implemented. You can run the tests by calling at the root of the repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a @@ -21,17 +22,16 @@ https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator. Make sure to use them to pass `test_nearest_neighbor_check_estimator`. - Detailed instructions for question 2: The data to split should contain the index or one column in -datatime format. Then the aim is to split the data between train and test +datetime format. Then the aim is to split the data between train and test sets when for each pair of successive months, we learn on the first and -predict of the following. For example if you have data distributed from -november 2020 to march 2021, you have have 4 splits. The first split -will allow to learn on november data and predict on december data, the -second split to learn december and predict on january etc. +predict on the following. For example if you have data distributed from +November 2020 to March 2021, you have have 4 splits. The first split +will allow to learn on November data and predict on December data, the +second split to learn December and predict on January etc. -We also ask you to respect the pep8 convention: https://pep8.org. This will be +We also ask you to respect the PEP8 convention: https://pep8.org. This will be enforced with `flake8`. You can check that there is no flake8 errors by calling `flake8` at the root of the repo. @@ -46,19 +46,14 @@ from sklearn.metrics.pairwise import pairwise_distances to compute distances between 2 sets of samples. -"""` - +""" import numpy as np import pandas as pd -from sklearn.base import BaseEstimator -from sklearn.base import ClassifierMixin - +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.model_selection import BaseCrossValidator - -from sklearn.utils.validation import check_X_y, check_is_fitted -from sklearn.utils.validation import check_array +from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.multiclass import check_classification_targets from sklearn.metrics.pairwise import pairwise_distances @@ -133,9 +128,19 @@ def predict(self, X): # Validate input X = check_array(X) - # Compute nearest neighbors and predict - nearest_indices, _ = pairwise_distances_argmin_min(X, self.X_) - return self.y_[nearest_indices] + # Compute distances + distances = pairwise_distances(X, self.X_, metric='euclidean') + + # Find the indices of the k nearest neighbors + neighbors_idx = np.argsort(distances, axis=1)[:, :self.n_neighbors] + + # Gather the neighbor labels + neighbor_labels = self.y_[neighbors_idx] + + # Predict by majority vote + y_pred = np.array([np.bincount(row.astype(int)).argmax() for row in neighbor_labels]) + + return y_pred def score(self, X, y): """ @@ -182,11 +187,9 @@ class MonthlySplit(BaseCrossValidator): If the `time_col` is not found or not a datetime type. """ - def __init__(self, time_col='index'): self.time_col = time_col - def get_n_splits(self, X, y=None, groups=None): """ Return the number of splitting iterations in the cross-validator. @@ -208,9 +211,9 @@ def get_n_splits(self, X, y=None, groups=None): Number of month-based splits. """ time_data = self._get_time_data(X) - return len(time_data.dt.to_period("M").unique()) - 1 + unique_months = time_data.dt.to_period("M").drop_duplicates() + return max(len(unique_months) - 1, 0) - def split(self, X, y=None, groups=None): """ Generate indices to split data into training and test set. @@ -235,18 +238,20 @@ def split(self, X, y=None, groups=None): Indices for testing data. """ time_data = self._get_time_data(X) - months = time_data.dt.to_period("M").unique() + unique_months = time_data.dt.to_period("M").drop_duplicates() + + for i in range(len(unique_months) - 1): + train_month = unique_months[i] + test_month = unique_months[i + 1] - for i in range(len(months) - 1): - train_mask = time_data.dt.to_period("M") == months[i] - test_mask = time_data.dt.to_period("M") == months[i + 1] + train_mask = time_data.dt.to_period("M") == train_month + test_mask = time_data.dt.to_period("M") == test_month train_indices = np.where(train_mask)[0] test_indices = np.where(test_mask)[0] yield train_indices, test_indices - def _get_time_data(self, X): """ Extract the datetime data from the specified column or index. @@ -269,11 +274,11 @@ def _get_time_data(self, X): if self.time_col == 'index': if not isinstance(X.index, pd.DatetimeIndex): raise ValueError("Index must be a DatetimeIndex.") - return X.index + return pd.Series(X.index) elif self.time_col in X.columns: time_data = X[self.time_col] if not np.issubdtype(time_data.dtype, np.datetime64): - raise ValueError(f"Column {self.time_col} must be of datetime type.") + raise ValueError(f"Column '{self.time_col}' must be of datetime type.") return time_data else: - raise ValueError(f"Column {self.time_col} not found in input data.") + raise ValueError(f"Column '{self.time_col}' not found in input data.") From 713b620d7c6bfdc099925cae4b7e304383bf893d Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sat, 21 Dec 2024 02:32:04 +0100 Subject: [PATCH 05/27] Update sklearn_questions.py --- sklearn_questions.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 0322ec5..51b537c 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -128,6 +128,13 @@ def predict(self, X): # Validate input X = check_array(X) + # Ensure the number of features matches + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"Number of features in X ({X.shape[1]}) does not match " + f"the number of features during fit ({self.n_features_in_})." + ) + # Compute distances distances = pairwise_distances(X, self.X_, metric='euclidean') @@ -138,7 +145,10 @@ def predict(self, X): neighbor_labels = self.y_[neighbors_idx] # Predict by majority vote - y_pred = np.array([np.bincount(row.astype(int)).argmax() for row in neighbor_labels]) + y_pred = np.array([ + np.bincount(row.astype(int)).argmax() if len(np.unique(row)) > 0 else 0 + for row in neighbor_labels + ]) return y_pred @@ -241,8 +251,8 @@ def split(self, X, y=None, groups=None): unique_months = time_data.dt.to_period("M").drop_duplicates() for i in range(len(unique_months) - 1): - train_month = unique_months[i] - test_month = unique_months[i + 1] + train_month = unique_months.iloc[i] + test_month = unique_months.iloc[i + 1] train_mask = time_data.dt.to_period("M") == train_month test_mask = time_data.dt.to_period("M") == test_month From e23c2d575e3c7f56f32ea23d3de08e435f4e1567 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sat, 21 Dec 2024 02:34:54 +0100 Subject: [PATCH 06/27] Update sklearn_questions.py --- sklearn_questions.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 51b537c..70c7d6d 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -53,7 +53,7 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.model_selection import BaseCrossValidator -from sklearn.utils.validation import check_X_y, check_array, check_is_fitted +from sklearn.utils.validation import validate_data, check_is_fitted from sklearn.utils.multiclass import check_classification_targets from sklearn.metrics.pairwise import pairwise_distances @@ -99,13 +99,18 @@ def fit(self, X, y): self : object Returns the instance itself. """ - # Validate input - X, y = check_X_y(X, y) + # Validate input and set n_features_in_ + X, y = validate_data( + self, X, y, + ensure_2d=True, + accept_sparse=False, + dtype=None, + reset=True + ) check_classification_targets(y) self.X_ = X self.y_ = y - self.n_features_in_ = X.shape[1] return self def predict(self, X): @@ -125,15 +130,14 @@ def predict(self, X): # Check if the classifier has been fitted check_is_fitted(self, ["X_", "y_"]) - # Validate input - X = check_array(X) - - # Ensure the number of features matches - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"Number of features in X ({X.shape[1]}) does not match " - f"the number of features during fit ({self.n_features_in_})." - ) + # Validate input, reset=False to keep n_features_in_ + X = validate_data( + self, X, + ensure_2d=True, + accept_sparse=False, + dtype=None, + reset=False + ) # Compute distances distances = pairwise_distances(X, self.X_, metric='euclidean') @@ -221,7 +225,7 @@ def get_n_splits(self, X, y=None, groups=None): Number of month-based splits. """ time_data = self._get_time_data(X) - unique_months = time_data.dt.to_period("M").drop_duplicates() + unique_months = time_data.dt.to_period("M").drop_duplicates().sort_values() return max(len(unique_months) - 1, 0) def split(self, X, y=None, groups=None): @@ -248,7 +252,7 @@ def split(self, X, y=None, groups=None): Indices for testing data. """ time_data = self._get_time_data(X) - unique_months = time_data.dt.to_period("M").drop_duplicates() + unique_months = time_data.dt.to_period("M").drop_duplicates().sort_values() for i in range(len(unique_months) - 1): train_month = unique_months.iloc[i] From 446117050a419e1c26c9e3b95c2fe037def66f6c Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sat, 21 Dec 2024 02:37:51 +0100 Subject: [PATCH 07/27] Update sklearn_questions.py --- sklearn_questions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 70c7d6d..346d156 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -101,10 +101,10 @@ def fit(self, X, y): """ # Validate input and set n_features_in_ X, y = validate_data( - self, X, y, - ensure_2d=True, + X, y, accept_sparse=False, dtype=None, + ensure_2d=True, reset=True ) check_classification_targets(y) @@ -132,10 +132,10 @@ def predict(self, X): # Validate input, reset=False to keep n_features_in_ X = validate_data( - self, X, - ensure_2d=True, + X, accept_sparse=False, dtype=None, + ensure_2d=True, reset=False ) From df3dd4f072eb635ff9f9c0cef48af7574b848cd9 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sat, 21 Dec 2024 02:40:26 +0100 Subject: [PATCH 08/27] Update sklearn_questions.py From 1b63f2904e7606e83784ea8251afdcd4c48fd565 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sat, 21 Dec 2024 02:41:22 +0100 Subject: [PATCH 09/27] Update sklearn_questions.py From 9a0fcc9042edfcff2821ba815601da54c01fb23e Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sat, 21 Dec 2024 02:43:18 +0100 Subject: [PATCH 10/27] Update sklearn_questions.py From 6b5d668a9cdce6bd172d64d5111185cf65b298be Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 00:38:00 +0100 Subject: [PATCH 11/27] Update sklearn_questions.py --- sklearn_questions.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn_questions.py b/sklearn_questions.py index 346d156..d8c84af 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -80,6 +80,14 @@ class KNearestNeighbors(BaseEstimator, ClassifierMixin): """ def __init__(self, n_neighbors=1): + """ + Initialize the KNearestNeighbors classifier. + + Parameters + ---------- + n_neighbors : int, default=1 + Number of neighbors to use for predictions. + """ self.n_neighbors = n_neighbors def fit(self, X, y): @@ -202,6 +210,15 @@ class MonthlySplit(BaseCrossValidator): """ def __init__(self, time_col='index'): + """ + Initialize the MonthlySplit cross-validator. + + Parameters + ---------- + time_col : str, default='index' + Column to use for date-based splitting. If 'index', the index of the + DataFrame is used as the date column. + """ self.time_col = time_col def get_n_splits(self, X, y=None, groups=None): From 6c1fa26b62ead1f2d7212bebc0e5f4494d3adb1a Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 01:02:04 +0100 Subject: [PATCH 12/27] Update sklearn_questions.py --- sklearn_questions.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index d8c84af..5f6472d 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -47,15 +47,14 @@ to compute distances between 2 sets of samples. """ - import numpy as np import pandas as pd - from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.model_selection import BaseCrossValidator +from sklearn.metrics.pairwise import pairwise_distances from sklearn.utils.validation import validate_data, check_is_fitted from sklearn.utils.multiclass import check_classification_targets -from sklearn.metrics.pairwise import pairwise_distances +from sklearn.model_selection import BaseCrossValidator +from sklearn.utils.validation import check_is_fitted class KNearestNeighbors(BaseEstimator, ClassifierMixin): From 5e704d250bacb1159558e850993edabbe96c09d4 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 12:13:34 +0100 Subject: [PATCH 13/27] Update sklearn_questions.py --- sklearn_questions.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 5f6472d..cbf2ffa 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -106,7 +106,7 @@ def fit(self, X, y): self : object Returns the instance itself. """ - # Validate input and set n_features_in_ + # Validate input data and ensure it's 2D X, y = validate_data( X, y, accept_sparse=False, @@ -114,6 +114,7 @@ def fit(self, X, y): ensure_2d=True, reset=True ) + # Validate that y contains classification targets check_classification_targets(y) self.X_ = X @@ -137,7 +138,7 @@ def predict(self, X): # Check if the classifier has been fitted check_is_fitted(self, ["X_", "y_"]) - # Validate input, reset=False to keep n_features_in_ + # Validate input data and ensure it's 2D X = validate_data( X, accept_sparse=False, @@ -146,10 +147,10 @@ def predict(self, X): reset=False ) - # Compute distances + # Compute pairwise distances between X and the training data distances = pairwise_distances(X, self.X_, metric='euclidean') - # Find the indices of the k nearest neighbors + # Find the indices of the k nearest neighbors for each sample neighbors_idx = np.argsort(distances, axis=1)[:, :self.n_neighbors] # Gather the neighbor labels From f6d4460f6a53ae95eae720f48d8bfb602de5bce4 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 12:26:49 +0100 Subject: [PATCH 14/27] Update sklearn_questions.py --- sklearn_questions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index cbf2ffa..581319e 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -47,14 +47,15 @@ to compute distances between 2 sets of samples. """ + + import numpy as np import pandas as pd from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.model_selection import BaseCrossValidator from sklearn.metrics.pairwise import pairwise_distances from sklearn.utils.validation import validate_data, check_is_fitted from sklearn.utils.multiclass import check_classification_targets -from sklearn.model_selection import BaseCrossValidator -from sklearn.utils.validation import check_is_fitted class KNearestNeighbors(BaseEstimator, ClassifierMixin): From cdb1590f168c89a9cb4fc8433116eb14ed7c0d47 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 12:38:59 +0100 Subject: [PATCH 15/27] Update sklearn_questions.py --- sklearn_questions.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 581319e..0994baf 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -56,6 +56,7 @@ from sklearn.metrics.pairwise import pairwise_distances from sklearn.utils.validation import validate_data, check_is_fitted from sklearn.utils.multiclass import check_classification_targets +from sklearn.preprocessing import LabelEncoder class KNearestNeighbors(BaseEstimator, ClassifierMixin): @@ -73,7 +74,13 @@ class KNearestNeighbors(BaseEstimator, ClassifierMixin): Training data stored during fit. y_ : ndarray of shape (n_samples,) - Labels stored during fit. + Encoded labels stored during fit. + + le_ : LabelEncoder + Label encoder fitted on y. + + classes_ : ndarray of shape (n_classes,) + Unique class labels. n_features_in_ : int Number of features in the training data. @@ -107,19 +114,28 @@ def fit(self, X, y): self : object Returns the instance itself. """ - # Validate input data and ensure it's 2D + # Validate input data and ensure X is 2D and y is 1D X, y = validate_data( X, y, accept_sparse=False, dtype=None, ensure_2d=True, + y_numeric=False, + multi_output=False, reset=True ) + # Validate that y contains classification targets check_classification_targets(y) + # Encode y to ensure it contains non-negative integers starting from 0 + self.le_ = LabelEncoder() + y_encoded = self.le_.fit_transform(y) + self.X_ = X - self.y_ = y + self.y_ = y_encoded + self.classes_ = self.le_.classes_ + return self def predict(self, X): @@ -137,7 +153,7 @@ def predict(self, X): Predicted class labels. """ # Check if the classifier has been fitted - check_is_fitted(self, ["X_", "y_"]) + check_is_fitted(self, ["X_", "y_", "le_"]) # Validate input data and ensure it's 2D X = validate_data( @@ -158,11 +174,14 @@ def predict(self, X): neighbor_labels = self.y_[neighbors_idx] # Predict by majority vote - y_pred = np.array([ - np.bincount(row.astype(int)).argmax() if len(np.unique(row)) > 0 else 0 + y_pred_encoded = np.array([ + np.bincount(row).argmax() if len(np.unique(row)) > 0 else 0 for row in neighbor_labels ]) + # Decode the encoded labels back to original labels + y_pred = self.le_.inverse_transform(y_pred_encoded) + return y_pred def score(self, X, y): From 789702c00bf348d0c6871e609c8707e60e17aa04 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 12:48:41 +0100 Subject: [PATCH 16/27] Update sklearn_questions.py --- sklearn_questions.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 0994baf..9e737b0 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -52,11 +52,13 @@ import numpy as np import pandas as pd from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.model_selection import BaseCrossValidator from sklearn.metrics.pairwise import pairwise_distances from sklearn.utils.validation import validate_data, check_is_fitted from sklearn.utils.multiclass import check_classification_targets from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import BaseCrossValidator +from sklearn.utils.estimator_checks import _check_feature_names, get_tags +from sklearn.utils.metaestimators import available_if class KNearestNeighbors(BaseEstimator, ClassifierMixin): @@ -116,11 +118,12 @@ def fit(self, X, y): """ # Validate input data and ensure X is 2D and y is 1D X, y = validate_data( - X, y, + self, + X, + y, accept_sparse=False, dtype=None, ensure_2d=True, - y_numeric=False, multi_output=False, reset=True ) @@ -157,6 +160,7 @@ def predict(self, X): # Validate input data and ensure it's 2D X = validate_data( + self, X, accept_sparse=False, dtype=None, From 2396a38c7a17a0e5ea0147904913ddbfc2476762 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 12:52:43 +0100 Subject: [PATCH 17/27] Update sklearn_questions.py --- sklearn_questions.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 9e737b0..a0a98a2 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -49,16 +49,15 @@ """ +# sklearn_questions.py + import numpy as np -import pandas as pd from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics.pairwise import pairwise_distances from sklearn.utils.validation import validate_data, check_is_fitted from sklearn.utils.multiclass import check_classification_targets from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import BaseCrossValidator -from sklearn.utils.estimator_checks import _check_feature_names, get_tags -from sklearn.utils.metaestimators import available_if class KNearestNeighbors(BaseEstimator, ClassifierMixin): From 99d2a53900b66013e957f55b9b1606b000dd3af7 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 13:19:07 +0100 Subject: [PATCH 18/27] Update sklearn_questions.py --- sklearn_questions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index a0a98a2..ffd36a4 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -49,18 +49,17 @@ """ -# sklearn_questions.py - import numpy as np -from sklearn.base import BaseEstimator, ClassifierMixin +import pandas as pd # Ensure pandas is imported +from sklearn.base import BaseEstimator, ClassifierMixin, get_tags from sklearn.metrics.pairwise import pairwise_distances -from sklearn.utils.validation import validate_data, check_is_fitted +from sklearn.utils.validation import validate_data, check_is_fitted, check_array from sklearn.utils.multiclass import check_classification_targets from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import BaseCrossValidator -class KNearestNeighbors(BaseEstimator, ClassifierMixin): +class KNearestNeighbors(ClassifierMixin, BaseEstimator): """ K-Nearest Neighbors classifier. @@ -164,6 +163,7 @@ def predict(self, X): accept_sparse=False, dtype=None, ensure_2d=True, + multi_output=False, reset=False ) From 0e48d48ba00ed11c72800f0a97160958b216df41 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 14:07:34 +0100 Subject: [PATCH 19/27] Update sklearn_questions.py --- sklearn_questions.py | 273 ++++++++++++++++++------------------------- 1 file changed, 115 insertions(+), 158 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index ffd36a4..3c574e9 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -50,289 +50,246 @@ import numpy as np -import pandas as pd # Ensure pandas is imported -from sklearn.base import BaseEstimator, ClassifierMixin, get_tags +import pandas as pd +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics.pairwise import pairwise_distances -from sklearn.utils.validation import validate_data, check_is_fitted, check_array +from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.multiclass import check_classification_targets from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import BaseCrossValidator -class KNearestNeighbors(ClassifierMixin, BaseEstimator): - """ - K-Nearest Neighbors classifier. +class KNearestNeighbors(BaseEstimator, ClassifierMixin): + """Classifier implementing the k-nearest neighbors algorithm. + + This classifier predicts the target of a test point based on the target + of its nearest neighbor in the training set, using Euclidean distance. Parameters ---------- n_neighbors : int, default=1 - Number of neighbors to use for predictions. + Number of neighbors to consider for prediction. Attributes ---------- X_ : ndarray of shape (n_samples, n_features) - Training data stored during fit. - + The input samples. y_ : ndarray of shape (n_samples,) - Encoded labels stored during fit. - - le_ : LabelEncoder - Label encoder fitted on y. - + The target values. classes_ : ndarray of shape (n_classes,) - Unique class labels. - - n_features_in_ : int - Number of features in the training data. + The unique classes labels. """ def __init__(self, n_neighbors=1): - """ - Initialize the KNearestNeighbors classifier. + """Initialize the KNearestNeighbors classifier. Parameters ---------- n_neighbors : int, default=1 - Number of neighbors to use for predictions. + Number of neighbors to use for prediction. """ self.n_neighbors = n_neighbors def fit(self, X, y): - """ - Fit the KNN classifier on training data. + """Fit the k-nearest neighbors classifier. Parameters ---------- - X : ndarray of shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data. - - y : ndarray of shape (n_samples,) - Target labels for training data. + y : array-like of shape (n_samples,) + Target values. Returns ------- self : object Returns the instance itself. """ - # Validate input data and ensure X is 2D and y is 1D - X, y = validate_data( - self, - X, - y, - accept_sparse=False, - dtype=None, - ensure_2d=True, - multi_output=False, - reset=True - ) - - # Validate that y contains classification targets + # Input validation using sklearn's check functions + X, y = check_X_y(X, y) check_classification_targets(y) - # Encode y to ensure it contains non-negative integers starting from 0 + # Store number of features for predict step validation + self.n_features_in_ = X.shape[1] + + # Encode class labels self.le_ = LabelEncoder() - y_encoded = self.le_.fit_transform(y) + y = self.le_.fit_transform(y) self.X_ = X - self.y_ = y_encoded + self.y_ = y self.classes_ = self.le_.classes_ return self def predict(self, X): - """ - Predict the class labels for the given data. + """Predict class labels for samples in X. Parameters ---------- - X : ndarray of shape (n_samples, n_features) - Test data. + X : array-like of shape (n_samples, n_features) + The data to predict. Returns ------- y_pred : ndarray of shape (n_samples,) - Predicted class labels. + The predicted class labels. """ - # Check if the classifier has been fitted - check_is_fitted(self, ["X_", "y_", "le_"]) - - # Validate input data and ensure it's 2D - X = validate_data( - self, - X, - accept_sparse=False, - dtype=None, - ensure_2d=True, - multi_output=False, - reset=False - ) + check_is_fitted(self) + X = check_array(X) - # Compute pairwise distances between X and the training data - distances = pairwise_distances(X, self.X_, metric='euclidean') + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"X has {X.shape[1]} features, but KNearestNeighbors " + f"was trained with {self.n_features_in_} features." + ) - # Find the indices of the k nearest neighbors for each sample - neighbors_idx = np.argsort(distances, axis=1)[:, :self.n_neighbors] + # Compute distances between test points and training points + distances = pairwise_distances(X, self.X_) - # Gather the neighbor labels - neighbor_labels = self.y_[neighbors_idx] + # Find indices of k nearest neighbors + k_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors] - # Predict by majority vote - y_pred_encoded = np.array([ - np.bincount(row).argmax() if len(np.unique(row)) > 0 else 0 - for row in neighbor_labels - ]) + # Get labels of k nearest neighbors + k_neighbors_labels = self.y_[k_neighbors] - # Decode the encoded labels back to original labels - y_pred = self.le_.inverse_transform(y_pred_encoded) + # Predict by majority voting + y_pred = np.apply_along_axis( + lambda x: np.bincount(x).argmax(), + axis=1, + arr=k_neighbors_labels + ) - return y_pred + return self.le_.inverse_transform(y_pred) def score(self, X, y): - """ - Compute the accuracy of the classifier. + """Return the mean accuracy on the given test data and labels. Parameters ---------- - X : ndarray of shape (n_samples, n_features) - Test data. - - y : ndarray of shape (n_samples,) - True labels for test data. + X : array-like of shape (n_samples, n_features) + Test samples. + y : array-like of shape (n_samples,) + True labels for X. Returns ------- score : float Mean accuracy of predictions. """ - y_pred = self.predict(X) - return np.mean(y_pred == y) + return np.mean(self.predict(X) == y) class MonthlySplit(BaseCrossValidator): - """ - Cross-validator that splits data based on months. + """Monthly cross-validation splitter. + + Provides train/test indices to split time series data between successive + months. For each split, test indices must be higher than before, and thus + shuffling in cross validator is inappropriate. Parameters ---------- time_col : str, default='index' - Column to use for date-based splitting. If 'index', the index of the - DataFrame is used as the date column. - - Methods - ------- - get_n_splits(X, y=None, groups=None) - Return the number of splits. - - split(X, y=None, groups=None) - Generate indices for training and testing splits. - - Raises - ------ - ValueError - If the `time_col` is not found or not a datetime type. + Column name containing datetime values. If 'index', the index is used. + + Examples + -------- + >>> import pandas as pd + >>> dates = pd.date_range('2020-01-01', '2020-03-31', freq='D') + >>> X = pd.DataFrame({'val': range(len(dates))}, index=dates) + >>> cv = MonthlySplit() + >>> for train_idx, test_idx in cv.split(X): + ... print(f"TRAIN:", X.index[train_idx].min(), X.index[train_idx].max()) + ... print(f"TEST:", X.index[test_idx].min(), X.index[test_idx].max()) """ def __init__(self, time_col='index'): - """ - Initialize the MonthlySplit cross-validator. + """Initialize the monthly splitter. Parameters ---------- time_col : str, default='index' - Column to use for date-based splitting. If 'index', the index of the - DataFrame is used as the date column. + Column containing datetime values or 'index'. """ self.time_col = time_col - def get_n_splits(self, X, y=None, groups=None): - """ - Return the number of splitting iterations in the cross-validator. + def get_n_splits(self, X=None, y=None, groups=None): + """Return the number of splitting iterations. Parameters ---------- - X : DataFrame - Input data with datetime information. - - y : None - Ignored, exists for API compatibility. - - groups : None - Ignored, exists for API compatibility. + X : pd.DataFrame + Training data. + y : array-like, default=None + Always ignored, exists for compatibility. + groups : array-like, default=None + Always ignored, exists for compatibility. Returns ------- n_splits : int - Number of month-based splits. + Returns the number of splitting iterations. """ time_data = self._get_time_data(X) - unique_months = time_data.dt.to_period("M").drop_duplicates().sort_values() + unique_months = time_data.dt.to_period('M').unique() return max(len(unique_months) - 1, 0) def split(self, X, y=None, groups=None): - """ - Generate indices to split data into training and test set. + """Generate indices to split data into training and test set. Parameters ---------- - X : DataFrame - Input data with datetime information. - - y : None - Ignored, exists for API compatibility. - - groups : None - Ignored, exists for API compatibility. + X : pd.DataFrame + Training data. + y : array-like, default=None + Always ignored, exists for compatibility. + groups : array-like, default=None + Always ignored, exists for compatibility. Yields ------ - train_indices : ndarray - Indices for training data. - - test_indices : ndarray - Indices for testing data. + train : ndarray + Training set indices. + test : ndarray + Test set indices. """ time_data = self._get_time_data(X) - unique_months = time_data.dt.to_period("M").drop_duplicates().sort_values() + months = time_data.dt.to_period('M') + unique_months = sorted(months.unique()) for i in range(len(unique_months) - 1): - train_month = unique_months.iloc[i] - test_month = unique_months.iloc[i + 1] - - train_mask = time_data.dt.to_period("M") == train_month - test_mask = time_data.dt.to_period("M") == test_month - - train_indices = np.where(train_mask)[0] - test_indices = np.where(test_mask)[0] - - yield train_indices, test_indices + train_mask = months == unique_months[i] + test_mask = months == unique_months[i + 1] + yield np.where(train_mask)[0], np.where(test_mask)[0] def _get_time_data(self, X): - """ - Extract the datetime data from the specified column or index. + """Extract datetime data from DataFrame. Parameters ---------- - X : DataFrame + X : pd.DataFrame Input data. Returns ------- - time_data : Series - Series of datetime values. + pd.Series + Series containing datetime values. Raises ------ ValueError - If the column is not found or is not datetime-like. + If datetime column is not found or is invalid. """ if self.time_col == 'index': if not isinstance(X.index, pd.DatetimeIndex): - raise ValueError("Index must be a DatetimeIndex.") + raise ValueError("Index must be DatetimeIndex when time_col='index'") return pd.Series(X.index) - elif self.time_col in X.columns: - time_data = X[self.time_col] - if not np.issubdtype(time_data.dtype, np.datetime64): - raise ValueError(f"Column '{self.time_col}' must be of datetime type.") - return time_data - else: - raise ValueError(f"Column '{self.time_col}' not found in input data.") + + if self.time_col not in X.columns: + raise ValueError(f"Column {self.time_col} not found in X") + + time_values = X[self.time_col] + if not pd.api.types.is_datetime64_any_dtype(time_values): + raise ValueError(f"Column {self.time_col} must be datetime type") + + return time_values From 7316b0d05a498349d077775076a1e7eb1dbeba28 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 14:13:58 +0100 Subject: [PATCH 20/27] Update sklearn_questions.py --- sklearn_questions.py | 111 +++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 67 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 3c574e9..d09aba1 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -53,22 +53,21 @@ import pandas as pd from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics.pairwise import pairwise_distances -from sklearn.utils.validation import check_X_y, check_array, check_is_fitted +from sklearn.utils.validation import ( + check_X_y, check_array, check_is_fitted, check_consistent_length +) from sklearn.utils.multiclass import check_classification_targets from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import BaseCrossValidator class KNearestNeighbors(BaseEstimator, ClassifierMixin): - """Classifier implementing the k-nearest neighbors algorithm. - - This classifier predicts the target of a test point based on the target - of its nearest neighbor in the training set, using Euclidean distance. + """K-nearest neighbors classifier implementation. Parameters ---------- n_neighbors : int, default=1 - Number of neighbors to consider for prediction. + Number of neighbors to use for classification. Attributes ---------- @@ -78,20 +77,22 @@ class KNearestNeighbors(BaseEstimator, ClassifierMixin): The target values. classes_ : ndarray of shape (n_classes,) The unique classes labels. + n_features_in_ : int + Number of features seen during fit. """ def __init__(self, n_neighbors=1): - """Initialize the KNearestNeighbors classifier. + """Initialize the classifier. Parameters ---------- n_neighbors : int, default=1 - Number of neighbors to use for prediction. + Number of neighbors to use. """ self.n_neighbors = n_neighbors def fit(self, X, y): - """Fit the k-nearest neighbors classifier. + """Fit the model using X as training data and y as target values. Parameters ---------- @@ -105,60 +106,53 @@ def fit(self, X, y): self : object Returns the instance itself. """ - # Input validation using sklearn's check functions - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, ensure_2d=True, allow_nd=False) check_classification_targets(y) - # Store number of features for predict step validation - self.n_features_in_ = X.shape[1] + if self.n_neighbors < 1: + raise ValueError( + f"Expected n_neighbors > 0, got {self.n_neighbors}" + ) - # Encode class labels + self.n_features_in_ = X.shape[1] self.le_ = LabelEncoder() - y = self.le_.fit_transform(y) - - self.X_ = X - self.y_ = y + self.y_ = self.le_.fit_transform(y) self.classes_ = self.le_.classes_ + self.X_ = X return self def predict(self, X): - """Predict class labels for samples in X. + """Predict the class labels for the provided data. Parameters ---------- X : array-like of shape (n_samples, n_features) - The data to predict. + Test samples. Returns ------- y_pred : ndarray of shape (n_samples,) - The predicted class labels. + Class labels for each data sample. """ - check_is_fitted(self) - X = check_array(X) + check_is_fitted(self, ['X_', 'y_', 'classes_']) + X = check_array(X, ensure_2d=True, allow_nd=False) if X.shape[1] != self.n_features_in_: raise ValueError( - f"X has {X.shape[1]} features, but KNearestNeighbors " - f"was trained with {self.n_features_in_} features." + f"X has {X.shape[1]} features, expected {self.n_features_in_}" ) - # Compute distances between test points and training points distances = pairwise_distances(X, self.X_) + neigh_ind = np.argpartition( + distances, min(self.n_neighbors - 1, len(self.y_) - 1), axis=1 + )[:, :self.n_neighbors] - # Find indices of k nearest neighbors - k_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors] - - # Get labels of k nearest neighbors - k_neighbors_labels = self.y_[k_neighbors] - - # Predict by majority voting - y_pred = np.apply_along_axis( - lambda x: np.bincount(x).argmax(), - axis=1, - arr=k_neighbors_labels - ) + neigh_labels = self.y_[neigh_ind] + y_pred = np.array([ + np.bincount(labels).argmax() + for labels in neigh_labels + ]) return self.le_.inverse_transform(y_pred) @@ -175,36 +169,24 @@ def score(self, X, y): Returns ------- score : float - Mean accuracy of predictions. + Mean accuracy of self.predict(X) with respect to y. """ + X = check_array(X, ensure_2d=True, allow_nd=False) + check_consistent_length(X, y) return np.mean(self.predict(X) == y) class MonthlySplit(BaseCrossValidator): """Monthly cross-validation splitter. - Provides train/test indices to split time series data between successive - months. For each split, test indices must be higher than before, and thus - shuffling in cross validator is inappropriate. - Parameters ---------- time_col : str, default='index' - Column name containing datetime values. If 'index', the index is used. - - Examples - -------- - >>> import pandas as pd - >>> dates = pd.date_range('2020-01-01', '2020-03-31', freq='D') - >>> X = pd.DataFrame({'val': range(len(dates))}, index=dates) - >>> cv = MonthlySplit() - >>> for train_idx, test_idx in cv.split(X): - ... print(f"TRAIN:", X.index[train_idx].min(), X.index[train_idx].max()) - ... print(f"TEST:", X.index[test_idx].min(), X.index[test_idx].max()) + Column name containing datetime values. If 'index', uses the index. """ def __init__(self, time_col='index'): - """Initialize the monthly splitter. + """Initialize the splitter. Parameters ---------- @@ -218,7 +200,7 @@ def get_n_splits(self, X=None, y=None, groups=None): Parameters ---------- - X : pd.DataFrame + X : pd.DataFrame, required Training data. y : array-like, default=None Always ignored, exists for compatibility. @@ -231,8 +213,8 @@ def get_n_splits(self, X=None, y=None, groups=None): Returns the number of splitting iterations. """ time_data = self._get_time_data(X) - unique_months = time_data.dt.to_period('M').unique() - return max(len(unique_months) - 1, 0) + n_months = time_data.dt.to_period('M').nunique() + return max(0, n_months - 1) def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. @@ -258,12 +240,12 @@ def split(self, X, y=None, groups=None): unique_months = sorted(months.unique()) for i in range(len(unique_months) - 1): - train_mask = months == unique_months[i] - test_mask = months == unique_months[i + 1] - yield np.where(train_mask)[0], np.where(test_mask)[0] + train_idx = np.where(months == unique_months[i])[0] + test_idx = np.where(months == unique_months[i + 1])[0] + yield train_idx, test_idx def _get_time_data(self, X): - """Extract datetime data from DataFrame. + """Get datetime data from DataFrame. Parameters ---------- @@ -274,11 +256,6 @@ def _get_time_data(self, X): ------- pd.Series Series containing datetime values. - - Raises - ------ - ValueError - If datetime column is not found or is invalid. """ if self.time_col == 'index': if not isinstance(X.index, pd.DatetimeIndex): From b438779f0c9ae01810255642675809f9a8eb565d Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 14:20:03 +0100 Subject: [PATCH 21/27] Update sklearn_questions.py --- sklearn_questions.py | 112 +++++++++++++++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 24 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index d09aba1..46bc276 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -54,7 +54,8 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics.pairwise import pairwise_distances from sklearn.utils.validation import ( - check_X_y, check_array, check_is_fitted, check_consistent_length + check_X_y, check_array, check_is_fitted, _check_sample_weight, + _num_samples ) from sklearn.utils.multiclass import check_classification_targets from sklearn.preprocessing import LabelEncoder @@ -79,6 +80,10 @@ class KNearestNeighbors(BaseEstimator, ClassifierMixin): The unique classes labels. n_features_in_ : int Number of features seen during fit. + _fit_X : ndarray of shape (n_samples, n_features) + Validated training data. + _y : ndarray of shape (n_samples,) + Validated target values. """ def __init__(self, n_neighbors=1): @@ -96,29 +101,54 @@ def fit(self, X, y): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. y : array-like of shape (n_samples,) Target values. Returns ------- - self : object - Returns the instance itself. + self : KNearestNeighbors + The fitted classifier. """ - X, y = check_X_y(X, y, ensure_2d=True, allow_nd=False) - check_classification_targets(y) + # Input validation + X, y = check_X_y( + X, y, + ensure_2d=True, + allow_nd=False, + dtype=[np.float64, np.float32], + force_all_finite=True + ) + + # Check that X and y have correct shape + if X.shape[0] != y.shape[0]: + raise ValueError( + f"Found input variables with inconsistent numbers of samples: " + f"{[X.shape[0], y.shape[0]]}" + ) + # Validate n_neighbors if self.n_neighbors < 1: raise ValueError( f"Expected n_neighbors > 0, got {self.n_neighbors}" ) + n_samples = _num_samples(X) + if self.n_neighbors > n_samples: + raise ValueError( + f"Expected n_neighbors <= n_samples, got n_neighbors = " + f"{self.n_neighbors}, n_samples = {n_samples}" + ) - self.n_features_in_ = X.shape[1] - self.le_ = LabelEncoder() - self.y_ = self.le_.fit_transform(y) - self.classes_ = self.le_.classes_ + check_classification_targets(y) + + self._fit_X = X self.X_ = X + self.n_features_in_ = X.shape[1] + + # Encode labels + self._le = LabelEncoder() + self._y = self._le.fit_transform(y) + self.classes_ = self._le.classes_ return self @@ -134,27 +164,53 @@ def predict(self, X): ------- y_pred : ndarray of shape (n_samples,) Class labels for each data sample. - """ - check_is_fitted(self, ['X_', 'y_', 'classes_']) - X = check_array(X, ensure_2d=True, allow_nd=False) + Raises + ------ + ValueError + If the number of features in X doesn't match the training data. + """ + # Check if fit has been called + check_is_fitted( + self, + ["_fit_X", "_y", "n_features_in_", "classes_"] + ) + + # Input validation + X = check_array( + X, + accept_sparse=False, + dtype=np.float64, + order="C", + ensure_2d=True, + force_all_finite=True + ) + + # Check feature size consistency if X.shape[1] != self.n_features_in_: raise ValueError( - f"X has {X.shape[1]} features, expected {self.n_features_in_}" + f"X has {X.shape[1]} features, but this " + f"KNearestNeighbors is expecting {self.n_features_in_} features" ) - distances = pairwise_distances(X, self.X_) + # Compute distances and find nearest neighbors + distances = pairwise_distances(X, self._fit_X) neigh_ind = np.argpartition( - distances, min(self.n_neighbors - 1, len(self.y_) - 1), axis=1 + distances, + min(self.n_neighbors - 1, len(self._y) - 1), + axis=1 )[:, :self.n_neighbors] - neigh_labels = self.y_[neigh_ind] - y_pred = np.array([ - np.bincount(labels).argmax() - for labels in neigh_labels - ]) + # Get labels of nearest neighbors + neigh_labels = self._y[neigh_ind] + + # Predict by majority voting + y_pred = np.zeros(X.shape[0], dtype=self._y.dtype) + for i in range(X.shape[0]): + counts = np.bincount(neigh_labels[i]) + y_pred[i] = counts.argmax() - return self.le_.inverse_transform(y_pred) + return self._le.inverse_transform(y_pred) def score(self, X, y): """Return the mean accuracy on the given test data and labels. @@ -171,8 +227,16 @@ def score(self, X, y): score : float Mean accuracy of self.predict(X) with respect to y. """ - X = check_array(X, ensure_2d=True, allow_nd=False) - check_consistent_length(X, y) + # Check that X and y have correct shape + X = check_array(X, accept_sparse=False, ensure_2d=True) + y = check_array(y, ensure_2d=False, ensure_min_samples=0) + + if X.shape[0] != y.shape[0]: + raise ValueError( + f"Found input variables with inconsistent numbers of samples: " + f"{[X.shape[0], y.shape[0]]}" + ) + return np.mean(self.predict(X) == y) From b344a4eae1025433328f1852ff189270c2d091c5 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 14:56:14 +0100 Subject: [PATCH 22/27] Update sklearn_questions.py From 884dfad121c14d5a1f06dc84d76f08a066c08d21 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 15:07:29 +0100 Subject: [PATCH 23/27] Update sklearn_questions.py --- sklearn_questions.py | 274 ++++++------------------------------------- 1 file changed, 33 insertions(+), 241 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 46bc276..73b7be9 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -63,274 +63,66 @@ class KNearestNeighbors(BaseEstimator, ClassifierMixin): - """K-nearest neighbors classifier implementation. - - Parameters - ---------- - n_neighbors : int, default=1 - Number of neighbors to use for classification. - - Attributes - ---------- - X_ : ndarray of shape (n_samples, n_features) - The input samples. - y_ : ndarray of shape (n_samples,) - The target values. - classes_ : ndarray of shape (n_classes,) - The unique classes labels. - n_features_in_ : int - Number of features seen during fit. - _fit_X : ndarray of shape (n_samples, n_features) - Validated training data. - _y : ndarray of shape (n_samples,) - Validated target values. - """ - def __init__(self, n_neighbors=1): - """Initialize the classifier. - - Parameters - ---------- - n_neighbors : int, default=1 - Number of neighbors to use. - """ self.n_neighbors = n_neighbors def fit(self, X, y): - """Fit the model using X as training data and y as target values. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training data. - y : array-like of shape (n_samples,) - Target values. - - Returns - ------- - self : KNearestNeighbors - The fitted classifier. - """ - # Input validation - X, y = check_X_y( - X, y, - ensure_2d=True, - allow_nd=False, - dtype=[np.float64, np.float32], - force_all_finite=True - ) - - # Check that X and y have correct shape - if X.shape[0] != y.shape[0]: - raise ValueError( - f"Found input variables with inconsistent numbers of samples: " - f"{[X.shape[0], y.shape[0]]}" - ) - - # Validate n_neighbors - if self.n_neighbors < 1: - raise ValueError( - f"Expected n_neighbors > 0, got {self.n_neighbors}" - ) - n_samples = _num_samples(X) - if self.n_neighbors > n_samples: - raise ValueError( - f"Expected n_neighbors <= n_samples, got n_neighbors = " - f"{self.n_neighbors}, n_samples = {n_samples}" - ) - - check_classification_targets(y) - - self._fit_X = X + X, y = check_X_y(X, y, ensure_min_samples=1) + self.classes_ = np.unique(y) self.X_ = X + self.y_ = y self.n_features_in_ = X.shape[1] - - # Encode labels - self._le = LabelEncoder() - self._y = self._le.fit_transform(y) - self.classes_ = self._le.classes_ - return self def predict(self, X): - """Predict the class labels for the provided data. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Test samples. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Class labels for each data sample. - - Raises - ------ - ValueError - If the number of features in X doesn't match the training data. - """ - # Check if fit has been called - check_is_fitted( - self, - ["_fit_X", "_y", "n_features_in_", "classes_"] - ) - - # Input validation - X = check_array( - X, - accept_sparse=False, - dtype=np.float64, - order="C", - ensure_2d=True, - force_all_finite=True - ) - - # Check feature size consistency + check_is_fitted(self) + X = check_array(X) + if X.shape[1] != self.n_features_in_: - raise ValueError( - f"X has {X.shape[1]} features, but this " - f"KNearestNeighbors is expecting {self.n_features_in_} features" - ) - - # Compute distances and find nearest neighbors - distances = pairwise_distances(X, self._fit_X) - neigh_ind = np.argpartition( - distances, - min(self.n_neighbors - 1, len(self._y) - 1), - axis=1 - )[:, :self.n_neighbors] - - # Get labels of nearest neighbors - neigh_labels = self._y[neigh_ind] - - # Predict by majority voting - y_pred = np.zeros(X.shape[0], dtype=self._y.dtype) - for i in range(X.shape[0]): - counts = np.bincount(neigh_labels[i]) - y_pred[i] = counts.argmax() - - return self._le.inverse_transform(y_pred) - - def score(self, X, y): - """Return the mean accuracy on the given test data and labels. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Test samples. - y : array-like of shape (n_samples,) - True labels for X. - - Returns - ------- - score : float - Mean accuracy of self.predict(X) with respect to y. - """ - # Check that X and y have correct shape - X = check_array(X, accept_sparse=False, ensure_2d=True) - y = check_array(y, ensure_2d=False, ensure_min_samples=0) - - if X.shape[0] != y.shape[0]: - raise ValueError( - f"Found input variables with inconsistent numbers of samples: " - f"{[X.shape[0], y.shape[0]]}" - ) - - return np.mean(self.predict(X) == y) - + raise ValueError(f"X has {X.shape[1]} features, expected {self.n_features_in_}") + + distances = pairwise_distances(X, self.X_) + nearest_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors] + + predictions = [] + for neighbors in nearest_neighbors: + neighbor_labels = self.y_[neighbors] + most_common = np.bincount(neighbor_labels).argmax() + predictions.append(most_common) + + return np.array(predictions) class MonthlySplit(BaseCrossValidator): - """Monthly cross-validation splitter. - - Parameters - ---------- - time_col : str, default='index' - Column name containing datetime values. If 'index', uses the index. - """ - def __init__(self, time_col='index'): - """Initialize the splitter. - - Parameters - ---------- - time_col : str, default='index' - Column containing datetime values or 'index'. - """ self.time_col = time_col - def get_n_splits(self, X=None, y=None, groups=None): - """Return the number of splitting iterations. - - Parameters - ---------- - X : pd.DataFrame, required - Training data. - y : array-like, default=None - Always ignored, exists for compatibility. - groups : array-like, default=None - Always ignored, exists for compatibility. - - Returns - ------- - n_splits : int - Returns the number of splitting iterations. - """ - time_data = self._get_time_data(X) - n_months = time_data.dt.to_period('M').nunique() - return max(0, n_months - 1) - def split(self, X, y=None, groups=None): - """Generate indices to split data into training and test set. - - Parameters - ---------- - X : pd.DataFrame - Training data. - y : array-like, default=None - Always ignored, exists for compatibility. - groups : array-like, default=None - Always ignored, exists for compatibility. - - Yields - ------ - train : ndarray - Training set indices. - test : ndarray - Test set indices. - """ - time_data = self._get_time_data(X) - months = time_data.dt.to_period('M') + times = self._get_time_data(X) + months = times.dt.to_period('M') unique_months = sorted(months.unique()) for i in range(len(unique_months) - 1): - train_idx = np.where(months == unique_months[i])[0] - test_idx = np.where(months == unique_months[i + 1])[0] - yield train_idx, test_idx - - def _get_time_data(self, X): - """Get datetime data from DataFrame. + train_mask = months == unique_months[i] + test_mask = months == unique_months[i + 1] + yield np.where(train_mask)[0], np.where(test_mask)[0] - Parameters - ---------- - X : pd.DataFrame - Input data. + def get_n_splits(self, X=None, y=None, groups=None): + if X is None: + raise ValueError("X cannot be None") + times = self._get_time_data(X) + return times.dt.to_period('M').nunique() - 1 - Returns - ------- - pd.Series - Series containing datetime values. - """ + def _get_time_data(self, X): if self.time_col == 'index': if not isinstance(X.index, pd.DatetimeIndex): raise ValueError("Index must be DatetimeIndex when time_col='index'") return pd.Series(X.index) - + if self.time_col not in X.columns: - raise ValueError(f"Column {self.time_col} not found in X") - + raise ValueError(f"Column {self.time_col} not found") + time_values = X[self.time_col] if not pd.api.types.is_datetime64_any_dtype(time_values): raise ValueError(f"Column {self.time_col} must be datetime type") - + return time_values From 3de01d6a3be8a124aa022228f7f07161cd2babf3 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 15:16:40 +0100 Subject: [PATCH 24/27] Update sklearn_questions.py --- sklearn_questions.py | 48 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 73b7be9..148eedb 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -53,12 +53,7 @@ import pandas as pd from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics.pairwise import pairwise_distances -from sklearn.utils.validation import ( - check_X_y, check_array, check_is_fitted, _check_sample_weight, - _num_samples -) -from sklearn.utils.multiclass import check_classification_targets -from sklearn.preprocessing import LabelEncoder +from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.model_selection import BaseCrossValidator @@ -67,40 +62,45 @@ def __init__(self, n_neighbors=1): self.n_neighbors = n_neighbors def fit(self, X, y): - X, y = check_X_y(X, y, ensure_min_samples=1) - self.classes_ = np.unique(y) + # Validation stricte des entrées + X, y = check_X_y(X, y, ensure_min_samples=2) + self._validate_params() + self.X_ = X self.y_ = y + self.classes_ = np.unique(y) self.n_features_in_ = X.shape[1] + return self + def _validate_params(self): + if not isinstance(self.n_neighbors, int) or self.n_neighbors < 1: + raise ValueError("n_neighbors must be a positive integer") + def predict(self, X): check_is_fitted(self) - X = check_array(X) + X = check_array(X, ensure_2d=True) if X.shape[1] != self.n_features_in_: raise ValueError(f"X has {X.shape[1]} features, expected {self.n_features_in_}") distances = pairwise_distances(X, self.X_) - nearest_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors] + neigh_idx = np.argpartition(distances, self.n_neighbors-1, axis=1)[:, :self.n_neighbors] - predictions = [] - for neighbors in nearest_neighbors: - neighbor_labels = self.y_[neighbors] - most_common = np.bincount(neighbor_labels).argmax() - predictions.append(most_common) - - return np.array(predictions) + return np.array([ + np.bincount(self.y_[idx]).argmax() + for idx in neigh_idx + ]) class MonthlySplit(BaseCrossValidator): def __init__(self, time_col='index'): self.time_col = time_col def split(self, X, y=None, groups=None): - times = self._get_time_data(X) - months = times.dt.to_period('M') + time_data = self._get_time_data(X) + months = time_data.dt.to_period('M') unique_months = sorted(months.unique()) - + for i in range(len(unique_months) - 1): train_mask = months == unique_months[i] test_mask = months == unique_months[i + 1] @@ -109,8 +109,8 @@ def split(self, X, y=None, groups=None): def get_n_splits(self, X=None, y=None, groups=None): if X is None: raise ValueError("X cannot be None") - times = self._get_time_data(X) - return times.dt.to_period('M').nunique() - 1 + time_data = self._get_time_data(X) + return time_data.dt.to_period('M').nunique() - 1 def _get_time_data(self, X): if self.time_col == 'index': @@ -120,9 +120,9 @@ def _get_time_data(self, X): if self.time_col not in X.columns: raise ValueError(f"Column {self.time_col} not found") - + time_values = X[self.time_col] if not pd.api.types.is_datetime64_any_dtype(time_values): raise ValueError(f"Column {self.time_col} must be datetime type") - + return time_values From 8cd1bf5db1eba94a6ac0ab5857ca8ac6aceaaec3 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 15:34:50 +0100 Subject: [PATCH 25/27] Update sklearn_questions.py --- sklearn_questions.py | 153 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 134 insertions(+), 19 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 148eedb..d1ca77c 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -49,23 +49,64 @@ """ +"""Implementation of KNN classifier and monthly split cross-validator.""" + + import numpy as np import pandas as pd from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.metrics.pairwise import pairwise_distances -from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.model_selection import BaseCrossValidator +from sklearn.utils.validation import ( + check_X_y, + check_array, + check_is_fitted +) class KNearestNeighbors(BaseEstimator, ClassifierMixin): + """K-nearest neighbors classifier. + + Parameters + ---------- + n_neighbors : int, default=1 + Number of neighbors to use for classification. + + Attributes + ---------- + X_ : ndarray of shape (n_samples, n_features) + Training data. + y_ : ndarray of shape (n_samples,) + Target values. + classes_ : ndarray + Unique classes in the training data. + n_features_in_ : int + Number of features seen during fit. + """ + def __init__(self, n_neighbors=1): + """Initialize the classifier.""" self.n_neighbors = n_neighbors def fit(self, X, y): - # Validation stricte des entrées - X, y = check_X_y(X, y, ensure_min_samples=2) - self._validate_params() + """Fit the model using X as training data and y as target values. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : object + Returns self. + """ + X, y = check_X_y(X, y) + if self.n_neighbors < 1: + raise ValueError(f"n_neighbors must be >= 1. Got {self.n_neighbors}") + self.X_ = X self.y_ = y self.classes_ = np.unique(y) @@ -73,30 +114,71 @@ def fit(self, X, y): return self - def _validate_params(self): - if not isinstance(self.n_neighbors, int) or self.n_neighbors < 1: - raise ValueError("n_neighbors must be a positive integer") - def predict(self, X): + """Predict the class labels for the provided data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Class labels for each data sample. + """ check_is_fitted(self) - X = check_array(X, ensure_2d=True) + X = check_array(X) if X.shape[1] != self.n_features_in_: - raise ValueError(f"X has {X.shape[1]} features, expected {self.n_features_in_}") - - distances = pairwise_distances(X, self.X_) - neigh_idx = np.argpartition(distances, self.n_neighbors-1, axis=1)[:, :self.n_neighbors] + raise ValueError( + f"X has {X.shape[1]} features, but KNearestNeighbors " + f"is expecting {self.n_features_in_} features" + ) + + distances = ((X[:, np.newaxis, :] - self.X_) ** 2).sum(axis=2) + indices = np.argpartition(distances, self.n_neighbors-1)[:, :self.n_neighbors] - return np.array([ + predictions = np.array([ np.bincount(self.y_[idx]).argmax() - for idx in neigh_idx + for idx in indices ]) + + return predictions + class MonthlySplit(BaseCrossValidator): + """Monthly cross-validation splitter. + + Parameters + ---------- + time_col : str, default='index' + Column name containing datetime values. + """ + def __init__(self, time_col='index'): + """Initialize the splitter.""" self.time_col = time_col def split(self, X, y=None, groups=None): + """Generate indices to split data into training and validation set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + groups : array-like + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + Training set indices. + test : ndarray + Test set indices. + """ time_data = self._get_time_data(X) months = time_data.dt.to_period('M') unique_months = sorted(months.unique()) @@ -107,22 +189,55 @@ def split(self, X, y=None, groups=None): yield np.where(train_mask)[0], np.where(test_mask)[0] def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + groups : array-like + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations. + """ if X is None: raise ValueError("X cannot be None") time_data = self._get_time_data(X) return time_data.dt.to_period('M').nunique() - 1 def _get_time_data(self, X): + """Extract datetime data from DataFrame. + + Parameters + ---------- + X : DataFrame + Input data. + + Returns + ------- + pd.Series + Series containing datetime values. + + Raises + ------ + ValueError + If time column is not found or not datetime type. + """ if self.time_col == 'index': if not isinstance(X.index, pd.DatetimeIndex): raise ValueError("Index must be DatetimeIndex when time_col='index'") return pd.Series(X.index) - + if self.time_col not in X.columns: raise ValueError(f"Column {self.time_col} not found") - + time_values = X[self.time_col] if not pd.api.types.is_datetime64_any_dtype(time_values): raise ValueError(f"Column {self.time_col} must be datetime type") - + return time_values From de0b3cc15106341c74beb92f750d9202127dc8b3 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 15:51:11 +0100 Subject: [PATCH 26/27] Update sklearn_questions.py --- sklearn_questions.py | 208 +++++++++++++++++++------------------------ 1 file changed, 92 insertions(+), 116 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index d1ca77c..991f040 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -56,140 +56,106 @@ import pandas as pd from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.model_selection import BaseCrossValidator -from sklearn.utils.validation import ( - check_X_y, - check_array, - check_is_fitted -) +from sklearn.utils.validation import check_X_y, check_is_fitted, check_array +from sklearn.utils.multiclass import check_classification_targets +from sklearn.metrics.pairwise import pairwise_distances class KNearestNeighbors(BaseEstimator, ClassifierMixin): - """K-nearest neighbors classifier. - - Parameters - ---------- - n_neighbors : int, default=1 - Number of neighbors to use for classification. - - Attributes - ---------- - X_ : ndarray of shape (n_samples, n_features) - Training data. - y_ : ndarray of shape (n_samples,) - Target values. - classes_ : ndarray - Unique classes in the training data. - n_features_in_ : int - Number of features seen during fit. - """ + """KNearestNeighbors classifier.""" def __init__(self, n_neighbors=1): - """Initialize the classifier.""" + """Initialize the classifier with the number of neighbors.""" self.n_neighbors = n_neighbors def fit(self, X, y): - """Fit the model using X as training data and y as target values. + """ + Fit the model using X as training data and y as target values. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Training data. - y : array-like of shape (n_samples,) + y : ndarray of shape (n_samples,) Target values. Returns ------- self : object - Returns self. + Fitted estimator. """ X, y = check_X_y(X, y) - - if self.n_neighbors < 1: - raise ValueError(f"n_neighbors must be >= 1. Got {self.n_neighbors}") - + check_classification_targets(y) self.X_ = X self.y_ = y self.classes_ = np.unique(y) - self.n_features_in_ = X.shape[1] - + self.is_fitted_ = True return self def predict(self, X): - """Predict the class labels for the provided data. + """ + Predict the class labels for the provided data. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Test samples. Returns ------- - y_pred : ndarray of shape (n_samples,) - Class labels for each data sample. + y : ndarray of shape (n_samples,) + Predicted class labels. """ check_is_fitted(self) X = check_array(X) - - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"X has {X.shape[1]} features, but KNearestNeighbors " - f"is expecting {self.n_features_in_} features" - ) - - distances = ((X[:, np.newaxis, :] - self.X_) ** 2).sum(axis=2) - indices = np.argpartition(distances, self.n_neighbors-1)[:, :self.n_neighbors] - - predictions = np.array([ - np.bincount(self.y_[idx]).argmax() - for idx in indices - ]) - - return predictions + distances = pairwise_distances(X, self.X_) + k_nearest = np.argsort(distances, axis=1)[:, :self.n_neighbors] + y_pred = np.array([np.bincount(self.y_[neighbors]).argmax() for neighbors in k_nearest]) + return y_pred + + def score(self, X, y): + """ + Return the mean accuracy on the given test data and labels. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Test samples. + y : ndarray of shape (n_samples,) + True labels for X. + + Returns + ------- + score : float + Mean accuracy of self.predict(X) wrt. y. + """ + return np.mean(self.predict(X) == y) class MonthlySplit(BaseCrossValidator): - """Monthly cross-validation splitter. + """ + Cross-validator based on monthly split. + + Split data based on the given `time_col` (or default to index). Each split + corresponds to one month of data for the training and the next month of + data for the test. Parameters ---------- time_col : str, default='index' - Column name containing datetime values. + Column of the input DataFrame that will be used to split the data. This + column should be of type datetime. If split is called with a DataFrame + for which this column is not a datetime, it will raise a ValueError. + To use the index as column just set `time_col` to `'index'`. """ def __init__(self, time_col='index'): - """Initialize the splitter.""" + """Initialize the cross-validator with the time column.""" self.time_col = time_col - def split(self, X, y=None, groups=None): - """Generate indices to split data into training and validation set. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Training data. - y : array-like of shape (n_samples,) - Always ignored, exists for compatibility. - groups : array-like - Always ignored, exists for compatibility. - - Yields - ------ - train : ndarray - Training set indices. - test : ndarray - Test set indices. + def get_n_splits(self, X, y=None, groups=None): """ - time_data = self._get_time_data(X) - months = time_data.dt.to_period('M') - unique_months = sorted(months.unique()) - - for i in range(len(unique_months) - 1): - train_mask = months == unique_months[i] - test_mask = months == unique_months[i + 1] - yield np.where(train_mask)[0], np.where(test_mask)[0] - - def get_n_splits(self, X=None, y=None, groups=None): - """Returns the number of splitting iterations. + Return the number of splitting iterations in the cross-validator. Parameters ---------- @@ -197,47 +163,57 @@ def get_n_splits(self, X=None, y=None, groups=None): Training data. y : array-like of shape (n_samples,) Always ignored, exists for compatibility. - groups : array-like + groups : array-like of shape (n_samples,) Always ignored, exists for compatibility. Returns ------- n_splits : int - Returns the number of splitting iterations. + The number of splits. """ - if X is None: - raise ValueError("X cannot be None") - time_data = self._get_time_data(X) - return time_data.dt.to_period('M').nunique() - 1 + dates = X.index if self.time_col == 'index' else X[self.time_col] + dates = pd.to_datetime(dates) + return len(pd.unique(dates.to_period('M'))) - 1 - def _get_time_data(self, X): - """Extract datetime data from DataFrame. + def split(self, X, y=None, groups=None): + """ + Generate indices to split data into training and test set. Parameters ---------- - X : DataFrame - Input data. - - Returns - ------- - pd.Series - Series containing datetime values. + X : array-like of shape (n_samples, n_features) + Training data. + y : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + groups : array-like of shape (n_samples,) + Always ignored, exists for compatibility. - Raises + Yields ------ - ValueError - If time column is not found or not datetime type. + idx_train : ndarray + The training set indices for that split. + idx_test : ndarray + The testing set indices for that split. """ if self.time_col == 'index': - if not isinstance(X.index, pd.DatetimeIndex): - raise ValueError("Index must be DatetimeIndex when time_col='index'") - return pd.Series(X.index) - - if self.time_col not in X.columns: - raise ValueError(f"Column {self.time_col} not found") - - time_values = X[self.time_col] - if not pd.api.types.is_datetime64_any_dtype(time_values): - raise ValueError(f"Column {self.time_col} must be datetime type") - - return time_values + dates = X.index + else: + if not pd.api.types.is_datetime64_any_dtype(X[self.time_col]): + raise ValueError('time_col must be of type datetime') + dates = X[self.time_col] + + dates = pd.to_datetime(dates) + periods = dates.to_period('M') + unique_periods = periods.unique() + + for i in range(len(unique_periods) - 1): + train_period = unique_periods[i] + test_period = unique_periods[i + 1] + + train_mask = periods == train_period + test_mask = periods == test_period + + yield ( + np.where(train_mask)[0], + np.where(test_mask)[0] + ) From 1be4c2d74c6ea685291f749980df9aac752dac78 Mon Sep 17 00:00:00 2001 From: docloukman <165569528+docloukman@users.noreply.github.com> Date: Sun, 22 Dec 2024 15:53:13 +0100 Subject: [PATCH 27/27] Update sklearn_questions.py --- sklearn_questions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn_questions.py b/sklearn_questions.py index 991f040..e9ecdb6 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -89,6 +89,7 @@ def fit(self, X, y): self.X_ = X self.y_ = y self.classes_ = np.unique(y) + self.n_features_in_ = X.shape[1] self.is_fitted_ = True return self