From 5f65fd36d5b7a4a5780113df935b0cb72ddd5443 Mon Sep 17 00:00:00 2001 From: elhcs Date: Fri, 20 Dec 2024 17:51:43 +0100 Subject: [PATCH] UP my solution --- sklearn_questions.py | 88 ++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index fa02e0d..3ea2b2e 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -1,12 +1,9 @@ """Assignment - making a sklearn estimator and cv splitter. - The goal of this assignment is to implement by yourself: - - a scikit-learn estimator for the KNearestNeighbors for classification tasks and check that it is working properly. - a scikit-learn CV splitter where the splits are based on a Pandas DateTimeIndex. - Detailed instructions for question 1: The nearest neighbor classifier predicts for a point X_i the target y_k of the training sample X_k which is the closest to X_i. We measure proximity with @@ -20,8 +17,6 @@ You can find more information on how they should be used in the following doc: https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator. Make sure to use them to pass `test_nearest_neighbor_check_estimator`. - - Detailed instructions for question 2: The data to split should contain the index or one column in datatime format. Then the aim is to split the data between train and test @@ -30,101 +25,113 @@ november 2020 to march 2021, you have have 4 splits. The first split will allow to learn on november data and predict on december data, the second split to learn december and predict on january etc. - We also ask you to respect the pep8 convention: https://pep8.org. This will be enforced with `flake8`. You can check that there is no flake8 errors by calling `flake8` at the root of the repo. - Finally, you need to write docstrings for the methods you code and for the class. The docstring will be checked using `pydocstyle` that you can also call at the root of the repo. - Hints ----- - You can use the function: - from sklearn.metrics.pairwise import pairwise_distances - to compute distances between 2 sets of samples. """ +from typing import Counter import numpy as np import pandas as pd from sklearn.base import BaseEstimator from sklearn.base import ClassifierMixin - from sklearn.model_selection import BaseCrossValidator from sklearn.utils.validation import check_X_y, check_is_fitted -from sklearn.utils.validation import check_array +from sklearn.utils.validation import validate_data from sklearn.utils.multiclass import check_classification_targets from sklearn.metrics.pairwise import pairwise_distances -class KNearestNeighbors(BaseEstimator, ClassifierMixin): +class KNearestNeighbors(ClassifierMixin, BaseEstimator, ): """KNearestNeighbors classifier.""" def __init__(self, n_neighbors=1): # noqa: D107 self.n_neighbors = n_neighbors - def fit(self, X, y): """Fitting function. - Parameters ---------- X : ndarray, shape (n_samples, n_features) Data to train the model. y : ndarray, shape (n_samples,) Labels associated with the training data. - Returns ---------- self : instance of KNearestNeighbors The current instance of the classifier """ + X, y = check_X_y(X, y) + check_classification_targets(y) + + X, y = validate_data(self, X, y) + + self._X, self._y = X, y + self.classes_ = np.unique(y) return self def predict(self, X): """Predict function. - Parameters ---------- X : ndarray, shape (n_test_samples, n_features) Data to predict on. - Returns ---------- y : ndarray, shape (n_test_samples,) Predicted class labels for each test data sample. """ - y_pred = np.zeros(X.shape[0]) + check_is_fitted(self) + X = validate_data(self, X, reset=False) + check_is_fitted(self, ['_X', '_y']) + + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"X has {X.shape[1]} features, " + f"but the model was trained with " + f"{self.n_features_in_} features." + ) + distances = pairwise_distances(X, self._X, metric='euclidean') + + nearest_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors] + # get the labels of the nearest neighbors + nearest_labels = self._y[nearest_neighbors] + # get the most common label among the nearest neighbors + y_pred = np.array([Counter(labels).most_common(1)[0][0] + for labels in nearest_labels]) return y_pred def score(self, X, y): """Calculate the score of the prediction. - Parameters ---------- X : ndarray, shape (n_samples, n_features) Data to score on. y : ndarray, shape (n_samples,) target values. - Returns ---------- score : float Accuracy of the model computed for the (X, y) pairs. """ - return 0. + y_pred = self.predict(X) + + return np.mean(y_pred == y) class MonthlySplit(BaseCrossValidator): """CrossValidator based on monthly split. - Split data based on the given `time_col` (or default to index). Each split corresponds to one month of data for the training and the next month of data for the test. - Parameters ---------- time_col : str, defaults to 'index' @@ -133,13 +140,10 @@ class MonthlySplit(BaseCrossValidator): for which this column is not a datetime, it will raise a ValueError. To use the index as column just set `time_col` to `'index'`. """ - def __init__(self, time_col='index'): # noqa: D107 self.time_col = time_col - def get_n_splits(self, X, y=None, groups=None): """Return the number of splitting iterations in the cross-validator. - Parameters ---------- X : array-like of shape (n_samples, n_features) @@ -149,17 +153,23 @@ def get_n_splits(self, X, y=None, groups=None): Always ignored, exists for compatibility. groups : array-like of shape (n_samples,) Always ignored, exists for compatibility. - Returns ------- n_splits : int The number of splits. """ - return 0 + if self.time_col == 'index': + time_column = pd.Series(X.index) + else: + time_column = X[self.time_col] + + if not np.issubdtype(time_column.dtype, np.datetime64): + raise ValueError("The time_col must be of datetime type.") + + return len(time_column.dt.to_period('M').unique()) - 1 def split(self, X, y, groups=None): """Generate indices to split data into training and test set. - Parameters ---------- X : array-like of shape (n_samples, n_features) @@ -169,7 +179,6 @@ def split(self, X, y, groups=None): Always ignored, exists for compatibility. groups : array-like of shape (n_samples,) Always ignored, exists for compatibility. - Yields ------ idx_train : ndarray @@ -177,12 +186,21 @@ def split(self, X, y, groups=None): idx_test : ndarray The testing set indices for that split. """ + if self.time_col == 'index': + time_column = pd.Series(X.index) + else: + time_column = X[self.time_col] - n_samples = X.shape[0] + if not np.issubdtype(time_column.dtype, np.datetime64): + raise ValueError("The time_col must be of datetime type.") + + monthly_periods = time_column.dt.to_period('M') + unique_months = monthly_periods.unique() + unique_months = sorted(unique_months) n_splits = self.get_n_splits(X, y, groups) for i in range(n_splits): - idx_train = range(n_samples) - idx_test = range(n_samples) + idx_train = np.where(monthly_periods == unique_months[i])[0] + idx_test = np.where(monthly_periods == unique_months[i + 1])[0] yield ( idx_train, idx_test - ) + ) \ No newline at end of file