diff --git a/sklearn_questions.py b/sklearn_questions.py
index fa02e0d..e9ecdb6 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -1,4 +1,5 @@
-"""Assignment - making a sklearn estimator and cv splitter.
+"""
+Assignment - making a sklearn estimator and CV splitter.
 
 The goal of this assignment is to implement by yourself:
 
@@ -11,7 +12,7 @@
 The nearest neighbor classifier predicts for a point X_i the target y_k of
 the training sample X_k which is the closest to X_i. We measure proximity with
 the Euclidean distance. The model will be evaluated with the accuracy (average
-number of samples corectly classified). You need to implement the `fit`,
+number of samples correctly classified). You need to implement the `fit`,
 `predict` and `score` methods for this class. The code you write should pass
 the test we implemented. You can run the tests by calling at the root of the
 repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a
@@ -21,17 +22,16 @@
 https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
 Make sure to use them to pass `test_nearest_neighbor_check_estimator`.
 
-
 Detailed instructions for question 2:
 The data to split should contain the index or one column in
-datatime format. Then the aim is to split the data between train and test
+datetime format. Then the aim is to split the data between train and test
 sets when for each pair of successive months, we learn on the first and
-predict of the following. For example if you have data distributed from
-november 2020 to march 2021, you have have 4 splits. The first split
-will allow to learn on november data and predict on december data, the
-second split to learn december and predict on january etc.
+predict on the following. For example if you have data distributed from
+November 2020 to March 2021, you have 4 splits. The first split
+will allow to learn on November data and predict on December data, the
+second split to learn December and predict on January etc.
 
-We also ask you to respect the pep8 convention: https://pep8.org. This will be
+We also ask you to respect the PEP8 convention: https://pep8.org. This will be
 enforced with `flake8`. You can check that there is no flake8 errors by
 calling `flake8` at the root of the repo.
 
@@ -47,16 +47,16 @@
 to compute distances between 2 sets of samples.
 """
-import numpy as np
-import pandas as pd
-from sklearn.base import BaseEstimator
-from sklearn.base import ClassifierMixin
-from sklearn.model_selection import BaseCrossValidator
+"""Implementation of KNN classifier and monthly split cross-validator."""
+
 
-from sklearn.utils.validation import check_X_y, check_is_fitted
-from sklearn.utils.validation import check_array
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.model_selection import BaseCrossValidator
+from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.metrics.pairwise import pairwise_distances
 
@@ -64,62 +64,78 @@ class KNearestNeighbors(BaseEstimator, ClassifierMixin):
     """KNearestNeighbors classifier."""
 
-    def __init__(self, n_neighbors=1):  # noqa: D107
+    def __init__(self, n_neighbors=1):
+        """Initialize the classifier with the number of neighbors."""
         self.n_neighbors = n_neighbors
 
     def fit(self, X, y):
-        """Fitting function.
+        """
+        Fit the model using X as training data and y as target values.
 
         Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
-            Data to train the model.
-        y : ndarray, shape (n_samples,)
-            Labels associated with the training data.
+        X : ndarray of shape (n_samples, n_features)
+            Training data.
+        y : ndarray of shape (n_samples,)
+            Target values.
 
         Returns
-        ----------
-        self : instance of KNearestNeighbors
-            The current instance of the classifier
+        -------
+        self : object
+            Fitted estimator.
         """
+        X, y = check_X_y(X, y)
+        check_classification_targets(y)
+        self.X_ = X
+        self.y_ = y
+        self.classes_ = np.unique(y)
+        self.n_features_in_ = X.shape[1]
+        self.is_fitted_ = True
         return self
 
     def predict(self, X):
-        """Predict function.
+        """
+        Predict the class labels for the provided data.
 
         Parameters
         ----------
-        X : ndarray, shape (n_test_samples, n_features)
-            Data to predict on.
+        X : ndarray of shape (n_samples, n_features)
+            Test samples.
 
         Returns
-        ----------
-        y : ndarray, shape (n_test_samples,)
-            Predicted class labels for each test data sample.
+        -------
+        y : ndarray of shape (n_samples,)
+            Predicted class labels.
         """
-        y_pred = np.zeros(X.shape[0])
+        check_is_fitted(self)
+        X = check_array(X)
+        distances = pairwise_distances(X, self.X_)
+        k_nearest = np.argsort(distances, axis=1)[:, :self.n_neighbors]
+        # Map labels to positions in self.classes_ so the bincount-based
+        # majority vote also works with non-integer class labels.
+        y_indices = np.searchsorted(self.classes_, self.y_)
+        votes = [np.bincount(y_indices[neighbors]).argmax()
+                 for neighbors in k_nearest]
+        y_pred = self.classes_[votes]
         return y_pred
 
     def score(self, X, y):
-        """Calculate the score of the prediction.
+        """
+        Return the mean accuracy on the given test data and labels.
 
         Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
-            Data to score on.
-        y : ndarray, shape (n_samples,)
-            target values.
+        X : ndarray of shape (n_samples, n_features)
+            Test samples.
+        y : ndarray of shape (n_samples,)
+            True labels for X.
 
         Returns
-        ----------
+        -------
         score : float
-            Accuracy of the model computed for the (X, y) pairs.
+            Mean accuracy of self.predict(X) wrt. y.
         """
-        return 0.
+        return np.mean(self.predict(X) == y)
 
 
 class MonthlySplit(BaseCrossValidator):
-    """CrossValidator based on monthly split.
+    """
+    Cross-validator based on monthly split.
 
     Split data based on the given `time_col` (or default to index). Each split
     corresponds to one month of data for the training and the next month of
@@ -127,24 +143,25 @@
 
     Parameters
     ----------
-    time_col : str, defaults to 'index'
+    time_col : str, default='index'
         Column of the input DataFrame that will be used to split the data. This
         column should be of type datetime. If split is called with a DataFrame
         for which this column is not a datetime, it will raise a ValueError.
         To use the index as column just set `time_col` to `'index'`.
     """
 
-    def __init__(self, time_col='index'):  # noqa: D107
+    def __init__(self, time_col='index'):
+        """Initialize the cross-validator with the time column."""
         self.time_col = time_col
 
     def get_n_splits(self, X, y=None, groups=None):
-        """Return the number of splitting iterations in the cross-validator.
+        """
+        Return the number of splitting iterations in the cross-validator.
 
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
-            Training data, where `n_samples` is the number of samples
-            and `n_features` is the number of features.
+            Training data.
         y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
@@ -155,16 +172,18 @@
         n_splits : int
             The number of splits.
""" - return 0 + dates = X.index if self.time_col == 'index' else X[self.time_col] + dates = pd.to_datetime(dates) + return len(pd.unique(dates.to_period('M'))) - 1 - def split(self, X, y, groups=None): - """Generate indices to split data into training and test set. + def split(self, X, y=None, groups=None): + """ + Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) - Training data, where `n_samples` is the number of samples - and `n_features` is the number of features. + Training data. y : array-like of shape (n_samples,) Always ignored, exists for compatibility. groups : array-like of shape (n_samples,) @@ -177,12 +196,25 @@ def split(self, X, y, groups=None): idx_test : ndarray The testing set indices for that split. """ + if self.time_col == 'index': + dates = X.index + else: + if not pd.api.types.is_datetime64_any_dtype(X[self.time_col]): + raise ValueError('time_col must be of type datetime') + dates = X[self.time_col] + + dates = pd.to_datetime(dates) + periods = dates.to_period('M') + unique_periods = periods.unique() + + for i in range(len(unique_periods) - 1): + train_period = unique_periods[i] + test_period = unique_periods[i + 1] + + train_mask = periods == train_period + test_mask = periods == test_period - n_samples = X.shape[0] - n_splits = self.get_n_splits(X, y, groups) - for i in range(n_splits): - idx_train = range(n_samples) - idx_test = range(n_samples) yield ( - idx_train, idx_test + np.where(train_mask)[0], + np.where(test_mask)[0] )