x-datascience-datacamp · docloukman · Dec 21, 2024 · Dec 21, 2024 · Dec 21, 2024 · Dec 21, 2024
diff --git a/sklearn_questions.py b/sklearn_questions.py
@@ -1,4 +1,5 @@
-"""Assignment - making a sklearn estimator and cv splitter.
+"""
+Assignment - making a sklearn estimator and CV splitter.
 
 The goal of this assignment is to implement by yourself:
 
@@ -11,7 +12,7 @@
 The nearest neighbor classifier predicts for a point X_i the target y_k of
 the training sample X_k which is the closest to X_i. We measure proximity with
 the Euclidean distance. The model will be evaluated with the accuracy (average
-number of samples corectly classified). You need to implement the `fit`,
+number of samples correctly classified). You need to implement the `fit`,
 `predict` and `score` methods for this class. The code you write should pass
 the test we implemented. You can run the tests by calling at the root of the
 repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a
@@ -21,17 +22,16 @@
 https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
 Make sure to use them to pass `test_nearest_neighbor_check_estimator`.
 
-
 Detailed instructions for question 2:
 The data to split should contain the index or one column in
-datatime format. Then the aim is to split the data between train and test
+datetime format. Then the aim is to split the data between train and test
 sets when for each pair of successive months, we learn on the first and
-predict of the following. For example if you have data distributed from
-november 2020 to march 2021, you have have 4 splits. The first split
-will allow to learn on november data and predict on december data, the
-second split to learn december and predict on january etc.
+predict on the following. For example if you have data distributed from
+November 2020 to March 2021, you have 4 splits. The first split
+lets you learn on November data and predict on December data; the
+second lets you learn on December and predict on January, etc.
 
-We also ask you to respect the pep8 convention: https://pep8.org. This will be
+We also ask you to respect the PEP8 convention: https://pep8.org. This will be
 enforced with `flake8`. You can check that there is no flake8 errors by
 calling `flake8` at the root of the repo.
 
@@ -47,104 +47,121 @@
 
 to compute distances between 2 sets of samples.
 """
-import numpy as np
-import pandas as pd
 
-from sklearn.base import BaseEstimator
-from sklearn.base import ClassifierMixin
 
-from sklearn.model_selection import BaseCrossValidator
+"""Implementation of KNN classifier and monthly split cross-validator."""
+
 
-from sklearn.utils.validation import check_X_y, check_is_fitted
-from sklearn.utils.validation import check_array
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.model_selection import BaseCrossValidator
+from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.metrics.pairwise import pairwise_distances
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
     """KNearestNeighbors classifier."""
 
-    def __init__(self, n_neighbors=1):  # noqa: D107
+    def __init__(self, n_neighbors=1):
+        """Initialize the classifier with the number of neighbors."""
         self.n_neighbors = n_neighbors
 
     def fit(self, X, y):
-        """Fitting function.
+        """
+        Fit the model using X as training data and y as target values.
 
-         Parameters
+        Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
-            Data to train the model.
-        y : ndarray, shape (n_samples,)
-            Labels associated with the training data.
+        X : ndarray of shape (n_samples, n_features)
+            Training data.
+        y : ndarray of shape (n_samples,)
+            Target values.
 
         Returns
-        ----------
-        self : instance of KNearestNeighbors
-            The current instance of the classifier
+        -------
+        self : object
+            Fitted estimator.
         """
+        X, y = check_X_y(X, y)
+        check_classification_targets(y)
+        self.X_ = X
+        self.y_ = y
+        self.classes_ = np.unique(y)
+        self.n_features_in_ = X.shape[1]
+        self.is_fitted_ = True
         return self
 
     def predict(self, X):
-        """Predict function.
+        """
+        Predict the class labels for the provided data.
 
         Parameters
         ----------
-        X : ndarray, shape (n_test_samples, n_features)
-            Data to predict on.
+        X : ndarray of shape (n_samples, n_features)
+            Test samples.
 
         Returns
-        ----------
-        y : ndarray, shape (n_test_samples,)
-            Predicted class labels for each test data sample.
+        -------
+        y : ndarray of shape (n_samples,)
+            Predicted class labels.
         """
-        y_pred = np.zeros(X.shape[0])
+        check_is_fitted(self)
+        X = check_array(X)
+        distances = pairwise_distances(X, self.X_)
+        k_nearest = np.argsort(distances, axis=1)[:, :self.n_neighbors]
+        y_pred = np.array([np.bincount(self.y_[neighbors]).argmax() for neighbors in k_nearest])
         return y_pred
 
     def score(self, X, y):
-        """Calculate the score of the prediction.
+        """
+        Return the mean accuracy on the given test data and labels.
 
         Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
-            Data to score on.
-        y : ndarray, shape (n_samples,)
-            target values.
+        X : ndarray of shape (n_samples, n_features)
+            Test samples.
+        y : ndarray of shape (n_samples,)
+            True labels for X.
 
         Returns
-        ----------
+        -------
         score : float
-            Accuracy of the model computed for the (X, y) pairs.
+            Mean accuracy of self.predict(X) wrt. y.
         """
-        return 0.
+        return np.mean(self.predict(X) == y)
 
 
 class MonthlySplit(BaseCrossValidator):
-    """CrossValidator based on monthly split.
+    """
+    Cross-validator based on monthly split.
 
     Split data based on the given `time_col` (or default to index). Each split
     corresponds to one month of data for the training and the next month of
     data for the test.
 
     Parameters
     ----------
-    time_col : str, defaults to 'index'
+    time_col : str, default='index'
         Column of the input DataFrame that will be used to split the data. This
         column should be of type datetime. If split is called with a DataFrame
         for which this column is not a datetime, it will raise a ValueError.
         To use the index as column just set `time_col` to `'index'`.
     """
 
-    def __init__(self, time_col='index'):  # noqa: D107
+    def __init__(self, time_col='index'):
+        """Initialize the cross-validator with the time column."""
         self.time_col = time_col
 
     def get_n_splits(self, X, y=None, groups=None):
-        """Return the number of splitting iterations in the cross-validator.
+        """
+        Return the number of splitting iterations in the cross-validator.
 
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
-            Training data, where `n_samples` is the number of samples
-            and `n_features` is the number of features.
+            Training data.
         y : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
@@ -155,16 +172,18 @@ def get_n_splits(self, X, y=None, groups=None):
         n_splits : int
             The number of splits.
         """
-        return 0
+        dates = X.index if self.time_col == 'index' else X[self.time_col]
+        dates = pd.to_datetime(dates)
+        return len(pd.unique(dates.to_period('M'))) - 1
 
-    def split(self, X, y, groups=None):
-        """Generate indices to split data into training and test set.
+    def split(self, X, y=None, groups=None):
+        """
+        Generate indices to split data into training and test set.
 
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
-            Training data, where `n_samples` is the number of samples
-            and `n_features` is the number of features.
+            Training data.
         y : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
@@ -177,12 +196,25 @@ def split(self, X, y, groups=None):
         idx_test : ndarray
             The testing set indices for that split.
         """
+        if self.time_col == 'index':
+            dates = X.index
+        else:
+            if not pd.api.types.is_datetime64_any_dtype(X[self.time_col]):
+                raise ValueError('time_col must be of type datetime')
+            dates = X[self.time_col]
+
+        dates = pd.to_datetime(dates)
+        periods = dates.to_period('M')
+        unique_periods = periods.unique()
+
+        for i in range(len(unique_periods) - 1):
+            train_period = unique_periods[i]
+            test_period = unique_periods[i + 1]
+
+            train_mask = periods == train_period
+            test_mask = periods == test_period
 
-        n_samples = X.shape[0]
-        n_splits = self.get_n_splits(X, y, groups)
-        for i in range(n_splits):
-            idx_train = range(n_samples)
-            idx_test = range(n_samples)
             yield (
-                idx_train, idx_test
+                np.where(train_mask)[0],
+                np.where(test_mask)[0]
             )