UP my solution #153

Open · wants to merge 1 commit into base: main
88 changes: 53 additions & 35 deletions sklearn_questions.py
@@ -1,12 +1,9 @@
"""Assignment - making a sklearn estimator and cv splitter.

The goal of this assignment is to implement by yourself:

- a scikit-learn estimator implementing KNearestNeighbors for classification
  tasks, and check that it is working properly.
- a scikit-learn CV splitter where the splits are based on a Pandas
  DatetimeIndex.

Detailed instructions for question 1:
The nearest neighbor classifier predicts for a point X_i the target y_k of
the training sample X_k which is the closest to X_i. We measure proximity with
@@ -20,8 +17,6 @@
You can find more information on how they should be used in the following doc:
https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
Make sure to use them to pass `test_nearest_neighbor_check_estimator`.


Detailed instructions for question 2:
The data to split should have its index or one of its columns in
datetime format. Then the aim is to split the data between train and test
@@ -30,101 +25,113 @@
November 2020 to March 2021, you would have 4 splits. The first split
will allow learning on November data and predicting on December data, the
second split learning on December and predicting on January, etc.

We also ask you to respect the pep8 convention: https://pep8.org. This will be
enforced with `flake8`. You can check that there are no flake8 errors by
calling `flake8` at the root of the repo.

Finally, you need to write docstrings for the methods you code and for the
class. The docstrings will be checked using `pydocstyle`, which you can also
call at the root of the repo.

Hints
-----
- You can use the function:

from sklearn.metrics.pairwise import pairwise_distances

to compute distances between 2 sets of samples.
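
For instance, `pairwise_distances(X_test, X_train)` returns an array of
shape (n_test_samples, n_train_samples) with the euclidean distance (the
default metric) between each test sample and each training sample.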
"""
from collections import Counter
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin

from sklearn.model_selection import BaseCrossValidator

from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import validate_data
from sklearn.utils.multiclass import check_classification_targets
from sklearn.metrics.pairwise import pairwise_distances


class KNearestNeighbors(ClassifierMixin, BaseEstimator):
"""KNearestNeighbors classifier."""

    def __init__(self, n_neighbors=1):  # noqa: D107
self.n_neighbors = n_neighbors

def fit(self, X, y):
"""Fitting function.

Parameters
----------
X : ndarray, shape (n_samples, n_features)
Data to train the model.
y : ndarray, shape (n_samples,)
Labels associated with the training data.

Returns
        -------
        self : instance of KNearestNeighbors
            The current instance of the classifier.
"""
        X, y = validate_data(self, X, y)
        check_classification_targets(y)

self._X, self._y = X, y
self.classes_ = np.unique(y)
return self

def predict(self, X):
"""Predict function.

Parameters
----------
X : ndarray, shape (n_test_samples, n_features)
Data to predict on.

Returns
        -------
y : ndarray, shape (n_test_samples,)
Predicted class labels for each test data sample.
"""
        check_is_fitted(self, ['_X', '_y'])
        X = validate_data(self, X, reset=False)

if X.shape[1] != self.n_features_in_:
raise ValueError(
f"X has {X.shape[1]} features, "
f"but the model was trained with "
f"{self.n_features_in_} features."
)
distances = pairwise_distances(X, self._X, metric='euclidean')

nearest_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors]
# get the labels of the nearest neighbors
nearest_labels = self._y[nearest_neighbors]
# get the most common label among the nearest neighbors
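        # (Counter.most_common returns [(label, count), ...]; on a tie, the
        # label encountered first among the neighbors wins.)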
y_pred = np.array([Counter(labels).most_common(1)[0][0]
for labels in nearest_labels])
return y_pred

def score(self, X, y):
"""Calculate the score of the prediction.

Parameters
----------
X : ndarray, shape (n_samples, n_features)
Data to score on.
y : ndarray, shape (n_samples,)
            Target values.

Returns
        -------
score : float
Accuracy of the model computed for the (X, y) pairs.
"""
y_pred = self.predict(X)

return np.mean(y_pred == y)


class MonthlySplit(BaseCrossValidator):
"""CrossValidator based on monthly split.

Split data based on the given `time_col` (or default to index). Each split
corresponds to one month of data for the training and the next month of
data for the test.

Parameters
----------
time_col : str, defaults to 'index'
@@ -133,13 +140,10 @@ class MonthlySplit(BaseCrossValidator):
for which this column is not a datetime, it will raise a ValueError.
To use the index as column just set `time_col` to `'index'`.
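
    Examples
    --------
    A small illustrative example (toy data, not from the assignment tests):

    >>> import pandas as pd
    >>> dates = pd.date_range('2020-11-01', '2021-01-31', freq='D')
    >>> X = pd.DataFrame({'val': range(len(dates))}, index=dates)
    >>> MonthlySplit().get_n_splits(X)
    2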
"""

    def __init__(self, time_col='index'):  # noqa: D107
self.time_col = time_col

def get_n_splits(self, X, y=None, groups=None):
"""Return the number of splitting iterations in the cross-validator.

Parameters
----------
X : array-like of shape (n_samples, n_features)
@@ -149,17 +153,23 @@ def get_n_splits(self, X, y=None, groups=None):
Always ignored, exists for compatibility.
groups : array-like of shape (n_samples,)
Always ignored, exists for compatibility.

Returns
-------
n_splits : int
The number of splits.
"""
if self.time_col == 'index':
time_column = pd.Series(X.index)
else:
time_column = X[self.time_col]

if not np.issubdtype(time_column.dtype, np.datetime64):
raise ValueError("The time_col must be of datetime type.")

        # One split per pair of consecutive months, hence n_months - 1.
        return len(time_column.dt.to_period('M').unique()) - 1

    def split(self, X, y=None, groups=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like of shape (n_samples, n_features)
@@ -169,20 +179,28 @@ def split(self, X, y, groups=None):
Always ignored, exists for compatibility.
groups : array-like of shape (n_samples,)
Always ignored, exists for compatibility.

Yields
------
idx_train : ndarray
The training set indices for that split.
idx_test : ndarray
The testing set indices for that split.
"""
if self.time_col == 'index':
time_column = pd.Series(X.index)
else:
time_column = X[self.time_col]

if not np.issubdtype(time_column.dtype, np.datetime64):
raise ValueError("The time_col must be of datetime type.")

monthly_periods = time_column.dt.to_period('M')
unique_months = monthly_periods.unique()
unique_months = sorted(unique_months)
n_splits = self.get_n_splits(X, y, groups)
for i in range(n_splits):
idx_train = np.where(monthly_periods == unique_months[i])[0]
idx_test = np.where(monthly_periods == unique_months[i + 1])[0]
yield (
idx_train, idx_test
)
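

if __name__ == "__main__":
    # Quick, hedged smoke test (illustrative data only; not part of the
    # assignment's test suite).
    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(20, 2))
    y_demo = (X_demo[:, 0] > 0).astype(int)
    knn = KNearestNeighbors(n_neighbors=3).fit(X_demo, y_demo)
    print("KNN training accuracy:", knn.score(X_demo, y_demo))

    dates = pd.date_range("2020-11-01", "2021-03-31", freq="D")
    df_demo = pd.DataFrame({"val": np.arange(len(dates))}, index=dates)
    cv = MonthlySplit()
    print("n_splits:", cv.get_n_splits(df_demo))  # 5 months -> 4 splits
    for idx_train, idx_test in cv.split(df_demo):
        print(len(idx_train), "train /", len(idx_test), "test")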