Skip to content

UP my solution #173

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 117 additions & 75 deletions sklearn_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,142 +47,184 @@

to compute distances between 2 sets of samples.
"""

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import BaseCrossValidator

from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.metrics.pairwise import pairwise_distances


class KNearestNeighbors(BaseEstimator, ClassifierMixin):
    """K-nearest-neighbors classifier using a majority vote.

    Each test sample is assigned the most frequent class among its
    ``n_neighbors`` closest training samples, using the default metric of
    ``pairwise_distances``.

    Parameters
    ----------
    n_neighbors : int, default=1
        Number of nearest training samples that vote for each prediction.
        If it exceeds the number of training samples, all training samples
        vote.
    """

    def __init__(self, n_neighbors=1):
        """Store the number of neighbors; validation happens in ``fit``."""
        self.n_neighbors = n_neighbors

    def _check_n_neighbors(self):
        """Return ``n_neighbors`` after checking it is a positive integer."""
        k = self.n_neighbors
        if not isinstance(k, (int, np.integer)):
            raise TypeError(
                f"n_neighbors must be an integer, got {type(k).__name__}."
            )
        # Zero would make every vote empty (always predicting the first
        # class) and a negative value would slice off the *farthest*
        # neighbors, so both are rejected explicitly.
        if k < 1:
            raise ValueError(f"n_neighbors must be >= 1, got {k}.")
        return int(k)

    def fit(self, X, y):
        """Fit the classifier using the training data.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Training data.
        y : ndarray, shape (n_samples,)
            Target values.

        Returns
        -------
        self : object
            Fitted classifier.

        Raises
        ------
        TypeError
            If ``n_neighbors`` is not an integer.
        ValueError
            If ``n_neighbors`` is smaller than 1, or if ``X``/``y`` fail
            validation.
        """
        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self._check_n_neighbors()
        self.X_ = X
        self.y_ = y
        # Encode labels as integer codes so votes can be counted with
        # bincount whatever the original label dtype is.
        self.classes_, self.y_mapped_ = np.unique(y, return_inverse=True)
        self.n_features_in_ = X.shape[1]
        return self

    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Test samples.

        Returns
        -------
        y_pred : ndarray, shape (n_samples,)
            Predicted class labels.

        Raises
        ------
        ValueError
            If the number of features differs from the one seen in ``fit``.
        """
        check_is_fitted(self, ["X_", "y_mapped_", "n_features_in_"])
        X = check_array(X)

        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                f"Number of features in input ({X.shape[1]}) does not match "
                f"number of features seen during fit ({self.n_features_in_})."
            )

        # Re-validated here because set_params may have changed it after fit.
        k = self._check_n_neighbors()

        distances = pairwise_distances(X, self.X_)
        # A stable sort makes ties between equidistant neighbors resolve
        # deterministically in favor of the earliest training sample.
        nearest_indices = np.argsort(distances, axis=1, kind="stable")[:, :k]
        nearest_classes = self.y_mapped_[nearest_indices]

        # Count votes for all query rows at once: shift each row's class
        # codes into its own range, then use a single flat bincount.
        n_queries = X.shape[0]
        n_classes = len(self.classes_)
        row_offsets = np.arange(n_queries)[:, None] * n_classes
        votes = np.bincount(
            (nearest_classes + row_offsets).ravel(),
            minlength=n_queries * n_classes,
        ).reshape(n_queries, n_classes)

        # argmax keeps the first maximum, i.e. the smallest class on a tie.
        return self.classes_[votes.argmax(axis=1)]

    def score(self, X, y):
        """Calculate the accuracy of the classifier.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Test samples.
        y : ndarray, shape (n_samples,)
            True class labels.

        Returns
        -------
        score : float
            Mean accuracy of the predictions.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly one label per row of ``X``.
        """
        y_pred = self.predict(X)
        # Flatten so a column-vector y cannot broadcast against y_pred into
        # an (n, n) comparison and silently produce a wrong accuracy.
        y_true = np.ravel(np.asarray(y))
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"y has {y_true.shape[0]} labels but X has "
                f"{y_pred.shape[0]} samples."
            )
        return float(np.mean(y_pred == y_true))


class MonthlySplit(BaseCrossValidator):
    """Cross-validator based on monthly splits.

    Generates train-test splits where the training data is one calendar
    month and the test data is the next month present in the data. Months
    are paired in chronological order over the months actually observed,
    so if a calendar month has no samples the split pairs the months on
    either side of the gap.

    Parameters
    ----------
    time_col : str, default='index'
        Column of the input DataFrame used to split the data. This column
        must be of datetime type, otherwise a ValueError is raised when
        splitting. Set to ``'index'`` to use the DataFrame index.
    """

    def __init__(self, time_col='index'):
        """Initialize the cross-validator.

        Parameters
        ----------
        time_col : str, optional
            Column name of the DataFrame to use for splitting.
            Defaults to 'index'.
        """
        self.time_col = time_col

    def _get_time_index(self, X):
        """Return the timestamps selected by ``time_col`` as a DatetimeIndex.

        Raises
        ------
        ValueError
            If the index (for ``time_col='index'``) is not a DatetimeIndex,
            or if the named column is missing or not of datetime type.
        """
        if self.time_col == 'index':
            if not isinstance(X.index, pd.DatetimeIndex):
                raise ValueError("Index must be a DatetimeIndex "
                                 + "for time_col='index'.")
            return X.index

        if self.time_col not in X.columns:
            raise ValueError(f"Column '{self.time_col}'"
                             + " not found in DataFrame.")
        time_column = X[self.time_col]
        if not pd.api.types.is_datetime64_any_dtype(time_column):
            raise ValueError(f"Column '{self.time_col}' "
                             + "must be of datetime type.")
        return pd.DatetimeIndex(time_column)

    def _get_months(self, X):
        """Return per-sample month periods and the sorted distinct months."""
        months = self._get_time_index(X).to_period('M')
        # Missing timestamps (NaT) belong to no month; dropping them keeps
        # them from being counted as a fold of their own.
        unique_months = months.unique().dropna().sort_values()
        return months, unique_months

    def get_n_splits(self, X, y=None, groups=None):
        """Get the number of splits.

        Parameters
        ----------
        X : DataFrame
            Input data.
        y : None
            Ignored.
        groups : None
            Ignored.

        Returns
        -------
        n_splits : int
            Number of consecutive month pairs in the data, i.e. the number
            of distinct months minus one (never negative).
        """
        _, unique_months = self._get_months(X)
        # Zero or one distinct month cannot form any pair.
        return max(len(unique_months) - 1, 0)

    def split(self, X, y=None, groups=None):
        """Generate train-test splits.

        Parameters
        ----------
        X : DataFrame
            Input data.
        y : None
            Ignored.
        groups : None
            Ignored.

        Yields
        ------
        train_indices : ndarray
            Positional indices of the samples in the training month,
            in ascending order.
        test_indices : ndarray
            Positional indices of the samples in the following month,
            in ascending order.
        """
        months, unique_months = self._get_months(X)

        # Pair each observed month with the next one; flatnonzero already
        # returns ascending positions, so no extra sort is required.
        for train_month, test_month in zip(unique_months[:-1],
                                           unique_months[1:]):
            train_indices = np.flatnonzero(months == train_month)
            test_indices = np.flatnonzero(months == test_month)
            yield train_indices, test_indices