From f33cc5ea8ec39c40cabb1e1c81d05e70ba9d2c50 Mon Sep 17 00:00:00 2001 From: ojyassine Date: Fri, 20 Dec 2024 23:47:15 +0100 Subject: [PATCH] UP my solution --- sklearn_questions.py | 55 +++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index fa02e0d..ea8bb64 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -55,13 +55,13 @@ from sklearn.model_selection import BaseCrossValidator -from sklearn.utils.validation import check_X_y, check_is_fitted -from sklearn.utils.validation import check_array +from sklearn.utils.validation import validate_data, check_is_fitted from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.multiclass import unique_labels from sklearn.metrics.pairwise import pairwise_distances -class KNearestNeighbors(BaseEstimator, ClassifierMixin): +class KNearestNeighbors(ClassifierMixin, BaseEstimator): """KNearestNeighbors classifier.""" def __init__(self, n_neighbors=1): # noqa: D107 @@ -82,6 +82,13 @@ def fit(self, X, y): self : instance of KNearestNeighbors The current instance of the classifier """ + X, y = validate_data(self, X, y) + check_classification_targets(y) + self.classes_ = unique_labels(y) + self.n_features_in_ = X.shape[1] + self.X_ = X + self.y_ = y + self.is_fitted_ = True return self def predict(self, X): @@ -97,7 +104,17 @@ def predict(self, X): y : ndarray, shape (n_test_samples,) Predicted class labels for each test data sample. """ - y_pred = np.zeros(X.shape[0]) + check_is_fitted(self) + X = validate_data(self, X, reset=False) + y_pred = [] + for i in range(X.shape[0]): + dists = pairwise_distances(X[i].reshape(1, -1), self.X_) + nearest_indexes = np.argsort(dists, axis=1)[0][:self.n_neighbors] + vals, counts = np.unique( + self.y_[nearest_indexes], return_counts=True + ) + y_pred.append(vals[np.argmax(counts)]) + y_pred = np.array(y_pred) return y_pred def score(self, X, y): @@ -115,7 +132,11 @@ def score(self, X, y): score : float Accuracy of the model computed for the (X, y) pairs. """ - return 0. + check_is_fitted(self) + X, y = validate_data(self, X, y, ensure_2d=False, reset=False) + y_pred = self.predict(X) + accuracy = np.mean(y_pred == y) + return accuracy class MonthlySplit(BaseCrossValidator): @@ -155,7 +176,13 @@ def get_n_splits(self, X, y=None, groups=None): n_splits : int The number of splits. """ - return 0 + X = X.reset_index() + if not pd.api.types.is_datetime64_any_dtype(X[self.time_col]): + raise ValueError( + f"The column '{self.time_col}' is not a datetime." + ) + n_splits = X[self.time_col].dt.to_period('M').unique() + return len(n_splits) - 1 def split(self, X, y, groups=None): """Generate indices to split data into training and test set. @@ -177,12 +204,14 @@ def split(self, X, y, groups=None): idx_test : ndarray The testing set indices for that split. """ - - n_samples = X.shape[0] + X = X.reset_index() n_splits = self.get_n_splits(X, y, groups) + X_group = ( + X.sort_values(by=self.time_col) + .groupby(pd.Grouper(key=self.time_col, freq="ME")) + ) + indexes = [group.index for _, group in X_group] for i in range(n_splits): - idx_train = range(n_samples) - idx_test = range(n_samples) - yield ( - idx_train, idx_test - ) + index_train = list(indexes[i]) + index_test = list(indexes[i + 1]) + yield (index_train, index_test)