From e76f7fd4706172a6c581dfe1e98bdb4176e320cc Mon Sep 17 00:00:00 2001 From: ljnel Date: Fri, 20 Dec 2024 19:16:09 +0100 Subject: [PATCH 1/2] UP my solution --- sklearn_questions.py | 65 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index fa02e0d..5dbca22 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -55,13 +55,12 @@ from sklearn.model_selection import BaseCrossValidator -from sklearn.utils.validation import check_X_y, check_is_fitted -from sklearn.utils.validation import check_array -from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_is_fitted, validate_data +from sklearn.utils.multiclass import unique_labels from sklearn.metrics.pairwise import pairwise_distances -class KNearestNeighbors(BaseEstimator, ClassifierMixin): +class KNearestNeighbors(ClassifierMixin, BaseEstimator): """KNearestNeighbors classifier.""" def __init__(self, n_neighbors=1): # noqa: D107 @@ -82,6 +81,10 @@ def fit(self, X, y): self : instance of KNearestNeighbors The current instance of the classifier """ + + self.X_train_, self.y_train_ = validate_data(self, X, y) + self.classes_ = unique_labels(y) + self.is_fitted_ = True return self def predict(self, X): @@ -97,7 +100,18 @@ def predict(self, X): y : ndarray, shape (n_test_samples,) Predicted class labels for each test data sample. """ - y_pred = np.zeros(X.shape[0]) + from collections import Counter + + check_is_fitted(self, ['X_train_', 'y_train_']) + X = validate_data(self, X, reset=False) + + distances = pairwise_distances(self.X_train_, X, metric="euclidean") + y_pred = np.empty(X.shape[0], dtype=self.y_train_.dtype) + for i in range(len(X)): + nearest_indices = np.argsort(distances[:, i])[:self.n_neighbors] + nearest_labels = self.y_train_[nearest_indices] + most_common_label = Counter(nearest_labels).most_common(1)[0][0] + y_pred[i] = most_common_label return y_pred def score(self, X, y): @@ -115,7 +129,10 @@ def score(self, X, y): score : float Accuracy of the model computed for the (X, y) pairs. """ - return 0. + from sklearn.metrics import accuracy_score + + y_pred = self.predict(X) + return accuracy_score(y, y_pred) class MonthlySplit(BaseCrossValidator): @@ -155,7 +172,14 @@ def get_n_splits(self, X, y=None, groups=None): n_splits : int The number of splits. """ - return 0 + if self.time_col != 'index': + X = X.set_index(self.time_col) + + if not isinstance(X.index, pd.DatetimeIndex): + X.index = pd.to_datetime(X.index) + + groups = X.groupby(by=[X.index.year, X.index.month]) + return len(groups.groups.keys())-1 def split(self, X, y, groups=None): """Generate indices to split data into training and test set. @@ -178,11 +202,24 @@ def split(self, X, y, groups=None): The testing set indices for that split. """ - n_samples = X.shape[0] - n_splits = self.get_n_splits(X, y, groups) - for i in range(n_splits): - idx_train = range(n_samples) - idx_test = range(n_samples) - yield ( - idx_train, idx_test + if self.time_col != 'index': + X = X.set_index(self.time_col) + + if not isinstance(X.index, pd.DatetimeIndex): + X.index = pd.to_datetime(X.index) + + groups = X.groupby(by=[X.index.year, X.index.month]) + n_splits = len(groups.groups.keys()) - 1 + + if n_splits < 1: + raise ValueError( + "Insufficient data to create splits based on datetime column" ) + + for i in range(n_splits): + idx_tr = X.index.get_indexer_for( + groups.groups[list(groups.groups.keys())[i]]) + idx_te = X.index.get_indexer_for( + groups.groups[list(groups.groups.keys())[i + 1]]) + + yield (idx_tr, idx_te) From ab3d4324329fc40f2be960144a9b89f10cdfb741 Mon Sep 17 00:00:00 2001 From: ljnel Date: Fri, 20 Dec 2024 19:23:37 +0100 Subject: [PATCH 2/2] UP my solution --- sklearn_questions.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 5dbca22..90bb67e 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -81,7 +81,6 @@ def fit(self, X, y): self : instance of KNearestNeighbors The current instance of the classifier """ - self.X_train_, self.y_train_ = validate_data(self, X, y) self.classes_ = unique_labels(y) self.is_fitted_ = True @@ -105,12 +104,12 @@ def predict(self, X): check_is_fitted(self, ['X_train_', 'y_train_']) X = validate_data(self, X, reset=False) - distances = pairwise_distances(self.X_train_, X, metric="euclidean") + dist = pairwise_distances(self.X_train_, X, metric="euclidean") y_pred = np.empty(X.shape[0], dtype=self.y_train_.dtype) for i in range(len(X)): - nearest_indices = np.argsort(distances[:, i])[:self.n_neighbors] - nearest_labels = self.y_train_[nearest_indices] - most_common_label = Counter(nearest_labels).most_common(1)[0][0] + idx_nearest = np.argsort(dist[:, i])[:self.n_neighbors] + labels = self.y_train_[idx_nearest] + most_common_label = Counter(labels).most_common(1)[0][0] y_pred[i] = most_common_label return y_pred @@ -201,7 +200,6 @@ def split(self, X, y, groups=None): idx_test : ndarray The testing set indices for that split. """ - if self.time_col != 'index': X = X.set_index(self.time_col)