diff --git a/sklearn_questions.py b/sklearn_questions.py
index fa02e0d..d79009f 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -55,13 +55,12 @@
 
 from sklearn.model_selection import BaseCrossValidator
 
-from sklearn.utils.validation import check_X_y, check_is_fitted
-from sklearn.utils.validation import check_array
+from sklearn.utils.validation import check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.metrics.pairwise import pairwise_distances
 
 
-class KNearestNeighbors(BaseEstimator, ClassifierMixin):
+class KNearestNeighbors(ClassifierMixin, BaseEstimator):
     """KNearestNeighbors classifier."""
 
     def __init__(self, n_neighbors=1):  # noqa: D107
@@ -82,6 +81,16 @@ def fit(self, X, y):
         self : instance of KNearestNeighbors
             The current instance of the classifier
         """
+        X, y = self._validate_data(X, y, accept_sparse=True,
+                                   multi_output=False)
+        check_classification_targets(y)
+        self.classes_ = np.unique(y)
+        self.n_features_in_ = X.shape[1]
+        if len(self.classes_) < 2:
+            raise ValueError("Only 1 class is present.")
+        self.X_train_ = X
+        self.y_train_ = y
+
         return self
 
     def predict(self, X):
@@ -97,7 +106,16 @@ def predict(self, X):
         y : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-        y_pred = np.zeros(X.shape[0])
+        check_is_fitted(self, ['X_train_', 'y_train_'])
+        X = self._validate_data(X, accept_sparse=True, reset=False)
+        y_pred = np.zeros(X.shape[0], dtype=self.y_train_.dtype)
+        distances = pairwise_distances(X, self.X_train_, metric='euclidean')
+        nearest_i = np.argsort(distances, axis=1)[:, :self.n_neighbors]
+        nearest_l = self.y_train_[nearest_i]
+        for i, labels in enumerate(nearest_l):
+            unique_labels, counts = np.unique(labels, return_counts=True)
+            y_pred[i] = unique_labels[np.argmax(counts)]
+
         return y_pred
 
     def score(self, X, y):
@@ -115,7 +133,12 @@ def score(self, X, y):
         score : float
             Accuracy of the model computed for the (X, y) pairs.
         """
-        return 0.
+        check_is_fitted(self)
+        X = self._validate_data(X, accept_sparse=True, reset=False)
+        y = self._validate_data(y, ensure_2d=False, reset=False)
+        y_predict = self.predict(X)
+        acc = np.mean(y_predict == y)
+        return acc
 
 
 class MonthlySplit(BaseCrossValidator):
@@ -155,7 +178,19 @@ def get_n_splits(self, X, y=None, groups=None):
         n_splits : int
             The number of splits.
         """
-        return 0
+        X_cc = X.copy()
+        if self.time_col == 'index':
+            X_cc = X_cc.reset_index()
+
+        if not pd.api.types.is_datetime64_any_dtype(X_cc[self.time_col]):
+            raise ValueError(
+                f"The column '{self.time_col}' is not a datetime."
+            )
+
+        X_cc = X_cc.sort_values(by=self.time_col)
+        unique_months = X_cc[self.time_col].dt.to_period('M').unique()
+
+        return len(unique_months) - 1
 
     def split(self, X, y, groups=None):
         """Generate indices to split data into training and test set.
@@ -177,12 +212,15 @@ def split(self, X, y, groups=None):
         idx_test : ndarray
             The testing set indices for that split.
         """
+        X_cc = X.reset_index()
+        n_splits = self.get_n_splits(X, y, groups)
+        X_grp = X_cc.sort_values(by=self.time_col).groupby(
+            pd.Grouper(key=self.time_col, freq="M"))
+        idxs = [group.index for _, group in X_grp]
 
-        n_samples = X.shape[0]
-        n_splits = self.get_n_splits(X, y, groups)
         for i in range(n_splits):
-            idx_train = range(n_samples)
-            idx_test = range(n_samples)
+            idx_train = list(idxs[i])
+            idx_test = list(idxs[i+1])
             yield (
                 idx_train, idx_test
             )