From 29855e33f834664122674f7d9d70049a05be4210 Mon Sep 17 00:00:00 2001 From: Kshitij-Ambilduke Date: Fri, 20 Dec 2024 18:38:55 +0000 Subject: [PATCH 1/8] Fake message --- sklearn_questions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn_questions.py b/sklearn_questions.py index fa02e0d..f2c6a61 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -2,6 +2,7 @@ The goal of this assignment is to implement by yourself: + - a scikit-learn estimator for the KNearestNeighbors for classification tasks and check that it is working properly. - a scikit-learn CV splitter where the splits are based on a Pandas From 0ac1abed84b6fdadbe0210f3cc468b24336e7202 Mon Sep 17 00:00:00 2001 From: Kshitij-Ambilduke Date: Fri, 20 Dec 2024 18:58:12 +0000 Subject: [PATCH 2/8] partial --- sklearn_questions.py | 122 +++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 67 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index f2c6a61..791bd0c 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -83,6 +83,11 @@ def fit(self, X, y): self : instance of KNearestNeighbors The current instance of the classifier """ + X, y = check_X_y(X, y) + check_classification_targets(y) + self.X_train_ = X + self.y_train_ = y + self.classes_ = np.unique(y) return self def predict(self, X): @@ -98,8 +103,18 @@ def predict(self, X): y : ndarray, shape (n_test_samples,) Predicted class labels for each test data sample. """ - y_pred = np.zeros(X.shape[0]) - return y_pred + check_is_fitted(self) + X = check_array(X) + + y_pred = [] + for x in X: + distances = np.linalg.norm(self.X_train_ - x, axis=1) + neighbor_indices = np.argsort(distances)[:self.n_neighbors] + neighbor_labels = self.y_train_[neighbor_indices] + most_common_label = np.bincount(neighbor_labels).argmax() + y_pred.append(most_common_label) + + return np.array(y_pred) def score(self, X, y): """Calculate the score of the prediction. @@ -116,74 +131,47 @@ def score(self, X, y): score : float Accuracy of the model computed for the (X, y) pairs. """ - return 0. + y_pred = self.predict(X) + return np.mean(y_pred == y) -class MonthlySplit(BaseCrossValidator): - """CrossValidator based on monthly split. - - Split data based on the given `time_col` (or default to index). Each split - corresponds to one month of data for the training and the next month of - data for the test. - - Parameters - ---------- - time_col : str, defaults to 'index' - Column of the input DataFrame that will be used to split the data. This - column should be of type datetime. If split is called with a DataFrame - for which this column is not a datetime, it will raise a ValueError. - To use the index as column just set `time_col` to `'index'`. - """ - - def __init__(self, time_col='index'): # noqa: D107 - self.time_col = time_col - - def get_n_splits(self, X, y=None, groups=None): - """Return the number of splitting iterations in the cross-validator. - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Training data, where `n_samples` is the number of samples - and `n_features` is the number of features. - y : array-like of shape (n_samples,) - Always ignored, exists for compatibility. - groups : array-like of shape (n_samples,) - Always ignored, exists for compatibility. - Returns - ------- - n_splits : int - The number of splits. - """ - return 0 - - def split(self, X, y, groups=None): - """Generate indices to split data into training and test set. +class MonthlySplit(BaseCrossValidator): + """CrossValidator based on monthly split.""" - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Training data, where `n_samples` is the number of samples - and `n_features` is the number of features. - y : array-like of shape (n_samples,) - Always ignored, exists for compatibility. - groups : array-like of shape (n_samples,) - Always ignored, exists for compatibility. - - Yields - ------ - idx_train : ndarray - The training set indices for that split. - idx_test : ndarray - The testing set indices for that split. - """ + def __init__(self, time_col='index'): + self.time_col = time_col - n_samples = X.shape[0] - n_splits = self.get_n_splits(X, y, groups) - for i in range(n_splits): - idx_train = range(n_samples) - idx_test = range(n_samples) - yield ( - idx_train, idx_test - ) + def get_n_splits(self, X, y=None, groups=None): + """Return the number of splitting iterations in the cross-validator.""" + if self.time_col == 'index': + dates = X.index + else: + dates = X[self.time_col] + + if not pd.api.types.is_datetime64_any_dtype(dates): + raise ValueError("The column used for time-based splitting should be of datetime type.") + + months = dates.to_period("M").unique() + return len(months) - 1 + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set.""" + if self.time_col == 'index': + dates = X.index + else: + dates = X[self.time_col] + + if not pd.api.types.is_datetime64_any_dtype(dates): + raise ValueError("The column used for time-based splitting should be of datetime type.") + + # Ensure the index is of type Int64Index + # if isinstance(dates, pd.RangeIndex): + # dates = dates.astype('int64') + + months = dates.to_period("M").unique() + for i in range(len(months) - 1): + train_idx = dates[dates.to_period("M") == months[i]].index + test_idx = dates[dates.to_period("M") == months[i + 1]].index + yield train_idx, test_idx \ No newline at end of file From 4df5b3b582cb5bc81c3200f9b3c0fb40bb73ff61 Mon Sep 17 00:00:00 2001 From: Kshitij-Ambilduke Date: Fri, 20 Dec 2024 22:56:32 +0000 Subject: [PATCH 3/8] updated --- sklearn_questions.py | 185 +++++++++++++++++++++++++------------------ 1 file changed, 108 insertions(+), 77 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 791bd0c..25033a2 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -61,117 +61,148 @@ from sklearn.utils.multiclass import check_classification_targets from sklearn.metrics.pairwise import pairwise_distances - class KNearestNeighbors(BaseEstimator, ClassifierMixin): """KNearestNeighbors classifier.""" - def __init__(self, n_neighbors=1): # noqa: D107 + def __init__(self, n_neighbors=1): self.n_neighbors = n_neighbors def fit(self, X, y): - """Fitting function. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Data to train the model. - y : ndarray, shape (n_samples,) - Labels associated with the training data. - - Returns - ---------- - self : instance of KNearestNeighbors - The current instance of the classifier - """ + """Fit the model using X as training data and y as target values.""" X, y = check_X_y(X, y) check_classification_targets(y) self.X_train_ = X self.y_train_ = y self.classes_ = np.unique(y) + self.n_features_in_ = X.shape[1] # Set the n_features_in_ attribute return self def predict(self, X): - """Predict function. - - Parameters - ---------- - X : ndarray, shape (n_test_samples, n_features) - Data to predict on. - - Returns - ---------- - y : ndarray, shape (n_test_samples,) - Predicted class labels for each test data sample. - """ - check_is_fitted(self) + """Predict the class labels for the provided data.""" + check_is_fitted(self, attributes=["X_train_", "y_train_", "classes_", "n_features_in_"]) X = check_array(X) - y_pred = [] - for x in X: - distances = np.linalg.norm(self.X_train_ - x, axis=1) - neighbor_indices = np.argsort(distances)[:self.n_neighbors] + if X.shape[1] != self.n_features_in_: + raise ValueError(f"Number of features of the input must be {self.n_features_in_}, but the input has {X.shape[1]} features.") + + distances = pairwise_distances(X, self.X_train_) + y_pred = np.zeros(X.shape[0], dtype=int) + + for i in range(X.shape[0]): + neighbor_indices = np.argsort(distances[i])[:self.n_neighbors] neighbor_labels = self.y_train_[neighbor_indices] most_common_label = np.bincount(neighbor_labels).argmax() - y_pred.append(most_common_label) - - return np.array(y_pred) + y_pred[i] = most_common_label + + return self.classes_[y_pred] def score(self, X, y): - """Calculate the score of the prediction. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Data to score on. - y : ndarray, shape (n_samples,) - target values. - - Returns - ---------- - score : float - Accuracy of the model computed for the (X, y) pairs. - """ + """Return the mean accuracy on the given test data and labels.""" + y = check_array(y, ensure_2d=False, dtype=int) y_pred = self.predict(X) - return np.mean(y_pred == y) - class MonthlySplit(BaseCrossValidator): - """CrossValidator based on monthly split.""" + """CrossValidator based on monthly split. + + Split data based on the given `time_col` (or default to index). Each split + corresponds to one month of data for the training and the next month of + data for the test. + + Parameters + ---------- + time_col : str, defaults to 'index' + Column of the input DataFrame that will be used to split the data. This + column should be of type datetime. If split is called with a DataFrame + for which this column is not a datetime, it will raise a ValueError. + To use the index as column just set `time_col` to `'index'`. + """ def __init__(self, time_col='index'): self.time_col = time_col def get_n_splits(self, X, y=None, groups=None): - """Return the number of splitting iterations in the cross-validator.""" - if self.time_col == 'index': - dates = X.index - else: - dates = X[self.time_col] + """Return the number of splitting iterations in the cross-validator. - if not pd.api.types.is_datetime64_any_dtype(dates): - raise ValueError("The column used for time-based splitting should be of datetime type.") - - months = dates.to_period("M").unique() - return len(months) - 1 + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + y : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + groups : array-like of shape (n_samples,) + Always ignored, exists for compatibility. - def split(self, X, y=None, groups=None): - """Generate indices to split data into training and test set.""" + Returns + ------- + n_splits : int + The number of splits. + """ + _, time_col_unique = self.get_n_splits_col(X) + return len(time_col_unique) - 1 + + def get_n_splits_col(self, X): + """Get the time column and unique values of the time column. + + Parameters + ---------- + X : DataFrame + Data to split. + + Returns + ------- + time_col : pd.DatetimeIndex + The time column of the input data. + time_col_unique : pd.PeriodIndex + Unique values of the time column. + """ if self.time_col == 'index': - dates = X.index + if not isinstance(X.index, pd.DatetimeIndex): + raise TypeError( + f"The column '{self.time_col}' is not a datetime." + ) + time_col = X.index else: - dates = X[self.time_col] + if not pd.api.types.is_datetime64_any_dtype(X[self.time_col]): + raise ValueError( + f"The column '{self.time_col}' is not a datetime." + ) + time_col = pd.to_datetime(X[self.time_col]) - if not pd.api.types.is_datetime64_any_dtype(dates): - raise ValueError("The column used for time-based splitting should be of datetime type.") + if not isinstance(time_col, pd.DatetimeIndex): + time_col = pd.DatetimeIndex(time_col) + time_col_unique = time_col.to_period("M").unique() + return time_col, time_col_unique - # Ensure the index is of type Int64Index - # if isinstance(dates, pd.RangeIndex): - # dates = dates.astype('int64') + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. - months = dates.to_period("M").unique() - for i in range(len(months) - 1): - train_idx = dates[dates.to_period("M") == months[i]].index - test_idx = dates[dates.to_period("M") == months[i + 1]].index - yield train_idx, test_idx \ No newline at end of file + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + y : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + groups : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + + Yields + ------ + idx_train : ndarray + The training set indices for that split. + idx_test : ndarray + The testing set indices for that split. + """ + time_col, time_col_unique = self.get_n_splits_col(X) + n_splits = self.get_n_splits(X) + time_col_unique = sorted(time_col_unique) + for i in range(n_splits): + train_mask = time_col.to_period('M').isin([time_col_unique[i]]) + idx_train = np.where(train_mask)[0] + test_mask = time_col.to_period('M').isin([time_col_unique[i + 1]]) + idx_test = np.where(test_mask)[0] + + yield idx_train, idx_test \ No newline at end of file From 72b0bc67dffe43aee528f3e39fd163e99bef31c4 Mon Sep 17 00:00:00 2001 From: Kshitij-Ambilduke Date: Fri, 20 Dec 2024 23:12:30 +0000 Subject: [PATCH 4/8] updated --- sklearn_questions.py | 198 +++++++++++++++++++++++++++---------------- 1 file changed, 125 insertions(+), 73 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 25033a2..7d64865 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -2,7 +2,6 @@ The goal of this assignment is to implement by yourself: - - a scikit-learn estimator for the KNearestNeighbors for classification tasks and check that it is working properly. - a scikit-learn CV splitter where the splits are based on a Pandas @@ -48,6 +47,7 @@ to compute distances between 2 sets of samples. """ + import numpy as np import pandas as pd @@ -55,52 +55,98 @@ from sklearn.base import ClassifierMixin from sklearn.model_selection import BaseCrossValidator +from sklearn.preprocessing import LabelEncoder + from sklearn.utils.validation import check_X_y, check_is_fitted -from sklearn.utils.validation import check_array -from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import validate_data +from sklearn.utils.multiclass import unique_labels from sklearn.metrics.pairwise import pairwise_distances -class KNearestNeighbors(BaseEstimator, ClassifierMixin): + +class KNearestNeighbors(ClassifierMixin, BaseEstimator): """KNearestNeighbors classifier.""" - def __init__(self, n_neighbors=1): + def __init__(self, n_neighbors=1): self.n_neighbors = n_neighbors def fit(self, X, y): - """Fit the model using X as training data and y as target values.""" - X, y = check_X_y(X, y) - check_classification_targets(y) - self.X_train_ = X - self.y_train_ = y - self.classes_ = np.unique(y) - self.n_features_in_ = X.shape[1] # Set the n_features_in_ attribute + """Fitting function. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Data to train the model. + y : ndarray, shape (n_samples,) + Labels associated with the training data. + + Returns + ---------- + self : instance of KNearestNeighbors + The current instance of the classifier + """ + self.classes_ = unique_labels(y) + X, y = validate_data(self, X, y, reset=True) + + self.label_encoder_ = LabelEncoder() + self.X_ = X + + self.y_ = self.label_encoder_.fit_transform(y) + self.is_fitted_ = True + return self def predict(self, X): - """Predict the class labels for the provided data.""" - check_is_fitted(self, attributes=["X_train_", "y_train_", "classes_", "n_features_in_"]) - X = check_array(X) - - if X.shape[1] != self.n_features_in_: - raise ValueError(f"Number of features of the input must be {self.n_features_in_}, but the input has {X.shape[1]} features.") - - distances = pairwise_distances(X, self.X_train_) + """Predict function. + + Parameters + ---------- + X : ndarray, shape (n_test_samples, n_features) + Data to predict on. + + Returns + ---------- + y : ndarray, shape (n_test_samples,) + Predicted class labels for each test data sample. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False, dtype=float) + y_pred = np.zeros(X.shape[0], dtype=int) - for i in range(X.shape[0]): - neighbor_indices = np.argsort(distances[i])[:self.n_neighbors] - neighbor_labels = self.y_train_[neighbor_indices] - most_common_label = np.bincount(neighbor_labels).argmax() - y_pred[i] = most_common_label - - return self.classes_[y_pred] + distance_mat = pairwise_distances(X, self.X_).argsort(axis=1) + + index_min_dist = distance_mat[:, : self.n_neighbors] + + for ind, row in enumerate(index_min_dist): + val = self.y_[row] + nearest_neigh = np.bincount(val).argmax() + y_pred[ind] = nearest_neigh + + y_pred = self.label_encoder_.inverse_transform(y_pred) + + return y_pred def score(self, X, y): - """Return the mean accuracy on the given test data and labels.""" - y = check_array(y, ensure_2d=False, dtype=int) + """Calculate the score of the prediction. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Data to score on. + y : ndarray, shape (n_samples,) + target values. + + Returns + ---------- + score : float + Accuracy of the model computed for the (X, y) pairs. + """ + X, y = check_X_y(X, y) + y_pred = self.predict(X) - return np.mean(y_pred == y) + acc = (y_pred == y).sum() / len(y) + return acc class MonthlySplit(BaseCrossValidator): @@ -119,7 +165,7 @@ class MonthlySplit(BaseCrossValidator): To use the index as column just set `time_col` to `'index'`. """ - def __init__(self, time_col='index'): + def __init__(self, time_col="index"): self.time_col = time_col def get_n_splits(self, X, y=None, groups=None): @@ -140,43 +186,20 @@ def get_n_splits(self, X, y=None, groups=None): n_splits : int The number of splits. """ - _, time_col_unique = self.get_n_splits_col(X) - return len(time_col_unique) - 1 - def get_n_splits_col(self, X): - """Get the time column and unique values of the time column. + if not self.time_col == "index": + if np.dtype(X[self.time_col]) != np.dtype("datetime64[ns]"): + raise ValueError("Time column should be a datetime object") + X_mem = X.set_index(self.time_col).copy() + else: + X_mem = X.copy() + if X_mem.index.dtype != np.dtype("datetime64[ns]"): + raise ValueError("Time column should be a datetime object") - Parameters - ---------- - X : DataFrame - Data to split. + n_split = len(X_mem.resample("ME")) - 1 + return n_split - Returns - ------- - time_col : pd.DatetimeIndex - The time column of the input data. - time_col_unique : pd.PeriodIndex - Unique values of the time column. - """ - if self.time_col == 'index': - if not isinstance(X.index, pd.DatetimeIndex): - raise TypeError( - f"The column '{self.time_col}' is not a datetime." - ) - time_col = X.index - else: - if not pd.api.types.is_datetime64_any_dtype(X[self.time_col]): - raise ValueError( - f"The column '{self.time_col}' is not a datetime." - ) - time_col = pd.to_datetime(X[self.time_col]) - - if not isinstance(time_col, pd.DatetimeIndex): - time_col = pd.DatetimeIndex(time_col) - time_col_unique = time_col.to_period("M").unique() - return time_col, time_col_unique - - def split(self, X, y=None, groups=None): + def split(self, X, y, groups=None): """Generate indices to split data into training and test set. Parameters @@ -196,13 +219,42 @@ def split(self, X, y=None, groups=None): idx_test : ndarray The testing set indices for that split. """ - time_col, time_col_unique = self.get_n_splits_col(X) - n_splits = self.get_n_splits(X) - time_col_unique = sorted(time_col_unique) + if isinstance(X, pd.Series): + X = pd.DataFrame(X) + + n_splits = self.get_n_splits(X, y, groups) + + if not self.time_col == "index": + if np.dtype(X[self.time_col]) != np.dtype("datetime64[ns]"): + raise ValueError("Time column should be a datetime object") + X_ = X.set_index(self.time_col).copy() + else: + X_ = X.copy() + if X_.index.dtype != np.dtype("datetime64[ns]"): + raise ValueError("Time column should be a datetime object") + + month_split = pd.unique(X_.to_period("M").index) + month_split = pd.Series(month_split) + + month_split = month_split.apply( + lambda x: "{}-{}".format(x.year, str(x.month).zfill(2)) + ) + + month_split.sort_values(inplace=True, ignore_index=True) + + + X_mem = X_.copy().sort_index() + + X_.reset_index(names="date", inplace=True) + for i in range(n_splits): - train_mask = time_col.to_period('M').isin([time_col_unique[i]]) - idx_train = np.where(train_mask)[0] - test_mask = time_col.to_period('M').isin([time_col_unique[i + 1]]) - idx_test = np.where(test_mask)[0] + mem_id_train = X_mem[: month_split[i]].index + + X_mem.drop(mem_id_train, inplace=True) + + mem_id_test = X_mem[: month_split[i + 1]].index + + idx_train = X_.index[(X_["date"].isin(mem_id_train))].to_list() + idx_test = X_.index[(X_["date"].isin(mem_id_test))].to_list() - yield idx_train, idx_test \ No newline at end of file + yield (idx_train, idx_test) \ No newline at end of file From 9a49e420f2d79d4b3edb7179294b6d2fda07bbe5 Mon Sep 17 00:00:00 2001 From: Kshitij-Ambilduke Date: Fri, 20 Dec 2024 23:13:27 +0000 Subject: [PATCH 5/8] updated --- sklearn_questions.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 7d64865..5b14d43 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -67,7 +67,7 @@ class KNearestNeighbors(ClassifierMixin, BaseEstimator): """KNearestNeighbors classifier.""" - def __init__(self, n_neighbors=1): + def __init__(self, n_neighbors=1): self.n_neighbors = n_neighbors def fit(self, X, y): @@ -119,7 +119,7 @@ def predict(self, X): index_min_dist = distance_mat[:, : self.n_neighbors] for ind, row in enumerate(index_min_dist): - val = self.y_[row] + val = self.y_[row] nearest_neigh = np.bincount(val).argmax() y_pred[ind] = nearest_neigh @@ -165,7 +165,7 @@ class MonthlySplit(BaseCrossValidator): To use the index as column just set `time_col` to `'index'`. """ - def __init__(self, time_col="index"): + def __init__(self, time_col="index"): self.time_col = time_col def get_n_splits(self, X, y=None, groups=None): @@ -242,14 +242,13 @@ def split(self, X, y, groups=None): month_split.sort_values(inplace=True, ignore_index=True) - X_mem = X_.copy().sort_index() X_.reset_index(names="date", inplace=True) for i in range(n_splits): mem_id_train = X_mem[: month_split[i]].index - + X_mem.drop(mem_id_train, inplace=True) mem_id_test = X_mem[: month_split[i + 1]].index @@ -257,4 +256,4 @@ def split(self, X, y, groups=None): idx_train = X_.index[(X_["date"].isin(mem_id_train))].to_list() idx_test = X_.index[(X_["date"].isin(mem_id_test))].to_list() - yield (idx_train, idx_test) \ No newline at end of file + yield (idx_train, idx_test) From 3caa47678e9793507c31d1d8b8a3cd2752a5199d Mon Sep 17 00:00:00 2001 From: Kshitij-Ambilduke Date: Fri, 20 Dec 2024 23:16:50 +0000 Subject: [PATCH 6/8] updated --- sklearn_questions.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn_questions.py b/sklearn_questions.py index 5b14d43..90a7733 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -68,6 +68,11 @@ class KNearestNeighbors(ClassifierMixin, BaseEstimator): """KNearestNeighbors classifier.""" def __init__(self, n_neighbors=1): + """Fitting function. + + Dummy + """ + self.n_neighbors = n_neighbors def fit(self, X, y): @@ -85,6 +90,7 @@ def fit(self, X, y): self : instance of KNearestNeighbors The current instance of the classifier """ + self.classes_ = unique_labels(y) X, y = validate_data(self, X, y, reset=True) @@ -109,6 +115,7 @@ def predict(self, X): y : ndarray, shape (n_test_samples,) Predicted class labels for each test data sample. """ + check_is_fitted(self) X = validate_data(self, X, reset=False, dtype=float) @@ -142,6 +149,7 @@ def score(self, X, y): score : float Accuracy of the model computed for the (X, y) pairs. """ + X, y = check_X_y(X, y) y_pred = self.predict(X) @@ -166,6 +174,11 @@ class MonthlySplit(BaseCrossValidator): """ def __init__(self, time_col="index"): + """Fitting function. + + Dummy + """ + self.time_col = time_col def get_n_splits(self, X, y=None, groups=None): From cec2b1cd2ed087dbd67a401542793b0c6b9816f4 Mon Sep 17 00:00:00 2001 From: Kshitij-Ambilduke Date: Fri, 20 Dec 2024 23:21:17 +0000 Subject: [PATCH 7/8] updated --- sklearn_questions.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index 90a7733..f82e982 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -68,11 +68,7 @@ class KNearestNeighbors(ClassifierMixin, BaseEstimator): """KNearestNeighbors classifier.""" def __init__(self, n_neighbors=1): - """Fitting function. - - Dummy - """ - + """Fitting function.Dummy""" self.n_neighbors = n_neighbors def fit(self, X, y): @@ -90,7 +86,6 @@ def fit(self, X, y): self : instance of KNearestNeighbors The current instance of the classifier """ - self.classes_ = unique_labels(y) X, y = validate_data(self, X, y, reset=True) @@ -115,7 +110,6 @@ def predict(self, X): y : ndarray, shape (n_test_samples,) Predicted class labels for each test data sample. """ - check_is_fitted(self) X = validate_data(self, X, reset=False, dtype=float) @@ -149,7 +143,6 @@ def score(self, X, y): score : float Accuracy of the model computed for the (X, y) pairs. """ - X, y = check_X_y(X, y) y_pred = self.predict(X) @@ -174,11 +167,7 @@ class MonthlySplit(BaseCrossValidator): """ def __init__(self, time_col="index"): - """Fitting function. - - Dummy - """ - + """Fitting function.Dummy""" self.time_col = time_col def get_n_splits(self, X, y=None, groups=None): @@ -199,7 +188,6 @@ def get_n_splits(self, X, y=None, groups=None): n_splits : int The number of splits. """ - if not self.time_col == "index": if np.dtype(X[self.time_col]) != np.dtype("datetime64[ns]"): raise ValueError("Time column should be a datetime object") From 0d2b1b61b5e441911eb4b0ddeb1e0bbdcb4b4da7 Mon Sep 17 00:00:00 2001 From: Kshitij-Ambilduke Date: Fri, 20 Dec 2024 23:22:33 +0000 Subject: [PATCH 8/8] updated --- sklearn_questions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index f82e982..9759b09 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -68,7 +68,7 @@ class KNearestNeighbors(ClassifierMixin, BaseEstimator): """KNearestNeighbors classifier.""" def __init__(self, n_neighbors=1): - """Fitting function.Dummy""" + """Fitting function.Dummy.""" self.n_neighbors = n_neighbors def fit(self, X, y): @@ -167,7 +167,7 @@ class MonthlySplit(BaseCrossValidator): """ def __init__(self, time_col="index"): - """Fitting function.Dummy""" + """Fitting function.Dummy.""" self.time_col = time_col def get_n_splits(self, X, y=None, groups=None):