Skip to content

UP my solution #152

New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 72 additions & 13 deletions sklearn_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ def fit(self, X, y):
self : instance of KNearestNeighbors
The current instance of the classifier
"""
X, y = check_X_y(X, y)
self.X_train_ = X
self.y_train_ = y
check_classification_targets(y)
self.n_features_in_ = X.shape[1]
self.classes_ = np.unique(y)
return self

def predict(self, X):
Expand All @@ -97,7 +103,16 @@ def predict(self, X):
y : ndarray, shape (n_test_samples,)
Predicted class labels for each test data sample.
"""
y_pred = np.zeros(X.shape[0])
check_is_fitted(self)
X = check_array(X)
y_pred = np.zeros(X.shape[0], dtype=self.y_train_.dtype)
dist = pairwise_distances(X, self.X_train_, metric='minkowski')
nearest_ind = np.argsort(dist, axis=1)[:, :self.n_neighbors]
nearest_lab = self.y_train_[nearest_ind]
for i, label in enumerate(nearest_lab):
unique_labels, counts = np.unique(label, return_counts=True)
y_pred[i] = unique_labels[np.argmax(counts)]

return y_pred

def score(self, X, y):
Expand All @@ -115,7 +130,12 @@ def score(self, X, y):
score : float
Accuracy of the model computed for the (X, y) pairs.
"""
return 0.
check_is_fitted(self)
X, y = check_X_y(X, y)
y_pred = self.predict(X)
accuracy = np.mean(y_pred == y)

return accuracy


class MonthlySplit(BaseCrossValidator):
Expand Down Expand Up @@ -155,9 +175,24 @@ def get_n_splits(self, X, y=None, groups=None):
n_splits : int
The number of splits.
"""
return 0

def split(self, X, y, groups=None):
if not isinstance(X, type(pd.DataFrame())):
time_data = pd.DataFrame({'date': X.index, 'val': X.values})
time_data['date'] = pd.to_datetime(time_data['date'])
elif self.time_col == 'index' and 'date' not in X.columns[0]:
time_data = X.reset_index().copy()
time_data = time_data.rename(
columns={'index': 'date'}, inplace=False)
else:
time_data = X.copy()
if 'date' not in time_data.columns[0]:
time_data = time_data.rename({self.time_col: 'date'})
unique_months = pd.to_datetime(
time_data['date']).dt.strftime('%b-%Y').unique()
n_splits = len(unique_months) - 1

return n_splits

def split(self, X, y=None, groups=None):
"""Generate indices to split data into training and test set.

Parameters
Expand All @@ -177,12 +212,36 @@ def split(self, X, y, groups=None):
idx_test : ndarray
The testing set indices for that split.
"""

n_samples = X.shape[0]
n_splits = self.get_n_splits(X, y, groups)
if self.time_col != 'index':
if not isinstance(X[self.time_col].iloc[0],
type(pd.Timestamp('now'))):
raise ValueError('Index must be of type datetime')
else:
if not isinstance(X.index[0], type(pd.Timestamp('now'))):
raise ValueError('Index must be of type datetime')
if not isinstance(X, type(pd.DataFrame())):
time_data = pd.DataFrame({'date': X.index, 'val': X.values})
time_data['date'] = pd.to_datetime(time_data['date'])
elif self.time_col == 'index':
time_data = X.reset_index().copy()
time_data = time_data.rename(columns={'index': 'date'})
else:
time_data = X.copy()
if 'date' not in time_data.columns[0]:
time_data = time_data.rename(columns={self.time_col: 'date'},
inplace=False)
n_splits = self.get_n_splits(time_data, y, groups)
time_data['period'] = pd.to_datetime(
time_data['date']).dt.strftime('%b-%Y')

periods = np.unique(np.sort(pd.to_datetime(time_data['period'],
format='%b-%Y')))
time_data['period'] = pd.to_datetime(
time_data['period'], format='%b-%Y')
time_data = time_data.reset_index()
for i in range(n_splits):
idx_train = range(n_samples)
idx_test = range(n_samples)
yield (
idx_train, idx_test
)
idx_train = list(
time_data[time_data['period'] == periods[i]].index)
idx_test = list(
time_data[time_data['period'] == periods[i + 1]].index)
yield (idx_train, idx_test)
Loading