Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Solve compatibility issues with scikit>=0.24 #34

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
numpy
scipy
scikit-learn>=0.22
scikit-learn>=0.24
pytest
nose
joblib
joblib
122 changes: 87 additions & 35 deletions spherecluster/spherical_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,53 +6,61 @@

from sklearn.cluster import KMeans

# from sklearn.cluster import _k_means
from sklearn.cluster import _k_means_fast as _k_means
from sklearn.cluster.k_means_ import (
from sklearn.cluster import _k_means_lloyd
from sklearn.cluster._kmeans import (
_check_sample_weight,
_init_centroids,
_labels_inertia,
_tolerance,
_validate_center_shape,
)
from sklearn.preprocessing import normalize
from sklearn.utils import check_array, check_random_state
from sklearn.utils.extmath import row_norms, squared_norm
from sklearn.utils.validation import _num_samples
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads


def _spherical_kmeans_single_lloyd(
X,
n_clusters,
centers_init,
sample_weight=None,
max_iter=300,
init="k-means++",
verbose=False,
x_squared_norms=None,
random_state=None,
tol=1e-4,
precompute_distances=True,
n_threads=1,
):
"""
Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
"""
random_state = check_random_state(random_state)

sample_weight = _check_sample_weight(sample_weight, X)
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

best_labels, best_inertia, best_centers = None, None, None

# init
centers = _init_centroids(
X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms
)
if verbose:
print("Initialization complete")

# Allocate memory to store the distances for each sample to its
# closer center for reallocation in case of ties
distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

# Buffers to avoid new allocations at each iteration.
centers = centers_init
centers_new = np.zeros_like(centers)
labels = np.full(X.shape[0], -1, dtype=np.int32)
labels_old = labels.copy()
weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype)
center_shift = np.zeros(n_clusters, dtype=X.dtype)

if sp.issparse(X):
lloyd_iter = _k_means_lloyd.lloyd_iter_chunked_sparse
_inertia = _k_means._inertia_sparse
else:
lloyd_iter = _k_means_lloyd.lloyd_iter_chunked_dense
_inertia = _k_means._inertia_dense

# iterations
for i in range(max_iter):
centers_old = centers.copy()
Expand All @@ -66,24 +74,11 @@ def _spherical_kmeans_single_lloyd(
sample_weight,
x_squared_norms,
centers,
precompute_distances=precompute_distances,
distances=distances,
)
lloyd_iter(X, sample_weight, x_squared_norms, centers, centers_new,
weight_in_clusters, labels, center_shift, n_threads)

# computation of the means
if sp.issparse(X):
centers = _k_means._centers_sparse(
X, sample_weight, labels, n_clusters, distances
)
else:
centers = _k_means._centers_dense(
X.astype(np.float),
sample_weight.astype(np.float),
labels,
n_clusters,
distances.astype(np.float),
)

centers, centers_new = centers_new, centers
# l2-normalize centers (this is the main contribution here)
centers = normalize(centers)

Expand Down Expand Up @@ -112,8 +107,6 @@ def _spherical_kmeans_single_lloyd(
sample_weight,
x_squared_norms,
best_centers,
precompute_distances=precompute_distances,
distances=distances,
)

return best_labels, best_inertia, best_centers, i + 1
Expand All @@ -122,6 +115,7 @@ def _spherical_kmeans_single_lloyd(
def spherical_k_means(
X,
n_clusters,
centers_init,
sample_weight=None,
init="k-means++",
n_init=10,
Expand Down Expand Up @@ -163,9 +157,6 @@ def spherical_k_means(
tol = _tolerance(X, tol)

if hasattr(init, "__array__"):
init = check_array(init, dtype=X.dtype.type, order="C", copy=True)
_validate_center_shape(X, n_clusters, init)

if n_init != 1:
warnings.warn(
"Explicit initial center position passed: "
Expand All @@ -186,7 +177,8 @@ def spherical_k_means(
labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd(
X,
n_clusters,
sample_weight,
centers_init=centers_init,
sample_weight=sample_weight,
max_iter=max_iter,
init=init,
verbose=verbose,
Expand Down Expand Up @@ -332,6 +324,51 @@ def __init__(
self.n_jobs = n_jobs
self.normalize = normalize

def _check_params(self, X):
    """Validate the estimator's hyper-parameters against ``X``.

    Reads ``n_jobs``, ``n_init``, ``max_iter``, ``n_clusters``, ``tol`` and
    ``init`` from ``self`` and stores the derived private attributes
    ``_n_threads``, ``_n_init`` and ``_tol``.

    Raises
    ------
    ValueError
        If ``n_init`` or ``max_iter`` is not strictly positive, if ``X`` has
        fewer rows than ``n_clusters``, or if ``init`` is neither
        ``'k-means++'``, ``'random'``, an array-like nor a callable.

    Warns
    -----
    FutureWarning
        If ``n_jobs`` is anything other than the ``'deprecated'`` sentinel.
    RuntimeWarning
        If explicit initial centers are given together with ``n_init != 1``.
    """
    # Thread count: a user-supplied n_jobs is still honoured, but flagged.
    if self.n_jobs == 'deprecated':
        self._n_threads = None
    else:
        warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
                      " removed in 1.0 (renaming of 0.25).", FutureWarning)
        self._n_threads = self.n_jobs
    self._n_threads = _openmp_effective_n_threads(self._n_threads)

    # Number of initialisations must be strictly positive.
    n_init = self.n_init
    if n_init <= 0:
        raise ValueError(
            f"n_init should be > 0, got {n_init} instead.")
    self._n_init = n_init

    # Iteration budget must be strictly positive.
    max_iter = self.max_iter
    if max_iter <= 0:
        raise ValueError(
            f"max_iter should be > 0, got {max_iter} instead.")

    # There must be at least as many samples as requested clusters.
    n_samples = X.shape[0]
    if n_samples < self.n_clusters:
        raise ValueError(f"n_samples={n_samples} should be >= "
                         f"n_clusters={self.n_clusters}.")

    # Tolerance is derived from X by the imported helper.
    self._tol = _tolerance(X, self.tol)

    # Initialisation strategy: array-like, callable, or a known keyword.
    init = self.init
    array_init = hasattr(init, '__array__')
    keyword_init = isinstance(init, str) and init in ["k-means++", "random"]
    if not (array_init or callable(init) or keyword_init):
        raise ValueError(
            f"init should be either 'k-means++', 'random', a ndarray or a "
            f"callable, got '{self.init}' instead.")

    # Explicit centers make repeated initialisations pointless.
    if array_init and self._n_init != 1:
        warnings.warn(
            f"Explicit initial center position passed: performing only"
            f" one init in {self.__class__.__name__} instead of "
            f"n_init={self._n_init}.", RuntimeWarning, stacklevel=2)
        self._n_init = 1


def fit(self, X, y=None, sample_weight=None):
"""Compute k-means clustering.

Expand All @@ -349,14 +386,29 @@ def fit(self, X, y=None, sample_weight=None):
"""
if self.normalize:
X = normalize(X)
self._check_params(X)

random_state = check_random_state(self.random_state)

# Validate init array
init = self.init
if hasattr(init, "__array__"):
init = check_array(init, dtype=X.dtype.type, order="C", copy=True)
self._validate_center_shape(X, init)

# TODO: add check that all data is unit-normalized
x_squared_norms = row_norms(X, squared=True)
centers_init = self._init_centroids(
X,
init=self.init,
random_state=random_state,
x_squared_norms=x_squared_norms
)

self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means(
X,
n_clusters=self.n_clusters,
centers_init=centers_init,
sample_weight=sample_weight,
init=self.init,
n_init=self.n_init,
Expand Down
2 changes: 1 addition & 1 deletion spherecluster/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@


def test_estimator_spherical_k_means():
    """Run scikit-learn's estimator conformance checks on SphericalKMeans."""
    # Pass an estimator instance, not the class: the instance form is the
    # one this change set targets (scikit-learn >= 0.24).
    return check_estimator(SphericalKMeans())
28 changes: 14 additions & 14 deletions spherecluster/von_mises_fisher_mixture.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,17 @@
from scipy.special import iv # modified Bessel function of first kind, I_v
from scipy.special import logsumexp

from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.cluster.k_means_ import _init_centroids, _tolerance, _validate_center_shape

from sklearn.cluster import KMeans
from sklearn.cluster._kmeans import _tolerance, _kmeans_plusplus
from sklearn.metrics.pairwise import cosine_distances
from sklearn.preprocessing import normalize
from sklearn.utils import check_array, check_random_state, as_float_array
from sklearn.utils.extmath import squared_norm
from sklearn.utils.validation import FLOAT_DTYPES
from sklearn.utils.validation import check_is_fitted

from . import spherical_kmeans
# _init_centroids

MAX_CONTENTRATION = 1e10

Expand Down Expand Up @@ -219,13 +220,9 @@ def _init_unit_centers(X, n_clusters, random_state, init):
return centers

elif init == "k-means++":
centers = _init_centroids(
X,
n_clusters,
"k-means++",
random_state=random_state,
x_squared_norms=np.ones((n_examples,)),
)
centers, _ = _kmeans_plusplus(X, n_clusters,
random_state=random_state,
x_squared_norms=np.ones((n_examples,)))

for cc in range(n_clusters):
centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])
Expand Down Expand Up @@ -530,9 +527,6 @@ def movMF(
tol = _tolerance(X, tol)

if hasattr(init, "__array__"):
init = check_array(init, dtype=X.dtype.type, copy=True)
_validate_center_shape(X, n_clusters, init)

if n_init != 1:
warnings.warn(
"Explicit initial center position passed: "
Expand Down Expand Up @@ -613,7 +607,7 @@ def movMF(
)


class VonMisesFisherMixture(BaseEstimator, ClusterMixin, TransformerMixin):
class VonMisesFisherMixture(KMeans):
"""Estimator for Mixture of von Mises Fisher clustering on the unit sphere.

Implements the algorithms (i) and (ii) from
Expand Down Expand Up @@ -820,6 +814,12 @@ def fit(self, X, y=None):
if self.normalize:
X = normalize(X)

# Validate init array
init = self.init
if hasattr(init, "__array__"):
init = check_array(init, dtype=X.dtype.type, order="C", copy=True)
self._validate_center_shape(X, init)

self._check_force_weights()
random_state = check_random_state(self.random_state)
X = self._check_fit_data(X)
Expand Down