-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.py
65 lines (51 loc) · 2.42 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from numpy import mean
from numpy import absolute
from numpy import sqrt
# Cluster the PCA scores with K-means and attach both the scores and the
# resulting cluster labels to the original data.
# NOTE(review): `scores_pca` and `df` are not defined in the visible part of
# this script; presumably they come from an earlier PCA step run in the same
# session -- confirm before running this file on its own.
kmeans_pca = KMeans(n_clusters=6, init='k-means++', random_state=42)
kmeans_pca.fit(scores_pca)

# Name the component columns on the PCA frame itself before concatenating.
# Writing into `df_new.columns.values[-5:]` mutates the array behind an
# immutable Index: pandas does not refresh its cached column lookup, and the
# hard-coded 5 silently renames the wrong columns if the number of components
# ever changes.
pca_frame = pd.DataFrame(scores_pca)
pca_frame.columns = ['Component{}'.format(i) for i in range(pca_frame.shape[1])]

# reset_index(drop=True) gives `df` a fresh 0..n-1 index before the
# column-wise concat with the PCA frame.
df_new = pd.concat([df.reset_index(drop=True), pca_frame], axis=1)

# One integer cluster label per row, as assigned by the fitted model.
df_new['Segment K-means PCA'] = kmeans_pca.labels_

# Human-readable segment names. Keys 6 and 7 are unused with n_clusters=6 but
# are kept so the cluster count can be raised to 8 without editing the map.
df_new['Segment'] = df_new['Segment K-means PCA'].map({
    0: 'first',
    1: 'second',
    2: 'third',
    3: 'fourth',
    4: 'fifth',
    5: 'sixth',
    6: 'seventh',
    7: 'eighth',
})
print(df_new.head())
# Visualizing the clusters: scatter the first two PCA components, coloured by
# the K-means cluster label.
x_axis = df_new['Component0']
y_axis = df_new['Component1']
plt.figure(figsize=(10, 8))
# x and y must be passed by keyword: seaborn deprecated positional x/y in
# 0.11 and rejects them with a TypeError from 0.12 onwards.
# The palette lists one colour per cluster (six clusters).
sns.scatterplot(x=x_axis, y=y_axis, hue=df_new['Segment K-means PCA'],
                palette=['g', 'r', 'c', 'm', 'k', 'orange'])
# Alternative nine-colour palette kept for runs with more clusters:
# sns.scatterplot(x=x_axis, y=y_axis, hue=df_new['Segment K-means PCA'], palette=['g','r','c','m','lightcoral','orange','brown','lawngreen','orchid'])
plt.title("Clusters by PCA components")
plt.show()
# Write the clustered frame to the results folder; the row index is omitted.
# Disabled export of `df` kept for reference:
# df.to_csv(r'C:\Users\A K SINGH\Documents\Olfaction Project\Results\pcafive_c_new.csv', index = False)
df_new.to_csv(
    path_or_buf=r'C:\Users\A K SINGH\Documents\Olfaction Project\Results\kmeansfive_a_6clusters.csv',
    index=False,
)
# Cross-validated linear regression of the cluster label on `target`.
# NOTE(review): the response is a K-means cluster id, i.e. a nominal label
# treated here as a number -- confirm that a regression error on it is the
# quantity you actually want to report.

# define predictor and response variables
X = df_new[['target']]
y = df_new['Segment K-means PCA']

# define cross-validation method to use; the fixed random_state makes the
# shuffled folds identical for both scoring runs below
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# build linear regression model (single predictor)
model = LinearRegression()

# use k-fold CV to evaluate model; scikit-learn returns errors negated so
# that "greater is better", hence the absolute value when reporting
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

# view mean absolute error
print("MAE")
print(mean(absolute(scores)))

# RMSE has to be derived from squared errors. Taking the square root of the
# MAE scores (as was done before) yields sqrt(MAE), which is not an RMSE.
mse_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error',
                             cv=cv, n_jobs=-1)

# view root mean squared error
print("RMSE")
print(sqrt(mean(absolute(mse_scores))))