-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsurvey-clustering.py
360 lines (308 loc) · 14.6 KB
/
survey-clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
# an example code for clustering texts by their meaning, with visual examples.
# pretrained word embeddings for Spanish language was downloaded from: https://github.com/dccuchile/spanish-word-embeddings (the biggest one was taken)
# The word-embedding visualisation code was inspired by a tutorial: https://towardsdatascience.com/visualizing-word-embedding-with-pca-and-t-sne-961a692509f5
# This k-means clustering code was inspired by a tutorial: https://towardsdatascience.com/machine-learning-algorithms-part-9-k-means-example-in-python-f2ad05ed5203
# Author: Elmurod Kuriyozov (elmurod1202@gmail.com)
# Date: March 25, 2022
from sklearn.decomposition import PCA
from matplotlib import pyplot
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans
import time
# Tried using cache functions, so I don't have to load the word-embedding model every time I run the code:
from functools import lru_cache
# This helps to find the "elbow point" on an optimization curve:
# Run this if you don't have it:
# $ pip install kneed
from kneed import KneeLocator
import numpy as np
import csv
# # If you are just going to ignore warnings for a clear output, not recommended though:
# import warnings
# warnings.filterwarnings("ignore")
# A function to load and return a pretrained word-embedding model from a given file:
# (cached so the model is only loaded from disk once per process)
@lru_cache(maxsize=None)
def load_model(modelf):
    """Load a word2vec-format (non-binary) embedding model from *modelf*."""
    model = KeyedVectors.load_word2vec_format(modelf,binary=False)
    return model
# Model filename: This time it's Spanish pretrained word embedding.
modelf = "src/embeddings-l-model.vec"
# Name of file with words to be plotted, each word in a new line:
wordf = "input/answers.txt"
# PCA axes to plot on, the most relevant are [0,1] or [1,2] for 2D and [0,1,2] or [1,2,3] for 3D
axes = [0, 1]
# Name of the file that holds the list of Spanish stopwords, one word per line.
stopwords_file = "src/spanish-stopwords.txt"
# To differentiate groups in the graph, you can give the labels a corresponding color or font size
# e.g. words in the first group will be red, words in the second group will be blue, etc.
# Color of words in each group, uses default if too many groups
# Dark colors are good for matplotlib's white background, use hex or https://matplotlib.org/gallery/color/named_colors.html
# BUG FIX: matplotlib's "tab" palette has no "tab:lime" or "tab:yellow" entries,
# so those names raised ValueError; use the base named colors "lime" and "gold".
colors = ["tab:red", "tab:blue", "tab:green", "tab:orange",
          "tab:purple", "tab:olive", "tab:pink", "tab:cyan", "tab:gray", "lime", "tab:brown", "gold"]
defaultcolor = "black"
# Font sizes of words in each group (empty -> every group uses defaultsize)
sizes = []
defaultsize = 6
# A method to return a set of stopwords read from a given file:
def load_stopwords(file_name):
    """Read *file_name* (one stopword per line) and return the words as a set.

    Trailing whitespace/newlines are stripped from each line.
    """
    # The with-statement closes the file; the original's explicit close() was redundant.
    with open(file_name, encoding="utf-8") as file:
        lines = [line.rstrip() for line in file]
    return set(lines)
# A function to return average vector of given vectors:
def makeFeatureVec(words, model):
    """Average the 300-dim embedding vectors of *words* found in *model*.

    Out-of-vocabulary words are reported and skipped.  Returns a zero
    vector when none of the words are in the vocabulary (the original
    divided by zero in that case, producing NaNs).
    """
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((300,), dtype="float32")
    nwords = 0.
    # Loop over each word and add its feature vector to the total
    for word in words:
        # O(1) dict membership instead of rebuilding the whole vocabulary
        # list for every word (the original did `in list(model.index_to_key)`).
        if word in model.key_to_index:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])
        else:
            print("!!! Alert: OOV: ", word)
    # Divide the result by the number of words to get the average;
    # guard the all-OOV case to avoid division by zero.
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec
# A function to remove stopwords from a given multi-word expression:
def remove_stopwords(multiword, stopwords):
    """Return the words of *multiword* not present in *stopwords*, order kept."""
    return [token for token in multiword if token not in stopwords]
# A function that finds the optimal number of clusters using the elbow method:
def get_optimal_cluster_numbers(result):
    """Pick a cluster count for *result* (2-D point array) via the elbow method.

    Fits KMeans for k = 1..29, records each fit's WCSS (inertia), and
    returns the elbow of that curve as located by KneeLocator.  Also saves
    and shows the elbow plot as output/elbow_method_2d.png.
    """
    max_number_possible_clusters = 30
    optimal_cluster_numbers = 0
    # WCSS: within-cluster sum of squares, one value per candidate k
    wcss = []
    for i in range(1, max_number_possible_clusters):
        kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
        kmeans.fit(result)
        wcss.append(kmeans.inertia_)
    # Now let's find out the maximum curvature point using the elbow method:
    # For this we used an implementation: https://github.com/arvkevi/kneed
    kneedle = KneeLocator(range(1, max_number_possible_clusters), wcss, curve="convex", direction="decreasing")
    optimal_cluster_numbers = kneedle.elbow
    # Plot the WCSS curve with the chosen elbow marked by a dashed vertical line:
    pyplot.plot(range(1, max_number_possible_clusters), wcss)
    pyplot.title('Elbow Method')
    pyplot.xlabel('Number of clusters')
    pyplot.ylabel('WCSS')
    pyplot.vlines(optimal_cluster_numbers, pyplot.ylim()[0], pyplot.ylim()[1], linestyles='dashed')
    # Adding text inside the plot
    # NOTE(review): the fixed y=250 assumes a particular WCSS scale — TODO confirm
    pyplot.text(optimal_cluster_numbers+1, 250, 'K='+str(optimal_cluster_numbers), fontsize = 16)
    result_filename = "output/elbow_method_2d.png"
    pyplot.savefig(result_filename)
    print("Elbow method:" + result_filename)
    pyplot.show()
    pyplot.close()
    return optimal_cluster_numbers
# A function that plots the result of the clustering:
def plot2D_scatter(result):
    """Save a plain black 2D scatter of *result* on the configured axes."""
    xs = result[:, axes[0]]
    ys = result[:, axes[1]]
    pyplot.scatter(xs, ys, c="black", s=10)
    result_filename = "output/result_scatter_2d.png"
    pyplot.savefig(result_filename)
    print("2D output resulting scatter saved in file:" + result_filename)
    pyplot.close()
# A function that plots the scatter of the clustering, but with words grouped by their cluster:
def plot2D_dots(result, wordgroups, words):
    """Plot an 'o' marker per word, colored by its cluster, and save the figure.

    result     -- 2-D array of coordinates; row i belongs to words[i]
    wordgroups -- list of clusters, each a list of words
    words      -- ordered word list matching the rows of result
    """
    for g, group in enumerate(wordgroups):
        for word in group:
            # `word not in words` is the idiomatic form of `not word in words`
            if word not in words:
                continue
            i = words.index(word)
            # Create plot point at the word's coordinates
            coord = (result[i, axes[0]], result[i, axes[1]])
            color = colors[g] if g < len(colors) else defaultcolor
            size = sizes[g] if g < len(sizes) else defaultsize
            pyplot.annotate('o', xy=coord, color=color, fontsize=size)
    result_filename = "output/result_dots_grouped_2d.png"
    pyplot.savefig(result_filename)
    print("2D output resulting scatter with words saved in file:" + result_filename)
    pyplot.show()
    pyplot.close()
# A function that plots the result of the clustering, but with words grouped by their cluster:
def plot2D_words(result, wordgroups, words):
    """Scatter all points, then annotate each word in its cluster's color.

    result     -- 2-D array of coordinates; row i belongs to words[i]
    wordgroups -- list of clusters, each a list of words
    words      -- ordered word list matching the rows of result
    """
    pyplot.scatter(result[:, axes[0]], result[:, axes[1]], c="black", s=10)
    for g, group in enumerate(wordgroups):
        for word in group:
            # `word not in words` is the idiomatic form of `not word in words`
            if word not in words:
                continue
            i = words.index(word)
            # Create plot point labeled with the word itself
            coord = (result[i, axes[0]], result[i, axes[1]])
            color = colors[g] if g < len(colors) else defaultcolor
            size = sizes[g] if g < len(sizes) else defaultsize
            pyplot.annotate(word, xy=coord, color=color, fontsize=size)
    result_filename = "output/result_words_2d.png"
    pyplot.savefig(result_filename)
    print("2D output resulting scatter with words saved in file:" + result_filename)
    pyplot.show()
    pyplot.close()
# A function that plots the result of the clustering, with words grouped by their cluster in a 3D plot:
def plot3D(result, wordgroups, words):
    """Save a 3D scatter of *result* with each word labeled in its cluster color.

    Requires len(axes) >= 3; the caller sets axes = [0, 1, 2] beforehand.
    """
    fig = pyplot.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(result[:, axes[0]], result[:, axes[1]], result[:, axes[2]])
    for g, group in enumerate(wordgroups):
        for word in group:
            # `word not in words` is the idiomatic form of `not word in words`
            if word not in words:
                continue
            i = words.index(word)
            # Label the point with the word in its cluster's color
            color = colors[g] if g < len(colors) else defaultcolor
            size = sizes[g] if g < len(sizes) else defaultsize
            ax.text(result[i, axes[0]], result[i, axes[1]],
                    result[i, axes[2]], word, color=color, fontsize=size)
    result_filename = "output/result_words_3d.png"
    pyplot.savefig(result_filename)
    print("3D output result saved in file:" + result_filename)
    # pyplot.show()
    pyplot.close()
# A function that reads the texts from a given file, obtains their vectors and returns it
def get_words(wordf, model, stopwords):
    """Read words/expressions from *wordf* and resolve a vocab index for each.

    Single words are looked up directly.  Multi-word expressions (tokens
    joined with "_") are stripped of stopwords; if one word remains it is
    looked up, otherwise the average vector of the remaining words is
    computed and added to the model under the original expression.

    Returns (kept_words, {word: vocab_index}, model) — the model may have
    gained new entries for multi-word expressions.
    """
    words = []
    # Extract words to plot from file
    for word in open(wordf, "r", encoding="utf-8").read().split("\n"):
        if len(word) > 0:
            words.append(word)
    # Get word vectors from model
    vecs = {}
    words_new = []
    for count_word, word in enumerate(words):
        vec_done = False
        if "_" not in word:
            # This means it's a single word, so we get only its vector.
            # O(1) dict membership instead of rebuilding the vocab list per word.
            if word in model.key_to_index:
                vecs[word] = model.key_to_index[word]
                vec_done = True
                words_new.append(word)
            else:
                print("!!! Alert: OOV: ", word)
        else:
            # This means the expression is not a single word.
            # First we split it to separate words, then remove stopwords:
            expression = word.split("_")
            expression_chunks = remove_stopwords(expression, stopwords)
            print("Found a stopword: {}, un-stopworded it:{}".format(word, ' '.join(expression_chunks)))
            if len(expression_chunks) == 0:
                # BUG FIX: this case previously fell through into the
                # multi-word branch below and averaged an empty list
                # (division by zero / NaN vector added to the model).
                print("!!! Alert: OOV, could not handle un-stopwording: ", word)
            elif len(expression_chunks) == 1:
                # After removing stopwords it became a single word
                if expression_chunks[0] in model.key_to_index:
                    new_vec = model[expression_chunks[0]]
                    # Register the original expression as a new vocab entry:
                    new_vec_index = model.add_vector(word, new_vec)
                    print("new-vec_index:", new_vec_index)
                    vecs[word] = new_vec_index
                    vec_done = True
                    words_new.append(word)
                else:
                    print("!!! Alert: OOV: ", word)
            else:
                # Still multiple words: represent the expression by the
                # average vector of its remaining words, added to the model.
                new_vec_index = model.add_vector(word, makeFeatureVec(expression_chunks, model))
                vecs[word] = new_vec_index
                print("new-vec_index:", new_vec_index)
                vec_done = True
                words_new.append(word)
        if not vec_done:
            print("!!!!!!!!!!!!! One word could not get a vector:", word)
    return words_new, vecs, model
# A function that splits given wordlist into clusters of k numbers
def get_groups(vecs, optimalK, model):
    """K-means cluster the vectors of vecs' words into *optimalK* groups.

    vecs  -- dict mapping word -> vocab index; indexing the model with the
             dict iterates its keys, giving one embedding row per word in
             insertion order.
    Returns a list of optimalK lists of words.
    """
    # Assign groups using k-means clustering
    # (removed the dead `groups = []` that was immediately overwritten)
    estimator = KMeans(init='k-means++', n_clusters=optimalK, n_init=10)
    estimator.fit_predict(model[vecs])
    groups = [[] for n in range(optimalK)]
    for i, w in enumerate(vecs.keys()):
        group = estimator.labels_[i]
        groups[group].append(w)
    return groups
# The main script entry point that runs the whole pipeline:
# load model -> load stopwords -> vectorize answers -> PCA -> pick k ->
# cluster -> save CSV -> plot 2D and 3D.
if __name__ == '__main__':
    # Calculating the time it takes to process everything:
    start_time = time.time()
    print("Programm started.")
    print("Loading the pretrained model:")
    print("Please wait, it usually takes a few minutes...")
    # model = Word2Vec.load(modelf)
    model = load_model(modelf)
    print("Loading stopwords:")
    stopwords = load_stopwords(stopwords_file)
    print("Loading answer words from given file:")
    # Get groups by clustering; get_words() may add new vectors to the model
    # for multi-word expressions.
    words, vecs, model_new = get_words(wordf, model, stopwords)
    model = model_new
    # Indexing the model with a dict iterates its keys, so this fetches one
    # embedding row per kept word, in insertion order.
    coords = model_new[vecs]
    print("Words len: ", len(words))
    print("Vecs len: ", len(vecs))
    print("Model len: ", len(model))
    print("Plotting in 2D:")
    # Create 2D axes to plot on
    pca = PCA(n_components=max(axes)+1)
    result = pca.fit_transform(coords)
    # One problem that may arise is that you may need the optimal number of clusters, this can also be solved:
    # WCSS is defined as the sum of the squared distance between each member of the cluster and its centroid.
    # Now, let's try to find out the optimal number of clusters for the plot we have created using the elbow method.
    # To get the values used in the graph, we train multiple models using a different number of clusters
    # and storing the value of the intertia_ property (WCSS) every time.
    # We graph the relationship between the number of clusters and Within Cluster Sum of Squares (WCSS),
    # then we select the number of clusters where the change in WCSS begins to level off (elbow method).
    print("Obtaining the optimal cluster number:")
    optimalK = get_optimal_cluster_numbers(result)
    print("Obtained the optimal cluster number = {}".format(optimalK))
    # Now grouping the words:
    print("Grouping the words into clusters")
    groups = get_groups(vecs,optimalK, model_new)
    # Saving the resulting vectors in a csv file:
    # NOTE(review): the csv docs recommend open(..., newline='') to avoid
    # blank rows on Windows — TODO confirm and fix.
    result_file_name = "output/result.csv"
    with open(result_file_name, 'w', encoding='UTF8') as out:
        # create the csv writer
        writer = csv.writer(out, dialect='excel')
        header = ['word', 'group', 'vector_x', 'vector_y']
        # write the header
        writer.writerow(header)
        # Loop through each word in groups and write them to the file:
        for g, group in enumerate(groups):
            # 1-based cluster id in the CSV
            group_id = g+1
            for word in group:
                if not word in words:
                    continue
                i = words.index(word)
                data_row = [word, group_id, result[i, 0], result[i, 1]]
                # write a row to the csv file
                writer.writerow(data_row)
        # NOTE(review): redundant — the with-statement already closes the file.
        out.close()
    print("Resulting data saved in a file: ", result_file_name)
    print("Plotting clusters:")
    # Plot vectors on axes
    plot2D_scatter(result)
    plot2D_words(result, groups, words)
    plot2D_dots(result, groups, words)
    # Also plot the clusters in 3D:
    print("Plotting in 3D:")
    # Create 3D axes to plot on (rebinds the module-level `axes` config
    # so the plotting helpers pick up the third component)
    axes = [0, 1, 2]
    pca = PCA(n_components=max(axes)+1)
    result = pca.fit_transform(coords)
    plot3D(result, groups, words)
    print("Programm finished.")
    finish_time = time.time()
    exec_time = finish_time - start_time
    print("It took {} seconds to execute this.".format(round(exec_time)))