# cluster_edges.py
import pandas as pd
import os
import helpers
import sys
from sklearn.cluster import KMeans
# Maps the raw CSV column names to the short names used by the rest of the
# module. Key order matters: load_edge selects the columns in this order.
FEAT_TRANSFORM = {
    'Points:0': 'x',
    'Points:1': 'y',
}

# Directory holding the edge CSV files (used by load_edge).
DATA_DIR = 'sample_csv'

# Suggested location for the cluster-assignment CSV.
OUT_FILE = './clusters.csv'

# Number of clusters requested from KMeans.
N_CLUSTERS = 5
def cluster_edges(input_dir, output_file):
    """Cluster the edge CSV files in ``input_dir`` and write the labels.

    Every entry of ``input_dir`` is loaded with ``load_edge``, sorted by
    ``x``, passed through ``helpers.axis_align_pandas`` and
    ``helpers.downsample`` and flattened into one feature row with
    ``frame_to_row``. The rows are clustered with k-means (``N_CLUSTERS``
    clusters) and the assignment is written to ``output_file`` as a CSV with
    the columns ``filename`` and ``cluster``.

    Files that contain no rows are skipped and do not appear in the output.

    Args:
        input_dir: Directory whose entries are read as edge CSV files.
        output_file: Destination path of the resulting CSV.

    Raises:
        ValueError: If no file in ``input_dir`` yields a non-empty edge.
    """
    kept_names = []
    edges = []
    # Sorted so the row order of the output does not depend on the order in
    # which the operating system happens to list the directory.
    for file_name in sorted(os.listdir(input_dir)):
        edge = load_edge(file_name, input_dir)
        if edge.empty:
            continue
        # Track the names that survive so the output columns stay the same
        # length as the cluster labels even when empty files were skipped.
        kept_names.append(file_name)
        edges.append(helpers.axis_align_pandas(edge.sort_values(by='x')))

    if not edges:
        raise ValueError('no non-empty edge files found in %s' % input_dir)

    # NOTE(review): each edge is axis-aligned a second time here, exactly as
    # in the previous implementation -- confirm whether that is intentional.
    downsampled_edges = [
        helpers.downsample(helpers.axis_align_pandas(edge), 1000)
        for edge in edges
    ]

    # Build the feature matrix in one step; DataFrame.append copied the whole
    # frame on every call and no longer exists in pandas 2.x.
    total_frame = pd.DataFrame([frame_to_row(edge) for edge in downsampled_edges])

    # ``n_jobs`` is no longer accepted by KMeans in current scikit-learn, so
    # it is not passed.
    kmeans = KMeans(n_clusters=N_CLUSTERS)
    clusters = kmeans.fit(total_frame).predict(total_frame)

    pd.DataFrame({
        'filename': kept_names,
        'cluster': clusters,
    }).to_csv(output_file, index=False)


def load_edge(file_name, data_dir=None):
    """Read one edge CSV and return its coordinates as ``x``/``y`` columns.

    Args:
        file_name: Name of the CSV file inside ``data_dir``.
        data_dir: Directory containing the file. Defaults to ``DATA_DIR``
            when omitted, which keeps single-argument calls working.

    Returns:
        A DataFrame holding only the columns named in ``FEAT_TRANSFORM``,
        renamed according to that mapping.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    raw = pd.read_csv(os.path.join(data_dir, file_name))
    return raw[list(FEAT_TRANSFORM)].rename(columns=FEAT_TRANSFORM)
def frame_to_row(in_frame):
    """Flatten an edge frame into a single labelled feature row.

    The row holds every ``y`` value followed by every ``x`` value, labelled
    by ``get_cols_from_frame``. The columns are selected by name so that
    each value sits under the matching label regardless of the column order
    of ``in_frame``.

    Args:
        in_frame: DataFrame with ``x`` and ``y`` columns.

    Returns:
        A Series whose index is ``get_cols_from_frame(in_frame)``.
    """
    values = pd.concat([in_frame['y'], in_frame['x']]).values
    return pd.Series(values, index=get_cols_from_frame(in_frame))


def get_cols_from_frame(in_frame):
    """Return the feature labels for ``in_frame``: all ``y<i>`` then all ``x<i>``.

    ``<i>`` is the string form of each index label of ``in_frame``.

    Args:
        in_frame: DataFrame whose index supplies the label suffixes.

    Returns:
        A list of strings, twice as long as the frame's index.
    """
    suffixes = in_frame.index.astype(str)
    return ['y' + suffix for suffix in suffixes] + ['x' + suffix for suffix in suffixes]
if __name__ == '__main__':
    # Usage: cluster_edges.py [input_dir [output_file]]
    # Omitted arguments fall back to DATA_DIR and OUT_FILE instead of
    # crashing with an IndexError.
    cli_args = sys.argv[1:]
    # os.path.join keeps relative paths anchored at the current directory
    # (as before) while leaving an absolute path untouched.
    input_dir = os.path.join('.', cli_args[0] if cli_args else DATA_DIR)
    output_file = cli_args[1] if len(cli_args) > 1 else OUT_FILE
    cluster_edges(input_dir, output_file)