from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt
from numpy import linalg as LA
from sklearn import cluster
from PIL import Image, ImageOps
def resize_and_crop(img, size=(100, 100), crop_type='middle'):
    """Resize a PIL image to `size`, cropping the excess from the top,
    middle, or bottom depending on `crop_type`."""
    # Get the current and desired aspect ratios for the image
    img_ratio = img.size[0] / float(img.size[1])
    ratio = size[0] / float(size[1])
    # The image is scaled/cropped vertically or horizontally depending on the
    # ratio (Image.LANCZOS is the resampling filter formerly named ANTIALIAS)
    if ratio > img_ratio:
        img = img.resize(
            (size[0], int(round(size[0] * img.size[1] / img.size[0]))),
            Image.LANCZOS)
        # Crop at the top, middle or bottom
        if crop_type == 'top':
            box = (0, 0, img.size[0], size[1])
        elif crop_type == 'middle':
            box = (
                0,
                int(round((img.size[1] - size[1]) / 2)),
                img.size[0],
                int(round((img.size[1] + size[1]) / 2)))
        elif crop_type == 'bottom':
            box = (0, img.size[1] - size[1], img.size[0], img.size[1])
        else:
            raise ValueError('ERROR: invalid value for crop_type')
        img = img.crop(box)
    elif ratio < img_ratio:
        img = img.resize(
            (int(round(size[1] * img.size[0] / img.size[1])), size[1]),
            Image.LANCZOS)
        # Crop at the top, middle or bottom
        if crop_type == 'top':
            box = (0, 0, size[0], img.size[1])
        elif crop_type == 'middle':
            box = (
                int(round((img.size[0] - size[0]) / 2)),
                0,
                int(round((img.size[0] + size[0]) / 2)),
                img.size[1])
        elif crop_type == 'bottom':
            box = (
                img.size[0] - size[0],
                0,
                img.size[0],
                img.size[1])
        else:
            raise ValueError('ERROR: invalid value for crop_type')
        img = img.crop(box)
    else:
        # If the aspect ratios already match, resize only; no crop is needed
        img = img.resize((size[0], size[1]), Image.LANCZOS)
    return img
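
# A brief illustrative call (the file names below are hypothetical):
#
#   thumbnail = resize_and_crop(Image.open('photo.jpg'), size=(100, 100), crop_type='middle')
#   thumbnail.save('thumb.jpg')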
class Dataset(object):
    # Returns the data
    def get_data(self):
        return self.data

    # Returns the background data
    def get_bg(self):
        return self.bg

    # Returns the active data
    def get_active(self):
        return self.active

    # Returns the active labels
    def get_active_labels(self):
        return self.active_labels

    # Returns the PCA directions for the active set
    def get_pca_directions(self):
        return self.pca_directions

    # Returns the active set projected onto the PCA directions
    def get_active_pca_projected(self):
        return self.active_pca

    # Returns the affinity matrix between cPCA subspaces
    def get_affinity_matrix(self):
        return self.affinity_matrix

    # A helper method to standardize arrays (zero mean, unit variance per feature)
    def standardize(self, array):
        standardized_array = (array - np.mean(array, axis=0)) / np.std(array, axis=0)
        return np.nan_to_num(standardized_array)
"""
Initialization performs the following operations
1) Centering the active, background seperately
2) Standardize if to_standardize is specified as True
3) Calculate the covariance_matrices
"""
def __init__(self, to_standardize=True):
#from contrastive import PCA
# Housekeeping
self.pca_directions = None
self.bg_eig_vals = None
self.affinity_matrix = None
# Dataset sizes
self.n_active, _ = self.active.shape
self.n_bg, self.features_d = self.bg.shape
#Center the background data
self.bg = self.bg - np.mean(self.bg, axis=0)
if to_standardize: #Standardize if specified
self.bg = self.standardize(self.bg)
#Calculate the covariance matrix
self.bg_cov = self.bg.T.dot(self.bg)/(self.bg.shape[0]-1)
#Center the active data
self.active = self.active - np.mean(self.active, axis=0)
if to_standardize: #Standardize if specified
self.active = self.standardize(self.active)
#Calculate the covariance matrix
self.active_cov = self.active.T.dot(self.active)/(self.n_active-1)
#self.cpca = PCA()
#self.cpca.fit_transform(foreground=self.active, background=self.bg)
"""
Perfomes plain vanilla pca on the active dataset (TO DO: write the same code for background )
Not a part of init because this might be time consuming
"""
def pca_active(self, n_components = 2):
# Perform PCA only once (to save computation time)
if self.pca_directions is None:
#print("PCA is being perfomed on the dataset")
# Calculating the top eigen vectors on the covariance of the active dataset
w, v = LA.eig(self.active_cov)
# Sorting the vectors in the order of eigen values
idx = w.argsort()[::-1]
idx = idx[:n_components]
w = w[idx]
# Storing the top_pca_directions
self.pca_directions = v[:,idx]
# Storing the active dataset projected on the top_pca_directions
self.active_pca = self.active.dot(self.pca_directions)
else:
print("PCA has been previously perfomed on the dataset")
"""
Returns active and bg dataset projected in the cpca direction, as well as the top c_cpca eigenvalues indices.
If specified, it returns the top_cpca directions
"""
def cpca_alpha(self, n_components = 2, alpha=1, return_eigenvectors=False):
#return None, self.cpca.cpca_alpha(dataset=self.active,alpha=alpha), None
sigma = self.active_cov - alpha*self.bg_cov
w, v = LA.eig(sigma)
eig_idx = np.argpartition(w, -n_components)[-n_components:]
eig_idx = eig_idx[np.argsort(-w[eig_idx])]
v_top = v[:,eig_idx]
reduced_foreground = self.active.dot(v_top)
reduced_background = self.bg.dot(v_top)
reduced_foreground[:,0] = reduced_foreground[:,0]*np.sign(reduced_foreground[0,0])
reduced_foreground[:,1] = reduced_foreground[:,1]*np.sign(reduced_foreground[0,1])
if (return_eigenvectors):
#return eig_idx, reduced_foreground, reduced_background, v_top
print("WHat?")
return None
else:
#return eig_idx, reduced_foreground, reduced_background
return None, reduced_foreground, None
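
    # A brief illustrative call (the names below are hypothetical, not part of
    # this module):
    #
    #   idx, fg_2d, bg_2d = dataset.cpca_alpha(n_components=2, alpha=2.7)
    #   plt.scatter(fg_2d[:, 0], fg_2d[:, 1], c=dataset.get_active_labels())
    #
    # With alpha=0 the contrast matrix equals the active covariance, so the
    # projection coincides with ordinary PCA of the active data.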
"""
This function performs contrastive PCA using the alpha technique on the
active and background dataset. It automatically determines n_alphas=4 important values
of alpha up to based to the power of 10^(max_log_alpha=5) on spectral clustering
of the top subspaces identified by cPCA.
The final return value is the data projected into the top (n_components = 2)
subspaces, which can be plotted outside of this function
"""
def automated_cpca(self, n_alphas=4, max_log_alpha=5, n_components = 2, affinity_metric='determinant', exemplar_method='medoid'):
best_alphas, all_alphas, angles0, labels = self.find_spectral_alphas(n_components, n_alphas-1, max_log_alpha, affinity_metric, exemplar_method)
best_alphas = np.concatenate(([0], best_alphas)) #one of the alphas is always alpha=0
data_to_plot = []
for alpha in best_alphas:
_, r_active, r_bg = self.cpca_alpha(n_components=n_components, alpha=alpha)
data_to_plot.append((r_active, r_bg))
return data_to_plot, best_alphas
"""
This function performs contrastive PCA using the alpha technique on the
active and background dataset. It returns the cPCA-reduced data for all values of alpha specified,
both the active and background, as well as the list of alphas
"""
def manual_cpca(self, max_log_alpha=5, n_components = 2):
alphas = np.concatenate(([0],np.logspace(-1,max_log_alpha,40)))
data_to_plot = []
for alpha in alphas:
n, r_active, r_bg = self.cpca_alpha(n_components, alpha=alpha)
data_to_plot.append((r_active, r_bg))
return data_to_plot, alphas
"""
This method performs spectral clustering on the affinity matrix of subspaces
returned by contrastive pca, and returns (`=3) exemplar values of alpha
"""
def find_spectral_alphas(self, n_components=2, max_num=3, max_log_alpha=5, affinity_metric='determinant', exemplar_method='medoid'):
#if (self.affinity_matrix is None): #commented out because different kinds of affinity can be defined
self.create_affinity_matrix(n_components, max_log_alpha, affinity_metric)
affinity = self.affinity_matrix
spectral = cluster.SpectralClustering(n_clusters=max_num+1, affinity='precomputed')
alphas = np.concatenate(([0],np.logspace(-1,max_log_alpha,40)))
spectral.fit(affinity)
labels = spectral.labels_
best_alphas = list()
for i in range(max_num+1):
idx = np.where(labels==i)[0]
if not(0 in idx): #because we don't want to include the cluster that includes alpha=0
if exemplar_method=='smallest':
exemplar_idx = int(np.min(idx)) #min seems to work better than median though I don't know why exactly
elif exemplar_method=='medoid':
affinity_submatrix = affinity[idx][:, idx]
sum_affinities = np.sum(affinity_submatrix, axis=0)
exemplar_idx = idx[np.argmax(sum_affinities)]
else:
raise ValueError("Invalid specification of exemplar method")
best_alphas.append(alphas[exemplar_idx])
return np.sort(best_alphas), alphas, affinity[0,:], labels
"""
This method creates the affinity matrix of subspaces returned by contrastive pca
"""
def create_affinity_matrix(self, n_components=2, max_log_alpha=5, affinity_metric='determinant'):
from math import pi
alphas = np.concatenate(([0],np.logspace(-1,max_log_alpha,40)))
print(alphas)
subspaces = list()
k = len(alphas)
if affinity_metric=='principal':
affinity = pi/4*np.identity(k)
elif affinity_metric=='determinant':
affinity = 0.5*np.identity(k) #it gets doubled
for alpha in alphas:
n, space, _ = self.cpca_alpha(n_components, alpha=alpha)
q, r = np.linalg.qr(space)
subspaces.append(q)
for i in range(k):
for j in range(i+1,k):
q0 = subspaces[i]
q1 = subspaces[j]
u, s, v = np.linalg.svd(q0.T.dot(q1))
if affinity_metric=='principal':
angle = np.arccos(s[0])
affinity[i,j] = pi/2 - angle
elif affinity_metric=='determinant':
affinity[i,j] = np.prod(s)
affinity = affinity + affinity.T
self.affinity_matrix = np.nan_to_num(affinity)
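

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). Dataset never sets self.active or
# self.bg itself, so a small subclass is assumed here; the ToyDataset name and
# the synthetic data below are not part of the original module.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    class ToyDataset(Dataset):
        def __init__(self, active, bg, active_labels, to_standardize=True):
            self.active = active
            self.bg = bg
            self.active_labels = active_labels
            super(ToyDataset, self).__init__(to_standardize=to_standardize)

    rng = np.random.RandomState(0)
    # Background: structured noise that is shared with the active data
    bg = rng.normal(size=(400, 10)) * np.linspace(1, 3, 10)
    # Active: the same kind of noise plus a weak two-cluster signal
    active = rng.normal(size=(400, 10)) * np.linspace(1, 3, 10)
    labels = (rng.rand(400) > 0.5).astype(int)
    active[:, :2] += 2.0 * labels[:, None]

    ds = ToyDataset(active, bg, labels)
    projections, best_alphas = ds.automated_cpca(n_alphas=4, n_components=2)
    for (fg, _), alpha in zip(projections, best_alphas):
        plt.scatter(fg[:, 0], fg[:, 1], c=labels, s=5)
        plt.title('cPCA projection, alpha = %.2f' % alpha)
        plt.show()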