\documentclass[compress]{beamer}
\setbeamercolor{normal text}{fg=black}
\beamertemplatesolidbackgroundcolor{white}
\setbeamercovered{transparent, still covered={\opaqueness<1->{0}}, again covered={\opaqueness<1->{30}}}
\usecolortheme[named=black]{structure}
\definecolor{links}{HTML}{98AFC7}
\hypersetup{colorlinks,linkcolor=,urlcolor=links}
\usepackage{caption}
\captionsetup{labelformat=empty}
\setbeamertemplate{navigation symbols}{}
%\usefonttheme{structurebold}
\usepackage[scaled]{helvet}
\renewcommand*\familydefault{\sfdefault} %% Only if the base font of the document is to be sans serif
\usepackage[T1]{fontenc}
\usepackage{setspace}
%\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{Sweave}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{verbatim}
\usepackage{amssymb}
\usepackage{wrapfig}
\def\labelitemi{\textemdash}
\setbeamertemplate{frametitle}{
\begin{centering}
\vskip15pt
\insertframetitle
\par
\end{centering}
}
\title[DS]{\scalebox{.20}{\includegraphics{specialk.png}}\\ $K$-means Clustering}
\author[Sood]{gaurav~sood\\\href{http://www.gsood.com}{http://gsood.com}\\
\href{https://twitter.com/soodoku}{twitter} \textbf{|} \href{http://github.com/soodoku}{github}}
\large
\date[2015]{\today}
\subject{LearnDS}
\begin{document}
\newcommand{\multilineR}[1]{\begin{tabular}[b]{@{}r@{}}#1\end{tabular}}
\newcommand{\multilineL}[1]{\begin{tabular}[b]{@{}l@{}}#1\end{tabular}}
\newcommand{\multilineC}[1]{\begin{tabular}[b]{@{}c@{}}#1\end{tabular}}
\newenvironment{large_enum}{
\Large
\begin{itemize}
\setlength{\itemsep}{7pt}
\setlength{\parskip}{0pt}
\setlength{\parsep}{0pt}
}{\end{itemize}}
\begin{comment}
setwd(paste0(githubdir, "data-science/ds6/"))
tools::texi2dvi("kmeans.tex", pdf=TRUE,clean=TRUE)
setwd(basedir)
\end{comment}
\frame
{
\titlepage
}
\frame{
\frametitle{Unsupervised Learning}
\begin{large_enum}
\item[-]<2> Everything is dimension reduction
\item[-]<3> In supervised learning, labels supervise dimension reduction\\
\item[-]<4> For instance, regression is about finding a low dimensional representation of $Y$
\item[-]<5> Supervised Learning $\sim$ Given apples and oranges, learn the traits of apples vs. oranges
\item[-]<6> Unsupervised Learning $\sim$ Given a bunch of spherical fruits, optimally describe the types of fruits
\end{large_enum}
}
\frame{
\frametitle{Ways to Think About Unsupervised Learning}
\begin{large_enum}
\item[-]<2>Learning the probability model of the data $p(x_n|x_1,...,x_{n-1})$
\item[-]<3>\textbf{Applications:} Outlier detection, Data compression
\item[-]<4>Find groups of rows that are similar to each other and dissimilar to rows in other groups
\item[-]<5>Find groups of columns that are similar to each other and dissimilar to columns in other groups
\item[-]<6>\textbf{Applications:} Group movies by ratings, Segment shoppers
\end{large_enum}
}
\frame{
\frametitle{Solutions}
\begin{large_enum}
\item[-]<1-3>Two kinds of methods:
\begin{enumerate}
\item[-]<2->Principal components analysis
\item[-]<3->Clustering
\end{enumerate}
\item[-]<4>Clustering seeks to partition the data into similar subgroups
\item[-]<5-7>Two popular methods:
\begin{enumerate}
\item[-]<6-> Hierarchical clustering (computationally expensive)
\item[-]<7> $k$-means clustering (pre-specify $k$)
\end{enumerate}
\end{large_enum}
}
\frame{
\only<1>{\scriptsize{Source: \href{http://research.microsoft.com/en-us/um/people/cmbishop/prml/}{Pattern Recognition and Machine Learning}}}
\only<1>{\center{\includegraphics{kmeans-ex.png}}}
}
\frame{
\frametitle{$k$-Means Clustering}
\begin{large_enum}
\item[-]<1-3>$k$-means: Assume that we must split data into $k$ clusters
\begin{enumerate}
\item[-]<2-5>Each observation belongs to one cluster
\item[-]<3-5>No observation belongs to more than one cluster
\end{enumerate}
\item[-]<4>Find the partition that minimizes within-cluster variation summed over all $k$
clusters
\item[-]<5>Measure within-cluster variation by the squared Euclidean distance between all pairs of observations in a cluster, scaled by cluster size (rewritten in centroid form on the next slide)\\\normalsize
\begin{equation}
\min_{C_1,\ldots,C_K} \sum_{k=1}^{K} \frac{1}{|C_k|} \sum_{i, i' \in C_k} \sum_{j=1}^{p} (x_{ij} - x_{i'j})^2
\end{equation}
\end{large_enum}
}
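\frame{
\frametitle{Objective in Centroid Form}
A standard identity (e.g., James et al.): with cluster means $\bar{x}_{kj} = \frac{1}{|C_k|}\sum_{i \in C_k} x_{ij}$,
\begin{equation*}
\frac{1}{|C_k|} \sum_{i, i' \in C_k} \sum_{j=1}^{p} (x_{ij} - x_{i'j})^2 = 2 \sum_{i \in C_k} \sum_{j=1}^{p} (x_{ij} - \bar{x}_{kj})^2
\end{equation*}
So minimizing pairwise within-cluster distances is the same as minimizing squared distances to cluster centroids, which is what Lloyd's algorithm exploits.
}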
\frame{
\frametitle{$k$-Means (Lloyd's) Algorithm}
\begin{large_enum}
\item[-]<2>Randomly assign observations to one of $k$ clusters
\item[-]<3-5>Iterate:
\begin{enumerate}
\item[-]<4-> For each of the $k$ clusters, compute the centroid
\item[-]<5-> Assign each observation to the cluster whose centroid is closest (see the R sketch on the next slide)
\end{enumerate}
\end{large_enum}
\vspace{5cm}
}
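\begin{frame}[fragile]
\frametitle{Lloyd's Algorithm: A Sketch in R}
A minimal sketch of the algorithm above (illustrative only: \texttt{lloyd\_kmeans} is a made-up name, empty clusters are not handled, and in practice you would call the built-in \texttt{kmeans()}):
\scriptsize
\begin{verbatim}
lloyd_kmeans <- function(X, k, max_iter = 100) {
  n <- nrow(X)
  cl <- sample(1:k, n, replace = TRUE)   # random partition init
  for (iter in 1:max_iter) {
    # step 1: centroid of each of the k clusters
    centroids <- t(sapply(1:k, function(j)
      colMeans(X[cl == j, , drop = FALSE])))
    # step 2: squared distance of every point to every centroid
    d2 <- sapply(1:k, function(j)
      rowSums((X - matrix(centroids[j, ], n, ncol(X),
                          byrow = TRUE))^2))
    new_cl <- max.col(-d2)               # closest centroid wins
    if (all(new_cl == cl)) break         # assignments stable
    cl <- new_cl
  }
  list(cluster = cl, centers = centroids)
}
\end{verbatim}
\end{frame}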
\frame{
\only<1-6>{\scriptsize{Source: James et al. 2015}}
\only<1>{\center{\includegraphics{kmeanspic1.png}}}\pause
\only<2>{\center{\includegraphics{kmeanspic2.png}}}\pause
\only<3>{\center{\includegraphics{kmeanspic3.png}}}\pause
\only<4>{\center{\includegraphics{kmeanspic4.png}}}\pause
\only<5>{\center{\includegraphics{kmeanspic5.png}}}\pause
\only<6>{\center{\includegraphics{kmeanspic.png}}}
}
\frame{
\frametitle{$k$-Means Algorithm}
\begin{large_enum}
\item[-]<0>Randomly assign observations to one of $k$ clusters
\item[-]<0>Iterate:
\begin{enumerate}
\item[-]<1-> For each of the $k$ clusters, compute the centroid
\item[-]<1-> Assign each observation to the cluster whose centroid is closest
\end{enumerate}
\item[-]<2>Why does it work?
\item[-]<3>It doesn't, always. The objective decreases at every step, so the algorithm converges, but possibly only to a local minimum.
\item[-]<4->Initialization:
\begin{enumerate}
\item[-]<5->Forgy: Randomly choose $k$ observations and set them as centroids.
\item[-]<6->Random Partition: Assign each observation randomly to one of the clusters.
\item[-]<7->Run an alternate clustering algorithm on a small sample and use the resulting centroids as initial centroids
\item[-]<8> Pick dispersed points as centroids, e.g., $k$-means++ and its variations (R sketch on the next slide)
\end{enumerate}
\end{large_enum}
}
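\begin{frame}[fragile]
\frametitle{$k$-means++ Seeding: A Sketch in R}
A sketch of $k$-means++ style seeding, per the last item above (illustrative: \texttt{kmeanspp\_init} is a made-up name):
\scriptsize
\begin{verbatim}
kmeanspp_init <- function(X, k) {
  n <- nrow(X)
  centers <- X[sample(n, 1), , drop = FALSE]  # first: uniform draw
  while (nrow(centers) < k) {
    # squared distance from each point to its nearest center so far
    d2 <- apply(X, 1, function(x)
      min(rowSums((centers - matrix(x, nrow(centers), ncol(X),
                                    byrow = TRUE))^2)))
    # next center drawn with probability proportional to d2
    centers <- rbind(centers, X[sample(n, 1, prob = d2), ])
  }
  centers
}
# seed the standard algorithm with the dispersed centers:
# km <- kmeans(X, centers = kmeanspp_init(X, 3))
\end{verbatim}
\end{frame}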
\frame{
\frametitle{Distance Between Clusters}
\begin{large_enum}
\item[-]<1>Complete Linkage\\\normalsize
Farthest distance between a pair of points, one from each cluster
\item[-]<2>Single Linkage\\\normalsize
Closest pair of points, one from each cluster
\item[-]<3>Average Linkage\\\normalsize
Average distance over all cross-cluster pairs
\item[-]<4>Centroid Linkage\\\normalsize
Distance between cluster centroids; can produce \emph{inversions}\\
Used in genomics
\item[-]<5>Complete and average linkage are the most commonly used (\texttt{hclust} sketch on the next slide)
\end{large_enum}
}
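\begin{frame}[fragile]
\frametitle{Linkage Choices in R}
The linkages above map onto the \texttt{method} argument of base R's \texttt{hclust()} (sketch; assumes a numeric matrix \texttt{X}):
\scriptsize
\begin{verbatim}
d <- dist(X)                       # pairwise Euclidean distances
hc_complete <- hclust(d, method = "complete")
hc_single   <- hclust(d, method = "single")
hc_average  <- hclust(d, method = "average")
hc_centroid <- hclust(d^2, method = "centroid")  # wants squared d
cutree(hc_complete, k = 3)         # cut the dendrogram: 3 clusters
\end{verbatim}
\end{frame}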
\frame{
\frametitle{Practical Issues}
\begin{large_enum}
\item[-]<1-4>Choice of Similarity Measure
\begin{enumerate}
\item[-]<2->Scaling Matters
\item[-]<3->Jaccard distance (can be approximated quickly via minhashing and locality-sensitive hashing)
\item[-]<4->Correlation-based measures (the sign of the correlation may matter)
\end{enumerate}
\item[-]<5>High-dimensional data is hard; adaptive metrics such as DANN (discriminant adaptive nearest neighbor) can help
\item[-]<6-8>Choosing $k$:
\begin{enumerate}
\item[-]<7->Calculate average distance to centroid for multiple $k$
\item[-]<8>Plot them, look for the \emph{knee} (R sketch after this slide)\\
\visible<8>{\scalebox{0.5}{\includegraphics{knee.jpg}}}
\end{enumerate}
\end{large_enum}
}
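\begin{frame}[fragile]
\frametitle{Choosing $k$: Elbow Plot in R}
A sketch of the knee heuristic with the built-in \texttt{kmeans()} (assumes a numeric matrix \texttt{X}):
\scriptsize
\begin{verbatim}
# total within-cluster sum of squares for k = 1, ..., 10
wss <- sapply(1:10, function(k)
  kmeans(X, centers = k, nstart = 20)$tot.withinss)
plot(1:10, wss, type = "b",
     xlab = "k", ylab = "Total within-cluster SS")
# pick the k near the bend (knee) of the curve
\end{verbatim}
\end{frame}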
\frame{
\frametitle{A(n)alyst Chooses $k$, contd.}
\begin{large_enum}
\item[-]<1-5>Calinski-Harabasz (CH) Index:
\begin{enumerate}
\item[-]<2->Between-cluster, $B(K) = \sum_{k=1}^{K} n_k \lVert \bar{X}_k - \bar{X} \rVert^2$, with $n_k = |C_k|$
\item[-]<3->Within-cluster, $W(K) = \sum_{k=1}^{K} \sum_{i \in C_k} \lVert X_i - \bar{X}_k \rVert^2$
\item[-]<4->Maximize between-cluster variation, minimize within-cluster variation
\item[-]<5->$\text{CH}(K) = \frac{B(K)/(K-1)}{W(K)/(n-K)}$
\end{enumerate}
\item[-]<6-8>Gap Statistic (Tibshirani):
\begin{enumerate}
\item[-]<6->Compare observed $W(K)$ to $W_{\text{unif}}(K)$, its expectation under a uniform reference distribution
\item[-]<7->$\text{Gap}(K) = \log W_{\text{unif}}(K) - \log W(K)$; pick the $K$ that maximizes it
\item[-]<8->Estimate $W_{\text{unif}}(K)$ by simulation (R sketch on the next slide).
\end{enumerate}
\end{large_enum}
}
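\begin{frame}[fragile]
\frametitle{CH Index and Gap Statistic in R}
A sketch under the definitions above (\texttt{ch\_index} is a made-up helper; \texttt{clusGap} is from the \texttt{cluster} package):
\scriptsize
\begin{verbatim}
# CH index from a kmeans() fit on n observations
ch_index <- function(km, n) {
  K <- length(km$size)
  (km$betweenss / (K - 1)) / (km$tot.withinss / (n - K))
}
ch <- sapply(2:10, function(k)
  ch_index(kmeans(X, centers = k, nstart = 20), nrow(X)))

# gap statistic; W_unif(K) estimated from B = 50 simulations
library(cluster)
gap <- clusGap(X, FUNcluster = kmeans, K.max = 10, B = 50)
\end{verbatim}
\end{frame}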
\frame{
\frametitle{Running Time}
\begin{large_enum}
\item[-]<1> $O(kn)$ distance computations per iteration.
\item[-]<2> But the number of iterations to convergence can be very large; the worst case is superpolynomial.
\item[-]<3> In practice, though, it converges quickly, and running time is effectively polynomial.
\item[-]<4-6> Big (Long) Data Solutions:
\begin{enumerate}
\item[-]<5->Bradley-Fayyad-Reina (BFR)
\item[-]<6->CURE
\end{enumerate}
\item[-]<7-8>BFR
\begin{enumerate}
\item[-]<7->Assumes clusters are normally distributed around a centroid in Euclidean space.
\item[-]<8->Exploits that to quantify the likelihood that a point belongs to a given cluster
\end{enumerate}
\end{large_enum}
}
\end{document}