\documentclass[compress, black]{beamer}
\setbeamercolor{normal text}{fg=black}
\beamertemplatesolidbackgroundcolor{white}
\usecolortheme[named=black]{structure}
\usepackage{caption}
\captionsetup{labelformat=empty}
\setbeamertemplate{navigation symbols}{}
%\usefonttheme{structurebold}
\usepackage[scaled]{helvet}
\renewcommand*\familydefault{\sfdefault} %% Only if the base font of the document is to be sans serif
\usepackage[T1]{fontenc}
\usepackage{setspace}
%\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{verbatim}
\usepackage{amssymb}
\usepackage{wrapfig}
\usefonttheme[onlymath]{serif}
\usepackage{cmbright}
\usepackage[normalem]{ulem}
\def\labelitemi{\textemdash}
\setbeamertemplate{frametitle}{
\begin{centering}
\vskip15pt
\insertframetitle
\par
\end{centering}
}
\title[DS]{Data Science}
\author[Sood]{Gaurav~Sood}
\large
\date[2015]{Spring 2015}
\subject{LearnDS}
\begin{document}
\newcommand{\multilineR}[1]{\begin{tabular}[b]{@{}r@{}}#1\end{tabular}}
\newcommand{\multilineL}[1]{\begin{tabular}[b]{@{}l@{}}#1\end{tabular}}
\newcommand{\multilineC}[1]{\begin{tabular}[b]{@{}c@{}}#1\end{tabular}}
\newenvironment{large_enum}{
\Large
\begin{itemize}
\setlength{\itemsep}{7pt}
\setlength{\parskip}{0pt}
\setlength{\parsep}{0pt}
}{\end{itemize}}
\begin{comment}
setwd(paste0(basedir, "github/data-science/ds1/"))
tools::texi2dvi("ds1_web.tex", pdf=TRUE,clean=TRUE)
setwd(basedir)
\end{comment}
\frame
{
\titlepage
}
\frame{
\frametitle{Big Data}
\Large
\only<2>{\indent Lots of hype recently.\\\vspace{10mm}} \only<3>{\scalebox{0.7}{\includegraphics{img/bigdata.png}}} \only<4>{\indent But where's the cheese?\\\vspace{10mm}}
}
\frame{
\center
\Large Some examples
}
\frame{
\frametitle{Fishing Out Fishy Figures}
\only<1>{
\begin{figure}[p]
\centering \includegraphics[scale=0.5]{img/economist.png}
\end{figure}
\small ``The CPINu's August inflation figure of \alert{1.3\%} is less than half the \alert{2.65\%} of the CPI Congreso, a compilation of private estimates
gathered by opposition members of Congress.'' (\href{http://www.economist.com/blogs/americasview/2014/09/statistics-argentina}{Economist})}
\only<2>{
\begin{figure}[p]
\centering \includegraphics[scale=0.5]{img/ArgentinaPriceIndex.png}
\end{figure}
\small Source: \href{http://www.mit.edu/~afc/papers/Cavallo-Argentina-JME.pdf}{Online vs Official Price Indexes: Measuring Argentina's Inflation By Alberto Cavallo}}
}
\frame{
\frametitle{Suicide Prevention in the Army}
\Large
\only<2>{``In 2012, more soldiers committed suicide than died while fighting in Afghanistan: 349 suicides compared to 295 combat deaths.''}
\only<3>{\begin{figure}[p]
\centering
\includegraphics[scale=0.5]{img/ArmySuicides.png}\\
\end{figure}
}
\only<4>{``Research has repeatedly shown that doctors are not accurate in predicting who is at risk of suicide.''}
\only<5>{``The soldiers with the \alert{highest 5 percent of risk scores} committed over \alert{half of all suicides} in the period covered --- at an extraordinary rate of about 3,824 suicides per 100,000 person-years.''\\\vspace{5mm}
\small \href{http://fivethirtyeight.com/features/the-army-is-building-an-algorithm-to-prevent-suicide/}{538 Article}\\
\href{http://www.ncbi.nlm.nih.gov/pubmed/25390793}{STARRS paper}}
}
\frame{
\frametitle{Reducing Crime}
\Large
\only<1>{Minority Report}
\only<2>{Predictive, `CompStat', `HotSpot' Policing}
\only<3>{\href{http://www.predpol.com/}{PredPol}: Predictive Policing\\\normalsize
LAPD, Atlanta PD \\
Based on an earthquake prediction algorithm}
\only<4>{``During a four-month trial in Kent, \alert{8.5\%} of all street crime occurred within PredPol's pink boxes, with plenty more next door to them; predictions from police analysts scored only \alert{5\%}. An earlier trial in Los Angeles saw the machine score \alert{6\%} compared with human analysts' \alert{3\%}.''\\\vspace{5mm}
\small \href{http://www.economist.com/news/briefing/21582042-it-getting-easier-foresee-wrongdoing-and-spot-likely-wrongdoers-dont-even-think-about-it}{Economist}}
\only<5>{\includegraphics[scale=0.45]{img/PredictivePolicing.jpg}}
}
\frame{
\frametitle{Web Search}
\only<2>{\begin{figure}[p]
\centering\scalebox{0.4}{\includegraphics{img/yahoo.jpg}}
\end{figure}
}
\only<3-7>{
\begin{large_enum}
\item<3->[--]Human Curation, Ad-hoc automation
\item<4->[--]Google crawls over 20 billion URLs a day (Sullivan 2012).
\item<5->[--]Google answers 100 billion search queries a month (Sullivan 2012).
\item<6->[--] ``\ldots a typical search returns results in less than 0.2 seconds'' (Google)
\item<7->[--]PageRank
\end{large_enum}
}
}
\frame{
\frametitle{Side-effects of Drugs}
\Large
\only<2>{``Adverse drug events cause substantial morbidity and mortality and are often discovered after a drug comes to market.''}
\only<3>{FDA collects this information from ``physicians, pharmacists, patients, and drug companies'' but these reports ``are incomplete and biased''}
\only<4>{``\alert{paroxetine} and \alert{pravastatin}, whose interaction was \alert{reported to cause hyperglycemia after the time period of the online logs} used in the analysis''}
\only<5>{\begin{figure}[p]
\centering
\includegraphics[scale=0.60]{img/jama.png}\\
\end{figure}
\small \href{http://www.ncbi.nlm.nih.gov/pubmed/23467469}{Web-scale Pharmacovigilance: Listening to Signals from the Crowd.} By White et al.}
}
\frame{
\frametitle{Flu Season}
\Large
\only<1>{How many got the sniffles?}
\only<2>{How many got the sniffles in the past month?}
\only<3>{
\begin{figure}[p]
\centering
\includegraphics[scale=0.60]{img/GoogleFlu09Current.png}\\
\end{figure}
\small \href{http://www.google.org/flutrends/us/}{Google Flu Trends}
}
\only<4>{Google Flu is sick.}
\only<5>{\begin{figure}[p]
\centering
\includegraphics[scale=0.45]{img/GoogleFlu.png}\\
\end{figure}
\small \href{http://bit.ly/1KMwZ0Y}{The Parable of Google Flu: Traps in Big Data Analysis.} By Lazer et al.
}
}
\frame{
\frametitle{Spam or Ham}
\only<1>{
\begin{figure}[p]
\centering
\includegraphics[scale=0.75]{img/spam.jpg}\\
\end{figure}
}
\only<2-4>{
\begin{large_enum}
\item[-]<2->``According to Commtouch's Internet Threats Trend Report for the first quarter of 2013, an average of \alert{97.4 billion} spam e-mails and \alert{973 million} malware e-mails were sent worldwide each day in Q1 2013 (h/t Softpedia).''
\item[-]<3->Spam Filter
\item[-]<4->$\text{logit}\left[p(\text{spam})\right] = \alpha + f'\beta$, where $f$ is a vector of word frequencies (sketch on the next slide).
\end{large_enum}
}
}
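\begin{frame}[fragile]
\frametitle{Spam Filter: A Sketch}
A minimal sketch of the logit model on the last slide (not any production filter), assuming {\tt scikit-learn} is installed; the toy mails stand in for a real labeled corpus.
\footnotesize
\begin{verbatim}
# Sketch: logit[p(spam)] = alpha + f'beta, f = word frequencies.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

mails = ["win cash now", "meeting at noon",
         "cheap pills now", "lunch tomorrow"]
labels = [1, 0, 1, 0]                      # 1 = spam, 0 = ham

vec = CountVectorizer().fit(mails)         # builds f
X = vec.transform(mails)                   # frequency matrix
clf = LogisticRegression().fit(X, labels)  # fits alpha, beta
print(clf.predict_proba(vec.transform(["cash now"]))[:, 1])
\end{verbatim}
\end{frame}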
\frame{
\frametitle{Vote}
\only<1-2>{
\begin{figure}[!htb]
\centering
\includegraphics[scale=0.5]{img/61million.png}
\end{figure}
}
\Large
\only<2>{\alert{0.39\% direct effect}, and 0.01\% to 0.1\% indirect effect.}
}
\frame{
\frametitle{Vote for Obama}
\begin{large_enum}
\item[--]<1->{Obama 2012 Campaign}
\item[--]<2->{Highly customized messaging: Soccer Moms, NASCAR dads \ldots}
\item[--]<3->{\href{http://citoresearch.com/print/1813}{Used SQL database Vertica for `speed-of-thought' analyses.}}
\end{large_enum}
}
\frame{
\center
\Large What do we mean by big data?
}
\frame{
\frametitle{What do we mean by big data?}
\begin{large_enum}
\item[--]<1->Big in rows (size \textit{n})\\
Big in columns (dimensions \textit{p})
\item[--]<2->Hard to extract value from
\item[--]<3-> `Big data' is high \textbf{volume}, high \textbf{velocity} and high \textbf{variety} information assets that demand cost-effective, innovative forms of information processing[.]\\\normalsize\indent
Gartner, Inc.'s ``3Vs'' definition.
\end{large_enum}
}
\frame{
\frametitle{Sources of (Big) Data}
\begin{large_enum}
\item[--]<1->Data as the by-product of other activities\\
\begin{enumerate}
\item[--]<2-> Click trail, clicks before a purchase
\item[--]<3-> Moving your body (\href{http://www.fitbit.com/}{Fitbit})
\item[--]<4-> Moving yourself without moving your body (\href{http://www.progressive.com/auto/snapshot/?version=default}{Snapshot})
\item[--]<5-> Data were always being generated. They just weren't being captured.
\begin{enumerate}
\item[--]<6->Cheaper, smaller sensors help.
\item[--]<7->So does cheaper storage. (1950: $\sim$\$10,000/MB; 2015: $\lll$\$0.0001/MB)
\end{enumerate}
\end{enumerate}
\item[--]<8-> Data as the primary goal of activities\\\normalsize
Telescopes, Genetic sequencers, 61 million person experiments \ldots
\end{large_enum}
}
\frame{
\frametitle{How big are the data?}
\begin{large_enum}
\item[--]<2->Web\\\normalsize
20 billion webpages, each 20 KB = 400 TB \\ \pause
Say all stored on a single disk.\\ \pause
Read speed $\sim$ 50MB/sec.\\ \pause
92 days to read from disk to memory.
\item[--]<6->Astronomy \\\normalsize
Apache Point Telescope $\sim$ 200 GB/night.\\
Large Synoptic Survey Telescope: 3 billion pixel camera \\
$\sim$ 30TB/night. In 10 years $\sim$ 60 PB
\item[--]<7->Life Sciences\\\normalsize
High Throughput sequencer $\sim$ 1 TB/day
\item[--]<8->CIA \\\normalsize \pause \pause
REDACTED
\end{large_enum}
}
\frame{
\center
\Large Implications for Statistics, Computation.
}
\frame{
\frametitle{Implications for Statistics}
\begin{large_enum}
\item[--]<2->Little data, Big data\\ \pause \pause \normalsize
Sampling still matters
\item[--]<4->Everything is significant (The Starry Night)\\\normalsize \pause \pause
\begin{equation*}\text{False Discovery Proportion} = \frac{\text{\# of false positives}}{\text{\# of significant results}}\end{equation*}\\
Benjamini and Hochberg (1995) control the false discovery rate (FDR), weighing the cost of false discoveries; Bonferroni controls the stricter familywise error rate. (Sketch after this slide.)
\item[--]<6->Inverting a matrix\\\normalsize \pause \pause
(Stochastic) Gradient Descent (Ascent), BFGS, \ldots
\item[--]<8->Causal inference\\\normalsize \pause \pause
Large $p$ may help\\ \pause
Passive observation as things change arbitrarily may help
\end{large_enum}
}
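\begin{frame}[fragile]
\frametitle{Benjamini--Hochberg: A Sketch}
A sketch of the Benjamini--Hochberg (1995) step-up procedure; the p-values are made up purely for illustration.
\footnotesize
\begin{verbatim}
# Benjamini-Hochberg step-up: reject the k smallest p-values,
# where k is the largest rank with p_(rank) <= (rank / m) * q.
def benjamini_hochberg(pvals, q=0.05):
    m = len(pvals)
    order = sorted(range(m), key=lambda i: pvals[i])
    k = 0
    for rank, i in enumerate(order, start=1):
        if pvals[i] <= rank / m * q:
            k = rank
    return {order[r] for r in range(k)}   # rejected indices

print(benjamini_hochberg([0.001, 0.008, 0.039, 0.041, 0.9]))
# -> {0, 1}
\end{verbatim}
\end{frame}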
\frame{
\frametitle{Implications for Computation}
\begin{large_enum}
\item[--]<1->Conventional understanding of what is computationally tractable: a polynomial-time algorithm ($N^k$)
\item[--]<2->Now it is $N^k/m$, where $m$ is the number of computers.
\item[--]<3->For really big data: $N \log N$\\\normalsize
Traversing a binary tree; sorting and searching in $N \log N$\\
Streaming applications: one pass over the data (sketch on the next slide)
\end{large_enum}
}
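\begin{frame}[fragile]
\frametitle{Streaming: A Sketch}
A sketch of one-pass (streaming) computation: a running count and mean of one number per line, in $O(N)$ time and $O(1)$ memory. The file path is hypothetical.
\footnotesize
\begin{verbatim}
# One pass, constant memory: the file is never loaded whole.
def stream_mean(path):
    n, mean = 0, 0.0
    with open(path) as lines:     # reads lazily, line by line
        for line in lines:
            n += 1
            mean += (float(line) - mean) / n  # online update
    return n, mean

# stream_mean("numbers.txt")  # hypothetical file of numbers
\end{verbatim}
\end{frame}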
\frame{
\center
\Large MapReduce and PageRank
}
\frame{
\Large
20 billion webpages, each 20 KB = 400 TB \\
Say all data stored on a single disk. Read speed $\sim$ 50MB/sec.\\
92 days to read from disk to memory.
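\\ \vspace{5mm} \normalsize The arithmetic: $400\,\mathrm{TB} \div 50\,\mathrm{MB/s} = 8 \times 10^{6}\,\mathrm{s} \approx 92$ days.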
}
\frame{
\frametitle{Solution, Problem}
\begin{large_enum}
\item[--]<1->Parallelize: with 1,000 computers, just $\sim$2 hours
\item[--]<2->But \ldots
\begin{enumerate}
\item[--]<3->Nodes can fail.\\
Say single node fails once every 1000 days.\\
But 1000 failures per day if 1 million servers.
\item[--]<4->Network bandwidth $\sim$ 1 Gbps.\\
Moving 10 TB across the network takes $\sim$ 1 day.
\item[--]<5->Distributed programming can be very very hard.
\end{enumerate}
\end{large_enum}
}
\frame{
\frametitle{Solution: MapReduce}
\begin{large_enum}
\item[--]<1->Store data redundantly
\begin{enumerate}
\item[--]<2->Distributed File Systems, e.g. GFS, HDFS
\item[--]<3->Typical usage pattern: Data rarely updated. Read often. Updated through appends.
\item[--]<4->Implementation:\\
Data kept in chunks, machines called `chunk servers'\\
Chunks replicated. Typically 3x. One in a completely separate rack.\\
Master node (GFS)/Name Node (HDFS) tracks metadata
\end{enumerate}
\item[--]<5->Minimize data movement
\item[--]<6->Simple programming model
\end{large_enum}
}
\frame{
\frametitle{More About MapReduce}
\Large
\begin{large_enum}
\item[--]<1->Name comes from 2004 paper \href{http://static.googleusercontent.com/media/research.google.com/en/us/archive/mapreduce-osdi04.pdf}{MapReduce: Simplified Data Processing on Large Clusters} by Dean and Ghemawat.
\item[--]<2->Open-source implementation: Hadoop (originated at Yahoo, now an Apache project)
\end{large_enum}
}
\frame{
\frametitle{MapReduce Example}
\begin{large_enum}
\item[--]<1->Count occurrences of each distinct word in a huge collection of documents, e.g., crawled web pages
\begin{enumerate}
\item[--]<2->Map function produces (key, value) pairs\\
One (word, count) pair for every word in each document
\item[--]<3->Shuffle: send different words to different computers
\item[--]<4->Reduce: combine the counts for each word
\end{enumerate}
\end{large_enum}
}
\begin{frame}[fragile]
\begin{verbatim}
map(String input_key, String input_value):
  // input_key: document name
  // input_value: document contents
  for each word w in input_value:
    EmitIntermediate(w, "1");

reduce(String output_key, Iterator intermediate_values):
  // output_key: a word
  // output_values: a list of counts
  int result = 0;
  for each v in intermediate_values:
    result += ParseInt(v);
  Emit(AsString(result));
\end{verbatim}
\end{frame}
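\begin{frame}[fragile]
The same word count in Python, as a sketch of the programming model (not of Hadoop's actual API): map emits (word, 1) pairs, a shuffle groups them by key, and reduce sums each group.
\footnotesize
\begin{verbatim}
from collections import defaultdict

def map_(name, contents):          # emit (word, 1) pairs
    return [(w, 1) for w in contents.split()]

def reduce_(word, counts):         # sum the counts for one word
    return word, sum(counts)

docs = {"d1": "to be or not to be", "d2": "to do"}
groups = defaultdict(list)         # the shuffle: group by key
for name, text in docs.items():
    for word, c in map_(name, text):
        groups[word].append(c)
print(dict(reduce_(w, cs) for w, cs in groups.items()))
# -> {'to': 3, 'be': 2, 'or': 1, 'not': 1, 'do': 1}
\end{verbatim}
\end{frame}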
\frame{
\frametitle{PageRank}
\begin{large_enum}
\item[--]<1->\href{http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf}{The PageRank Citation Ranking: Bringing Order to the Web} By Page et al.
\item[--]<2->\href{http://www.cs.uvm.edu/~icdm/algorithms/10Algorithms-08.pdf}{Among the top 10 data mining algorithms}
\item[--]<3->Search
\begin{enumerate}
\item[-]<4->Searching for similarity
\item[-]<5->Works well when you look for a document on your computer,\\ or in any other small, trusted corpus.
\item[-]<6->Web: lots of matches.
\item[-]<7->Web: lots of false positives, some from malware-peddling sites.
\end{enumerate}
\end{large_enum}
}
\frame{
\frametitle{PageRank}
\begin{large_enum}
\item[--]<1->How to order conditional on similarity?
\begin{enumerate}
\item[-]<2->One could rank by popularity given complete web-traffic data.
\item[-]<3->But those data are hard to get.
\item[-]<4->Or one could combine ad hoc automation with human curation.
\item[-]<5->But that is costly to implement and doesn't scale.
\end{enumerate}
\item[--]<6->Innovation: see Internet as a graph\\ \normalsize
Nodes = webpages, Edges = hyperlinks \\
Ranks based on linking patterns alone.
\item[--]<7->Currency is in-links.
\end{large_enum}
}
\frame{
\frametitle{PageRank}
\Large
\begin{large_enum}
\item[--]<1->Each in-link is a vote.
\item[--]<2->But each vote is not equal.
\item[--]<3->In-links from more important pages count for more.
\item[--]<4->Value of each vote:
\begin{enumerate}
\item[-]<5->Proportional to importance of source page
\item[-]<6->Inversely proportional to number of outgoing links on the source page
\item[-]<7->If page $i$ has importance $r_i$ and $n$ outgoing links, each vote is worth $r_i/n$.
\end{enumerate}
\end{large_enum}
}
\frame{
\frametitle{PageRank Example}
\begin{figure}[p]
\centering
\includegraphics[scale=0.10]{img/PageRankExample.png}
\end{figure}
Source: Wikipedia
}
\frame{
\frametitle{PageRank}
\Large
\only<1->{Say page $i$ gets links from page $j$ (importance $r_j$, 5 outgoing links) and page $k$ (importance $r_k$, 2 outgoing links)}\\
\only<2->{\begin{align*} r_{i} = \frac{r_{j}}{5} + \frac{r_{k}}{2} \end{align*}}
\only<3->{Page $j$ in turn gets its importance from the pages linking to it, so the definition is recursive}
\only<4->{\begin{align*} r_i = \sum_{j \rightarrow i} \frac{r_j}{d_j}\end{align*} where the sum runs over all pages $j$ linking to $i$, and $d_j$ is the number of outgoing links on page $j$. (A power-iteration sketch follows.)}
}
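\begin{frame}[fragile]
\frametitle{PageRank: A Sketch}
A power-iteration sketch of the formula above on a made-up three-page graph; real implementations add a damping factor and handle dangling nodes.
\footnotesize
\begin{verbatim}
# r_i = sum over pages j linking to i of r_j / d_j,
# found by iterating from a uniform start. Toy graph.
links = {"a": ["b", "c"], "b": ["c"], "c": ["a"]}

ranks = {p: 1 / len(links) for p in links}
for _ in range(50):                  # iterate to a fixed point
    new = {p: 0.0 for p in links}
    for j, outs in links.items():
        for i in outs:               # j casts votes of r_j / d_j
            new[i] += ranks[j] / len(outs)
    ranks = new
print(ranks)  # -> roughly {'a': 0.4, 'b': 0.2, 'c': 0.4}
\end{verbatim}
\end{frame}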
\frame{
\center
\Large Class
}
\frame{
\frametitle{Data Science}
\begin{figure}[p]
\centering
\includegraphics[scale=0.50]{img/DataScience.png}\\
\end{figure}
\small \href{http://drewconway.com/zia/2013/3/26/the-data-science-venn-diagram}{Data Science Venn Diagram.} By Drew Conway
}
\frame{
\frametitle{Course Outline}
\begin{large_enum}
\item[--]<1->Get your own (big) data\\\normalsize
Scrape it, clean it\\
Basics of webscraping\\
Basics of regular expressions
\item[--]<2->Manage your (big) data\\\normalsize
Store it, organize it, use it\\
Relational Databases, SQL (SQLite)\\
Other databases
\item[--]<3->Analyze your (big) data\\\normalsize
Cross-validation\\
(Not) Maximally Likely\\
Numerical Optimization
\end{large_enum}
}
\frame{
\frametitle{Prerequisites}
\begin{large_enum}
\item[--]<1->Basic but important
\item[--]<2->Statistics:
\begin{enumerate}
\item[--]<1->Probability theory, some combinatorics
\item[--]<2->Linear Regression and other standard estimation techniques
\end{enumerate}
\item[--]<3->Computation:
\begin{enumerate}
\item[--]Have written a loop
\item[--]Have written a function
\end{enumerate}
\end{large_enum}
}
\frame{
\frametitle{Software and Programming}
\begin{large_enum}
\item[--]<1->Open Source\\\normalsize
License fees add up if you are running software on thousands of machines
\item[--]<2->R
\begin{enumerate}
\item[--]<3->{\tt RStudio}: an IDE for R that makes it easier to code.
\item[--]<4->{\tt R Markdown}: for documenting your work.
\end{enumerate}
\item[--]<6->Python \\\normalsize
\begin{enumerate}
\item[--]<7->Academic version of the {\tt Enthought Python Distribution}
\item[--]<8->IDEs: Eclipse (PyDev), PyCharm, Aptana Studio, etc.
\end{enumerate}
\item[--]<9->SQLite
\end{large_enum}
}
\end{document}