% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
%
\documentclass[
]{book}
\usepackage{lmodern}
\usepackage{amsmath}
\usepackage{ifxetex,ifluatex}
\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\usepackage{amssymb}
\else % if luatex or xetex
\usepackage{unicode-math}
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
\hypersetup{
pdftitle={Sports Data Analysis and Visualization},
pdfauthor={By Matt Waite},
hidelinks,
pdfcreator={LaTeX via pandoc}}
\urlstyle{same} % disable monospaced font for URLs
\usepackage{color}
\usepackage{fancyvrb}
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\usepackage{framed}
\definecolor{shadecolor}{RGB}{248,248,248}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\BuiltInTok}[1]{#1}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
\newcommand{\ExtensionTok}[1]{#1}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ImportTok}[1]{#1}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\NormalTok}[1]{#1}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\RegionMarkerTok}[1]{#1}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\usepackage{longtable,booktabs}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
\setlength{\emergencystretch}{3em} % prevent overfull lines
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{5}
\usepackage{booktabs}
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
\ifluatex
\usepackage{selnolig} % disable illegal ligatures
\fi
\usepackage[]{natbib}
\bibliographystyle{apalike}
\title{Sports Data Analysis and Visualization}
\usepackage{etoolbox}
\makeatletter
\providecommand{\subtitle}[1]{% add subtitle to \maketitle
\apptocmd{\@title}{\par {\large #1 \par}}{}{}
}
\makeatother
\subtitle{Code, data, visuals and the Tidyverse for journalists and other storytellers}
\author{By Matt Waite}
\date{July 29, 2019}
\begin{document}
\maketitle
{
\setcounter{tocdepth}{1}
\tableofcontents
}
\hypertarget{throwing-cold-water-on-hot-takes}{%
\chapter{Throwing cold water on hot takes}\label{throwing-cold-water-on-hot-takes}}
The 2018 season started out disastrously for the Nebraska Cornhuskers. The first game against a probably overmatched opponent? Called on account of an epic thunderstorm that plowed right over Memorial Stadium. The next game? Loss. The one following? Loss. The next four? All losses, after the fanbase was whipped into a hopeful frenzy by the hiring of Scott Frost, national title winning quarterback turned hot young coach come back home to save a mythical football program from the mediocrity it found itself mired in.
All that excitement lay in tatters.
On sports talk radio, on the sports pages and across social media and cafe conversations, one topic kept coming up again and again to explain why the team was struggling: Penalties. The team was just committing too many of them. In fact, six games and no wins into the season, they were dead last in the FBS in penalty yards.
Worse yet for this line of reasoning? Nebraska won game 7, against Minnesota, committing only six penalties for 43 yards, just about half their average over the season. Then they won game 8 against FCS patsy Bethune Cookman, committing only five penalties for 35 yards. That's a whopping 75 yards less than when they were losing. See? ``Cut the penalties, win games,'' screamed the radio show callers.
The problem? It's not true. Penalties might matter for a single drive. They may even throw a single game. But if you look at every top-level college football team since 2009, the number of penalty yards the team racks up means absolutely nothing to the total number of points they score. There's no relationship between them. Penalty yards have no discernible influence on points beyond just random noise.
Put this another way: If you were Scott Frost, and a major college football program was paying you \$5 million a year to make your team better, what should you focus on in practice? If you had growled at some press conference that you're going to work on penalties in practice until your team stops committing them, the results you'd get from all that wasted practice time would be impossible to separate from just random chance. You very well may reduce your penalty yards and still lose.
How do I know this? Simple statistics.
That's one of the three pillars of this book: Simple stats. The three pillars are:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Simple, easy to understand statistics \ldots{}
\item
\ldots{} extracted using simple code \ldots{}
\item
\ldots{} visualized simply to reveal new and interesting things in sports.
\end{enumerate}
Do you need to be a math whiz to read this book? No.~I'm not one either. What we're going to look at is pretty basic, but that's also why it's so powerful.
Do you need to be a computer science major to write code? Nope. I'm not one of those either. But anyone can think logically, and write simple code that is repeatable and replicable.
Do you need to be an artist to create compelling visuals? I think you see where this is going. No.~I can barely draw stick figures, but I've been paid to make graphics in my career. With a little graphic design know-how, you can create publication-worthy graphics with code.
\hypertarget{requirements-and-conventions}{%
\section{Requirements and Conventions}\label{requirements-and-conventions}}
This book is all in the R statistical language. To follow along, you'll do the following:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
Install the R language on your computer. Go to the \href{https://www.r-project.org/}{R Project website}, click download R and select a mirror closest to your location. Then download the version for your computer.
\item
Install \href{https://www.rstudio.com/products/rstudio/\#Desktop}{R Studio Desktop}. The free version is great.
\end{enumerate}
Going forward, you'll see passages like this:
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{install.packages}\NormalTok{(}\StringTok{"tidyverse"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
Don't do it now, but that is code that you'll need to run in your R Studio. When you see that, you'll know what to do.
\hypertarget{about-this-book}{%
\section{About this book}\label{about-this-book}}
This book is the collection of class materials for the author's Sports Data Analysis and Visualization class at the University of Nebraska-Lincoln's College of Journalism and Mass Communications. There are some things you should know about it:
\begin{itemize}
\tightlist
\item
It is free for students.
\item
The topics will remain the same but the text is going to be constantly tinkered with.
\item
Anything that is the work of the author is copyright Matt Waite, 2019.
\item
The text is \href{https://creativecommons.org/licenses/by-nc-sa/4.0/}{Attribution-NonCommercial-ShareAlike 4.0 International} Creative Commons licensed. That means you can share it and change it, but only if you share your changes with the same license and it cannot be used for commercial purposes. I'm not making money on this so you can't either.\\
\item
As such, the whole book -- authored in Bookdown -- is \href{https://github.com/mattwaite/sportsdatabook}{open sourced on Github}. Pull requests welcomed!
\end{itemize}
\hypertarget{the-very-basics}{%
\chapter{The very basics}\label{the-very-basics}}
R is a programming language, one specifically geared toward statistical analysis. Like all programming languages, it has certain built-in functions and you can interact with it in multiple ways. The first, and most basic, is the console.
\includegraphics[width=18.97in]{images/verybasics1}
Think of the console like talking directly to R. It's direct, but it has some drawbacks and some quirks we'll get into later. For now, try typing this into the console and hit enter:
\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{2}\SpecialCharTok{+}\DecValTok{2}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] 4
\end{verbatim}
Congrats, you've run some code. It's not very complex, and you knew the answer beforehand, but you get the idea. We can compute things. We can also store things. \textbf{In programming languages, these are called variables}. We can assign things to variables using \texttt{\textless{}-}. And then we can do things with them. \textbf{The \texttt{\textless{}-} is called an assignment operator}.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{number }\OtherTok{\textless{}{-}} \DecValTok{2}
\NormalTok{number }\SpecialCharTok{*}\NormalTok{ number}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] 4
\end{verbatim}
Now assign a different number to the variable \texttt{number}. Try running \texttt{number\ *\ number} again. Get what you expected?
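For example -- 3 here is just an arbitrary pick, any number works -- the console exchange might look like this:
\begin{verbatim}
number <- 3
number * number
## [1] 9
\end{verbatim}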
We can have as many variables as we can name. \textbf{We can even reuse them (but be careful that you know you're doing that, or you'll introduce errors)}. Try this in your console.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{firstnumber }\OtherTok{\textless{}{-}} \DecValTok{1}
\NormalTok{secondnumber }\OtherTok{\textless{}{-}}\DecValTok{2}
\NormalTok{(firstnumber }\SpecialCharTok{+}\NormalTok{ secondnumber) }\SpecialCharTok{*}\NormalTok{ secondnumber}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] 6
\end{verbatim}
\textbf{We can store anything in a variable}. A whole table. An array of numbers. A single word. A whole book. All the books of the 18th century. They're really powerful. We'll explore them at length.
\hypertarget{adding-libraries-part-1}{%
\section{Adding libraries, part 1}\label{adding-libraries-part-1}}
The real strength of any given programming language is the external libraries that power it. The base language can do a lot, but it's the external libraries that solve many specific problems -- even making the base language easier to use.
For this class, we're going to need several external libraries.
The first library we're going to use is called Swirl. So in the console, type \texttt{install.packages(\textquotesingle{}swirl\textquotesingle{})} and hit enter. That installs swirl.
Now, to use the library, type \texttt{library(swirl)} and hit enter. That loads swirl. Then type \texttt{swirl()} and hit enter. Now you're running swirl. Follow the directions on the screen. When you are asked, you want to install course 1 R Programming: The basics of programming in R. Then, when asked, you want to do option 1, R Programming, in that course.
When you are finished with the course -- it will take just a few minutes -- it will first ask you if you want credit on Coursera. You do not. Then type 0 to exit (it will not be very clear that's what you do when you are done).
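Put together, the console steps from this section look like this, run one line at a time:
\begin{verbatim}
install.packages('swirl')   # installs swirl
library(swirl)              # loads swirl
swirl()                     # starts swirl -- then follow the on-screen directions
\end{verbatim}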
\hypertarget{adding-libraries-part-2}{%
\section{Adding libraries, part 2}\label{adding-libraries-part-2}}
We'll mostly use two libraries for analysis -- \texttt{dplyr} and \texttt{ggplot2}. To get them, and several other useful libraries, we can install a single collection of libraries called the tidyverse. Type this into your console: \texttt{install.packages(\textquotesingle{}tidyverse\textquotesingle{})}
\textbf{NOTE}: This is a pattern. You should always install libraries in the console.
Then, to help us with learning and replication, we're going to use R Notebooks. So we need to install that library. Type this into your console: \texttt{install.packages(\textquotesingle{}rmarkdown\textquotesingle{})}
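So the two console commands for this section, side by side:
\begin{verbatim}
install.packages('tidyverse')   # dplyr, ggplot2, readr and friends
install.packages('rmarkdown')   # lets us work in R Notebooks
\end{verbatim}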
\hypertarget{notebooks}{%
\section{Notebooks}\label{notebooks}}
For the rest of the class, we're going to be working in notebooks. In notebooks, you will both run your code and explain each step, much as I am doing here.
To start a notebook, you click on the green plus in the top left corner and go down to R Notebook. Do that now.
\includegraphics[width=11.08in]{images/verybasics2}
You will see that the notebook adds a lot of text for you. It tells you how to work in notebooks -- and you should read it. The most important parts are these:
To add text, simply type. To add code, you can click on the \emph{Insert} button on the toolbar or press \emph{Cmd+Option+I} on Mac or \emph{Ctrl+Alt+I} on Windows.
Highlight all that text and delete it. You should have a blank document. This document is called an R Markdown file -- it's a special form of text, one that you can style and one that you can run R code in the middle of. Markdown is a simple markup format that you can use to create documents. So first things first, let's give our notebook a big headline. Add this:
\texttt{\#\ My\ awesome\ notebook}
Now, under that, without any markup, just type This is my awesome notebook.
Under that, you can make text bold by writing \texttt{It\ is\ **really**\ awesome}.
If you want it italics, just do this on the next line: \texttt{No,\ it\textquotesingle{}s\ \_really\_\ awesome.\ I\ swear.}
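Put together, the markdown in your notebook so far looks something like this:
\begin{verbatim}
# My awesome notebook

This is my awesome notebook.

It is **really** awesome.

No, it's _really_ awesome. I swear.
\end{verbatim}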
To see what it looks like without the markup, click the Preview or Knit button in the toolbar. That will turn your notebook into a webpage, with the formatting included.
Throughout this book, we're going to use this markdown to explain what we are doing and, more importantly, why we are doing it. Explaining your thinking is a vital part of understanding what you are doing.
That explanation, plus the code, is the real power of notebooks. To add a block of code, follow the instructions from above: click on the \emph{Insert} button on the toolbar or press \emph{Cmd+Option+I} on Mac or \emph{Ctrl+Alt+I} on Windows.
In that window, use some of the code from above and add two numbers together. To see it run, click the green triangle on the right. That runs the chunk. You should see the answer to your addition problem.
And that, just that, is the foundation you need to start this book.
\hypertarget{data-structures-and-types}{%
\chapter{Data, structures and types}\label{data-structures-and-types}}
Data are everywhere (and data is plural of datum, thus the use of are in that statement). It surrounds you. Every time you use your phone, you are creating data. Lots of it. Your online life. Any time you buy something. It's everywhere. Sports, like life, is no different. Sports is drowning in data, and more comes along all the time.
In sports, and in this class, we'll be dealing largely with two kinds of data: event level data and summary data. It's not hard to envision event level data in sports. A pitch in baseball. A hit. A play in football. A pass in soccer. They are the events that make up the game. Combine them together -- summarize them -- and you'll have some notion of how the game went. What we usually see is summary data -- who wants to scroll through 50 pitches to find out a player went 2-3 with a double and an RBI? Who wants to scroll through hundreds of pitches to figure out the Rays beat the Yankees?
To start with, we need to understand the shape of data.
\begin{quote}
EXERCISE: Try scoring a child's board game. For example, Chutes and Ladders. If you were placed in charge of analytics for the World Series of Chutes and Ladders, what is your event level data? What summary data do you keep? If you've got the game, try it.
\end{quote}
\hypertarget{rows-and-columns}{%
\section{Rows and columns}\label{rows-and-columns}}
Data, oversimplifying it a bit, is information organized. Generally speaking, it's organized into rows and columns. Rows, generally, are individual elements. A team. A player. A game. Columns, generally, are components of the data, sometimes called variables. So if each row is a player, the first column might be their name. The second is their position. The third is their batting average. And so on.
\includegraphics[width=22in]{images/data1}
One of the critical components of data analysis, especially for beginners, is having a mental picture of your data. What does each row mean? What does each column in each row signify? How many rows do you have? How many columns?
\hypertarget{types}{%
\section{Types}\label{types}}
There are scores of data types in the world, and R has them. In this class, we're primarily going to be dealing with data frames, and each element of our data frames will have a data type.
Typically, they'll be one of four types of data:
\begin{itemize}
\tightlist
\item
Numeric: a number, like the number of touchdown passes in a season or a batting average.
\item
Character: Text, like a name, a team, a conference.
\item
Date: Fully formed dates -- 2019-01-01 -- have a special date type. Elements of a date, like a year (ex. 2019) are not technically dates, so they'll appear as numeric data types.
\item
Logical: Rare, but every now and then we'll have a data type that's Yes or No, True or False, etc.
\end{itemize}
\textbf{Question:} Is a zip code a number? Is a jersey number a number? Trick question, because the answer is no. Numbers are things we do math on. If the thing you want is not something you're going to do math on -- can you add two phone numbers together? -- then make it a character type. If you don't, most every software system on the planet will drop leading zeros. For example, every zip code in Boston starts with 0. If you record that as a number, your zip code will become a four digit number, which isn't a zip code anymore.
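Here's a quick console illustration, using one Boston zip code, of what goes wrong if you let R treat it as a number:
\begin{verbatim}
as.numeric("02134")
## [1] 2134
zip <- "02134"   # stored as a character type, the leading zero survives
\end{verbatim}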
\hypertarget{a-simple-way-to-get-data}{%
\section{A simple way to get data}\label{a-simple-way-to-get-data}}
One good thing about sports is that there's lots of interest in it. And that means there are outlets that put sports data on the internet. Now I'm going to show you a trick for getting it easily.
The site sports-reference.com takes NCAA (and other league) stats and puts them online. For instance, \href{https://www.sports-reference.com/cbb/schools/nebraska/2019-gamelogs.html}{here's their page on Nebraska basketball's game logs}, which you should open now.
Now, in a new tab, log into Google Docs/Drive and open a new spreadsheet. In the first cell of the first row, copy and paste this formula in:
\begin{verbatim}
=IMPORTHTML("https://www.sports-reference.com/cbb/schools/nebraska/2019-gamelogs.html", "table", 1)
\end{verbatim}
If it worked right, you've got the data from that page in a spreadsheet.
\hypertarget{cleaning-the-data}{%
\section{Cleaning the data}\label{cleaning-the-data}}
The first thing we need to do is recognize that we don't have data, really. We have the results of a formula. You can tell by putting your cursor on that field, where you'll see the formula again. This is where you'd look:
\includegraphics[width=33.28in]{images/clean1}
The solution is easy:
Edit \textgreater{} Select All or type command/control A
Edit \textgreater{} Copy or type command/control c
Edit \textgreater{} Paste Special \textgreater{} Values Only or type command/control shift v
You can verify that it worked by looking in that same row 1 column A, where you'll see the formula is gone.
\includegraphics[width=36.81in]{images/clean2}
Now you have data, but your headers are all wrong. You want your headers to be one line -- not two, like the site has. And the header names repeat -- first for our team, then for theirs. So you have to rename each repeated header so the two are distinct -- for example, UsORB (or TeamORB) and OpponentORB instead of two columns both named ORB.
After you've done that, note that we have repeating header rows scattered through the data. There are two ways to deal with that -- you could just highlight them and go up to Edit \textgreater{} Delete Rows XX-XX, depending on what rows you highlighted. That's the easy way with our data.
But what if you had hundreds of repeating headers like that? Deleting them would take a long time.
You can use sorting to get rid of anything that's not data. So click on Data \textgreater{} Sort Range. You'll want to check the ``Data has header row'' field. Then hit Sort.
\includegraphics[width=21.61in]{images/clean3}
Now all you need to do is search through the data for where your junk data -- extra headers, blanks, etc. -- got sorted and delete it. After you've done that, you can export it for use in R. Go to File \textgreater{} Download as \textgreater{} Comma Separated Values. Remember to put it in the same directory as your R Notebook file so you can import the data easily.
\hypertarget{aggregates}{%
\chapter{Aggregates}\label{aggregates}}
R is a statistical programming language that is purpose built for data analysis.
Base R does a lot, but there are a mountain of external libraries that do things to make R better/easier/more fully featured. We already installed the tidyverse -- or you should have if you followed the instructions for the last assignment -- which isn't exactly a library, but a collection of libraries. Together, they make up the tidyverse. Individually, they are extraordinarily useful for what they do. We can load them all at once using the tidyverse name, or we can load them individually. Let's start with individually.
The two libraries we are going to need for this assignment are \texttt{readr} and \texttt{dplyr}. The library \texttt{readr} reads different types of data in as a dataframe. For this assignment, we're going to read in csv data or Comma Separated Values data. That's data that has a comma between each column of data.
Then we're going to use \texttt{dplyr} to analyze it.
To use a library, you need to import it. Good practice -- one I'm going to insist on -- is that you put all your library steps at the top of your notebooks.
That code looks like this:
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(readr)}
\end{Highlighting}
\end{Shaded}
To load them both, you need to run that code twice:
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(readr)}
\FunctionTok{library}\NormalTok{(dplyr)}
\end{Highlighting}
\end{Shaded}
You can keep doing that for as many libraries as you need. I've seen notebooks with 10 or more library imports.
But the tidyverse has a neat little trick. We can load most of the libraries we'll need for the whole semester with one line:
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(tidyverse)}
\end{Highlighting}
\end{Shaded}
\textbf{From now on, if that's not the first line of your notebook, you're probably doing it wrong.}
\hypertarget{basic-data-analysis-group-by-and-count}{%
\section{Basic data analysis: Group By and Count}\label{basic-data-analysis-group-by-and-count}}
The first thing we need to do is get some data to work with. We do that by reading it in. In our case, we're going to read data from a csv file -- a comma-separated values file.
The CSV file we're going to read from is a \href{https://www.basketball-reference.com/leagues/NBA_2020_advanced.html}{Basketball Reference} page of advanced metrics for NBA players this season. The Sports Reference sites are a godsend of data, a trove of stuff, and we're going to use it a lot in this class.
So step 2, after setting up our libraries, is most often going to be importing data. In order to analyze data, we need data, so it stands to reason that this would be something we'd do very early.
The code looks \emph{something} like this, but hold off copying it just yet:
\texttt{nbaplayers\ \textless{}-\ read\_csv("\textasciitilde{}/Box/SportsData/nbaadvancedplayers1920.csv")}
Let's unpack that.
The first part -- nbaplayers -- is the name of your variable. A variable is just a name of a thing that stores stuff. In this case, our variable is a data frame, which is R's way of storing data (technically it's a tibble, which is the tidyverse way of storing data, but the differences aren't important and people use them interchangeably). \textbf{We can call this whatever we want.} I always want to name data frames after what is in them. In this case, we're going to import a dataset of NBA players. Variable names, by convention, are one word, all lower case. You can end a variable with a number, but you can't start one with a number.
The \textless- bit is the variable assignment operator. It's how we know we're assigning something to a word. Think of the arrow as saying ``Take everything on the right of this arrow and stuff it into the thing on the left.'' So we're creating an empty vessel called \texttt{nbaplayers} and stuffing all this data into it.
The \texttt{read\_csv} bits are pretty obvious, except for one thing. What happens in the quote marks is the path to the data. In there, I have to tell R where it will find the data. The easiest thing to do, if you are confused about how to find your data, is to put your data in the same folder as your notebook (you'll have to save that notebook first). If you do that, then you just need to put the name of the file in there (nbaadvancedplayers1920.csv). In my case, I've got a folder called Box in my home directory (that's the \texttt{\textasciitilde{}} part), and in there is a folder called SportsData that has the file called nbaadvancedplayers1920.csv in it. Some people -- insane people -- leave the data in their downloads folder. The data path then would be \texttt{\textasciitilde{}/Downloads/nameofthedatafilehere.csv} on PC or Mac.
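For instance, depending on where you saved the file, that line might look like either of these -- the paths here are just illustrations, not the ones you should copy:
\begin{verbatim}
nbaplayers <- read_csv("nbaadvancedplayers1920.csv")             # file sits next to the notebook
nbaplayers <- read_csv("~/Downloads/nbaadvancedplayers1920.csv") # file left in the Downloads folder
\end{verbatim}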
\textbf{What you put in there will be different from mine}. So your first task is to import the data.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\OtherTok{\textless{}{-}} \FunctionTok{read\_csv}\NormalTok{(}\StringTok{"data/nbaadvancedplayers1920.csv"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Player = col_character(),
## Pos = col_character(),
## Tm = col_character()
## )
## i Use `spec()` for the full column specifications.
\end{verbatim}
Now we can inspect the data we imported. What does it look like? To do that, we use \texttt{head(nbaplayers)} to show the headers and \textbf{the first six rows of data}. If we wanted to see them all, we could simply enter \texttt{nbaplayers} and run it.
To get the number of records in our dataset, we run \texttt{nrow(nbaplayers)}
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{head}\NormalTok{(nbaplayers)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 6 x 27
## Rk Player Pos Age Tm G MP PER `TS%` `3PAr` FTr `ORB%`
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Steve~ C 26 OKC 63 1680 20.5 0.604 0.006 0.421 14
## 2 2 Bam A~ PF 22 MIA 72 2417 20.3 0.598 0.018 0.484 8.5
## 3 3 LaMar~ C 34 SAS 53 1754 19.7 0.571 0.198 0.241 6.3
## 4 4 Kyle ~ PF 23 MIA 2 13 4.7 0.5 0 0 17.9
## 5 5 Nicke~ SG 21 NOP 47 591 8.9 0.473 0.5 0.139 1.6
## 6 6 Grays~ SG 24 MEM 38 718 12 0.609 0.562 0.179 1.2
## # ... with 15 more variables: `DRB%` <dbl>, `TRB%` <dbl>, `AST%` <dbl>,
## # `STL%` <dbl>, `BLK%` <dbl>, `TOV%` <dbl>, `USG%` <dbl>, OWS <dbl>,
## # DWS <dbl>, WS <dbl>, `WS/48` <dbl>, OBPM <dbl>, DBPM <dbl>, BPM <dbl>,
## # VORP <dbl>
\end{verbatim}
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{nrow}\NormalTok{(nbaplayers)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] 651
\end{verbatim}
Another way to look at nrow -- we have 651 players from this season in our dataset.
What if we wanted to know how many players there were by position? To do that by hand, we'd have to take each of the 651 records and sort them into a pile. We'd put them in groups and then count them.
\texttt{dplyr} has a \textbf{group by} function in it that does just this. A massive amount of data analysis involves grouping like things together at some point. So it's a good place to start.
So to do this, we'll take our dataset and we'll introduce a new operator: \%\textgreater\%. The best way to read that operator, in my opinion, is to interpret that as ``and then do this.''
After we group them together, we need to count them. We do that first by saying we want to summarize our data (a count is a part of a summary). To get a summary, we have to tell it what we want. So in this case, we want a count. To get that, let's create a thing called \texttt{total} and set it equal to \texttt{n()}, which is \texttt{dplyr}'s way of counting something.
Here's the code:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{group\_by}\NormalTok{(Pos) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{total =} \FunctionTok{n}\NormalTok{()}
\NormalTok{ )}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}
\begin{verbatim}
## # A tibble: 9 x 2
## Pos total
## <chr> <int>
## 1 C 111
## 2 C-PF 2
## 3 PF 135
## 4 PF-C 2
## 5 PG 111
## 6 SF 113
## 7 SF-PF 4
## 8 SF-SG 3
## 9 SG 170
\end{verbatim}
So let's walk through that. We start with our dataset -- \texttt{nbaplayers} -- and then we tell it to group the data by a given field. To find the field names, you can look at the output of \texttt{head}, or you can look in the Environment tab, where you'll see \texttt{nbaplayers}.
In this case, we wanted to group together positions, signified by the field name Pos. After we group the data, we need to count them up. In dplyr, we use \texttt{summarize} \href{http://dplyr.tidyverse.org/reference/summarise.html}{which can do more than just count things}. Inside the parentheses in summarize, we set up the summaries we want. In this case, we just want a count of the positions: \texttt{total\ =\ n(),} says create a new field called \texttt{total} and set it equal to \texttt{n()}, which might look weird, but it's common in stats. The number of things in a dataset? Statisticians call it n.~There are n players in this dataset. So \texttt{n()} is a function that counts the number of things there are.
And when we run that, we get a list of positions with a count next to them. But it's not in any order. So we'll add another And Then Do This \%\textgreater\% and use \texttt{arrange}. Arrange does what you think it does -- it arranges data in order. By default, it's in ascending order -- smallest to largest. But if we want to know the position with the most players, we need to sort it in descending order. That looks like this:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{group\_by}\NormalTok{(Pos) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{total =} \FunctionTok{n}\NormalTok{()}
\NormalTok{ ) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(total))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}
\begin{verbatim}
## # A tibble: 9 x 2
## Pos total
## <chr> <int>
## 1 SG 170
## 2 PF 135
## 3 SF 113
## 4 C 111
## 5 PG 111
## 6 SF-PF 4
## 7 SF-SG 3
## 8 C-PF 2
## 9 PF-C 2
\end{verbatim}
So the most common position in the NBA? Shooting guard, followed by power forward.
We can, if we want, group by more than one thing. Which team has the most of a single position? To do that, we can group by the team -- called Tm in the data -- and position, or Pos in the data:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{group\_by}\NormalTok{(Tm, Pos) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{total =} \FunctionTok{n}\NormalTok{()}
\NormalTok{ ) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(total))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## `summarise()` regrouping output by 'Tm' (override with `.groups` argument)
\end{verbatim}
\begin{verbatim}
## # A tibble: 159 x 3
## # Groups: Tm [31]
## Tm Pos total
## <chr> <chr> <int>
## 1 TOT PF 13
## 2 TOT SG 13
## 3 SAC PF 9
## 4 TOT SF 9
## 5 BRK SG 8
## 6 LAL SG 8
## 7 TOT PG 8
## 8 ATL SG 7
## 9 BRK SF 7
## 10 DAL SG 7
## # ... with 149 more rows
\end{verbatim}
So wait, what team is TOT?
Valuable lesson: whoever collects the data has opinions on how to solve problems. In this case, Basketball Reference, when a player gets traded, records stats for the player's first team, their second team, and a combined season total for a team called TOT, meaning Total. Is there a team abbreviated TOT? No.~So ignore those rows here.
Sacramento has 9 power forwards. Brooklyn has 8 shooting guards, as do the Lakers. You can learn a bit about how a team is assembled by looking at these simple counts.
\hypertarget{other-aggregates-mean-and-median}{%
\section{Other aggregates: Mean and median}\label{other-aggregates-mean-and-median}}
In the last example, we grouped some data together and counted it up, but there's so much more you can do. You can do multiple measures in a single step as well.
Sticking with our NBA player data, we can calculate any number of measures inside summarize. Here, we'll use R's built-in mean and median functions to calculate \ldots{} well, you get the idea.
Let's look just at the number of minutes each position gets.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{group\_by}\NormalTok{(Pos) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{count =} \FunctionTok{n}\NormalTok{(),}
\AttributeTok{mean\_minutes =} \FunctionTok{mean}\NormalTok{(MP),}
\AttributeTok{median\_minutes =} \FunctionTok{median}\NormalTok{(MP)}
\NormalTok{ )}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}
\begin{verbatim}
## # A tibble: 9 x 4
## Pos count mean_minutes median_minutes
## <chr> <int> <dbl> <dbl>
## 1 C 111 891. 887
## 2 C-PF 2 316. 316.
## 3 PF 135 790. 567
## 4 PF-C 2 1548. 1548.
## 5 PG 111 944. 850
## 6 SF 113 877. 754
## 7 SF-PF 4 638. 286.
## 8 SF-SG 3 1211 1688
## 9 SG 170 843. 654.
\end{verbatim}
So there's 651 players in the data. Let's look at shooting guards. The average shooting guard plays 842 minutes and the median is 653.5 minutes.
Why?
Let's let sort help us.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(MP))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 651 x 27
## Rk Player Pos Age Tm G MP PER `TS%` `3PAr` FTr `ORB%`
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 323 CJ Mc~ SG 28 POR 70 2556 17 0.541 0.378 0.136 1.9
## 2 55 Devin~ SG 23 PHO 70 2512 20.6 0.618 0.31 0.397 1.3
## 3 198 James~ SG 30 HOU 68 2483 29.1 0.626 0.557 0.528 2.9
## 4 27 Harri~ PF 27 SAC 72 2482 13.3 0.574 0.338 0.337 3.4
## 5 297 Damia~ PG 29 POR 66 2474 26.9 0.627 0.5 0.384 1.4
## 6 204 Tobia~ PF 27 PHI 72 2469 17.2 0.556 0.304 0.184 3.1
## 7 479 P.J. ~ PF 34 HOU 72 2467 8.3 0.559 0.702 0.113 4.7
## 8 175 Shai ~ SG 21 OKC 70 2428 17.7 0.568 0.247 0.352 2.2
## 9 2 Bam A~ PF 22 MIA 72 2417 20.3 0.598 0.018 0.484 8.5
## 10 343 Donov~ SG 23 UTA 69 2364 18.8 0.558 0.352 0.24 2.6
## # ... with 641 more rows, and 15 more variables: `DRB%` <dbl>, `TRB%` <dbl>,
## # `AST%` <dbl>, `STL%` <dbl>, `BLK%` <dbl>, `TOV%` <dbl>, `USG%` <dbl>,
## # OWS <dbl>, DWS <dbl>, WS <dbl>, `WS/48` <dbl>, OBPM <dbl>, DBPM <dbl>,
## # BPM <dbl>, VORP <dbl>
\end{verbatim}
The player with the most minutes on the floor is a shooting guard. Shooting guard is the most common position, so that means there's CJ McCollum rolling up 2,556 minutes in a season, and then there's Cleveland Cavaliers' sensation J.P. Macura. Never heard of J.P. Macura? Might be because he logged one minute in one game this season.
That's a huge difference.
So when choosing a measure of the middle, you have to ask yourself -- could I have extremes? Because a median won't be sensitive to extremes. It will be the point at which half the numbers are above and half are below. The average or mean will be a measure of the middle, but if you have a bunch of pine riders and then one ironman superstar, the average will be wildly skewed.
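A tiny, made-up set of minutes shows the difference:
\begin{verbatim}
minutes <- c(1, 12, 25, 30, 2556)   # four bench players and one ironman

mean(minutes)
## [1] 524.8

median(minutes)
## [1] 25
\end{verbatim}
One extreme season drags the mean past 500, while the median stays down with the bench players.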
\hypertarget{even-more-aggregates}{%
\section{Even more aggregates}\label{even-more-aggregates}}
There's a ton of things we can do in summarize -- we'll work with more of them as the course progresses -- but here's a few other questions you can ask.
Which position in the NBA plays the most minutes? And what is the highest and lowest minute total for that position? And how wide is the spread between minutes? We can find that with \texttt{sum} to add up the minutes to get the total minutes, \texttt{min} to find the minimum minutes, \texttt{max} to find the maximum minutes and \texttt{sd} to find the standard deviation in the numbers.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{group\_by}\NormalTok{(Pos) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{total =} \FunctionTok{sum}\NormalTok{(MP), }
\AttributeTok{avgminutes =} \FunctionTok{mean}\NormalTok{(MP), }
\AttributeTok{minminutes =} \FunctionTok{min}\NormalTok{(MP),}
\AttributeTok{maxminutes =} \FunctionTok{max}\NormalTok{(MP),}
\AttributeTok{stdev =} \FunctionTok{sd}\NormalTok{(MP)) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(total))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}
\begin{verbatim}
## # A tibble: 9 x 6
## Pos total avgminutes minminutes maxminutes stdev
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SG 143229 843. 1 2556 735.
## 2 PF 106654 790. 5 2482 719.
## 3 PG 104745 944. 8 2474 727.
## 4 SF 99109 877. 11 2316 709.
## 5 C 98914 891. 3 2336 619.
## 6 SF-SG 3633 1211 87 1858 977.
## 7 PF-C 3097 1548. 960 2137 832.
## 8 SF-PF 2553 638. 46 1936 873.
## 9 C-PF 633 316. 256 377 85.6
\end{verbatim}
So again, no surprise, shooting guards spend the most minutes on the floor in the NBA. They average 842 minutes, but we noted why that's trouble. The minimum is the J.P. Macura Award, the max is the Trail Blazers failing at load management, and the standard deviation is a measure of how spread out the data is. In this case, not the highest spread among positions, but pretty high. So you know you've got some huge-minutes players and a bunch of bench players.
\hypertarget{mutating-data}{%
\chapter{Mutating data}\label{mutating-data}}
One of the most common data analysis techniques is to look at change over time. The most common way of comparing change over time is through percent change. The math behind calculating percent change is very simple, and you should know it off the top of your head. The easy way to remember it is:
\texttt{(new\ -\ old)\ /\ old}
Or new minus old divided by old. Your new number minus the old number, the result of which is divided by the old number. To do that in R, we can use \texttt{dplyr} and \texttt{mutate} to calculate new metrics in a new field using existing fields of data.
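As a quick worked example with made-up numbers: say a school drew 500,000 fans last season and 550,000 this season.
\begin{verbatim}
(550000 - 500000) / 500000   # (new - old) / old
## [1] 0.1
\end{verbatim}
That's 0.1 as a decimal -- 10 percent growth once you multiply by 100, a step we'll get to with real data below.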
So first we'll import the tidyverse so we can read in our data and begin to work with it.
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(tidyverse)}
\end{Highlighting}
\end{Shaded}
Now you'll need a common and simple dataset of total attendance at NCAA football games over the last few seasons.
You'll import it something like this.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{attendance }\OtherTok{\textless{}{-}} \FunctionTok{read\_csv}\NormalTok{(}\StringTok{\textquotesingle{}data/attendance.csv\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
##
## -- Column specification --------------------------------------------------------
## cols(
## Institution = col_character(),
## Conference = col_character(),
## `2013` = col_double(),
## `2014` = col_double(),
## `2015` = col_double(),
## `2016` = col_double(),
## `2017` = col_double(),
## `2018` = col_double()
## )
\end{verbatim}
If you want to see the first six rows -- handy to take a peek at your data -- you can use the function \texttt{head}.
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{head}\NormalTok{(attendance)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 6 x 8
## Institution Conference `2013` `2014` `2015` `2016` `2017` `2018`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Air Force MWC 228562 168967 156158 177519 174924 166205
## 2 Akron MAC 107101 55019 108588 62021 117416 92575
## 3 Alabama SEC 710538 710736 707786 712747 712053 710931
## 4 Appalachian St. FBS Independent 149366 NA NA NA NA NA
## 5 Appalachian St. Sun Belt NA 138995 128755 156916 154722 131716
## 6 Arizona Pac-12 285713 354973 308355 338017 255791 318051
\end{verbatim}
The code to calculate percent change is pretty simple. Remember, with \texttt{summarize}, we used \texttt{n()} to count things. With \texttt{mutate}, we use very similar syntax to calculate a new value using other values in our dataset. So in this case, we're trying to do (new-old)/old, but we're doing it with fields. If we look at what we got when we did \texttt{head}, you'll see there's \texttt{2018} as the new data, and we'll use \texttt{2017} as the old data. So we're looking at one year. Then, to help us, we'll use \texttt{arrange} again to sort it, so we get the fastest-growing school over one year.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{attendance }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{mutate}\NormalTok{(}
\AttributeTok{change =}\NormalTok{ (}\StringTok{\textasciigrave{}}\AttributeTok{2018}\StringTok{\textasciigrave{}} \SpecialCharTok{{-}} \StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}\NormalTok{)}\SpecialCharTok{/}\StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}
\NormalTok{) }
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 150 x 9
## Institution Conference `2013` `2014` `2015` `2016` `2017` `2018` change
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Air Force MWC 228562 168967 156158 177519 174924 166205 -0.0498
## 2 Akron MAC 107101 55019 108588 62021 117416 92575 -0.212
## 3 Alabama SEC 710538 710736 707786 712747 712053 710931 -0.00158
## 4 Appalachian ~ FBS Indepen~ 149366 NA NA NA NA NA NA
## 5 Appalachian ~ Sun Belt NA 138995 128755 156916 154722 131716 -0.149
## 6 Arizona Pac-12 285713 354973 308355 338017 255791 318051 0.243
## 7 Arizona St. Pac-12 501509 343073 368985 286417 359660 291091 -0.191
## 8 Arkansas SEC 431174 399124 471279 487067 442569 367748 -0.169
## 9 Arkansas St. Sun Belt 149477 149163 138043 136200 119538 119001 -0.00449
## 10 Army West Po~ FBS Indepen~ 169781 171310 185946 163267 185543 190156 0.0249
## # ... with 140 more rows
\end{verbatim}
What do we see right away? Do those numbers look like we expect them to? No.~They're percentages expressed as decimals. So let's fix that by multiplying by 100.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{attendance }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{mutate}\NormalTok{(}
\AttributeTok{change =}\NormalTok{ ((}\StringTok{\textasciigrave{}}\AttributeTok{2018}\StringTok{\textasciigrave{}} \SpecialCharTok{{-}} \StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}\NormalTok{)}\SpecialCharTok{/}\StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}\NormalTok{)}\SpecialCharTok{*}\DecValTok{100}
\NormalTok{) }
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 150 x 9
## Institution Conference `2013` `2014` `2015` `2016` `2017` `2018` change
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Air Force MWC 228562 168967 156158 177519 174924 166205 -4.98
## 2 Akron MAC 107101 55019 108588 62021 117416 92575 -21.2
## 3 Alabama SEC 710538 710736 707786 712747 712053 710931 -0.158
## 4 Appalachian S~ FBS Indepen~ 149366 NA NA NA NA NA NA
## 5 Appalachian S~ Sun Belt NA 138995 128755 156916 154722 131716 -14.9
## 6 Arizona Pac-12 285713 354973 308355 338017 255791 318051 24.3
## 7 Arizona St. Pac-12 501509 343073 368985 286417 359660 291091 -19.1
## 8 Arkansas SEC 431174 399124 471279 487067 442569 367748 -16.9
## 9 Arkansas St. Sun Belt 149477 149163 138043 136200 119538 119001 -0.449
## 10 Army West Poi~ FBS Indepen~ 169781 171310 185946 163267 185543 190156 2.49
## # ... with 140 more rows
\end{verbatim}
Now, does this ordering do anything for us? No.~Let's fix that with arrange.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{attendance }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{mutate}\NormalTok{(}
\AttributeTok{change =}\NormalTok{ ((}\StringTok{\textasciigrave{}}\AttributeTok{2018}\StringTok{\textasciigrave{}} \SpecialCharTok{{-}} \StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}\NormalTok{)}\SpecialCharTok{/}\StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}\NormalTok{)}\SpecialCharTok{*}\DecValTok{100}
\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(change))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 150 x 9
## Institution Conference `2013` `2014` `2015` `2016` `2017` `2018` change
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Ga. Southern Sun Belt NA 105510 124681 104095 61031 100814 65.2
## 2 La.-Monroe Sun Belt 85177 90540 58659 67057 49640 71048 43.1
## 3 Louisiana Sun Belt 129878 154652 129577 121346 78754 111303 41.3
## 4 Hawaii MWC 185931 192159 164031 170299 145463 205455 41.2
## 5 Buffalo MAC 136418 122418 110743 104957 80102 110280 37.7
## 6 California Pac-12 345303 286051 292797 279769 219290 300061 36.8
## 7 UCF AAC 252505 226869 180388 214814 257924 352148 36.5
## 8 UTSA C-USA 175282 165458 138048 138226 114104 148257 29.9
## 9 Eastern Mich. MAC 20255 75127 29381 106064 73649 95632 29.8
## 10 Louisville ACC NA 317829 294413 324391 276957 351755 27.0
## # ... with 140 more rows
\end{verbatim}
So who had the most growth last year from the year before? Something going on at Georgia Southern.
\hypertarget{a-more-complex-example}{%
\section{A more complex example}\label{a-more-complex-example}}
There's a metric in basketball that's easy to understand -- shooting percentage. It's the number of shots made divided by the number of shots attempted. Simple, right? Except it's a little too simple. Because what about three-point shooters? They tend to be more valuable because the three-point shot is worth more. What about players who get to the line? In shooting percentage, free throws are nowhere to be found.
Basketball nerds, because of these weaknesses, have created a new metric called \href{https://en.wikipedia.org/wiki/True_shooting_percentage}{True Shooting Percentage}. True shooting percentage takes into account all aspects of a player's shooting to determine who the real shooters are.
Using \texttt{dplyr} and \texttt{mutate}, we can calculate true shooting percentage. So let's look at a new dataset, one of every college basketball player's season stats in the 2018-19 season. It's a dataset of 5,386 players, and we've got 59 variables -- one of them is True Shooting Percentage, but we're going to ignore that.
Import it like this:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{players }\OtherTok{\textless{}{-}} \FunctionTok{read\_csv}\NormalTok{(}\StringTok{"data/players19.csv"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## Warning: Missing column names filled in: 'X1' [1]
\end{verbatim}
\begin{verbatim}
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Team = col_character(),
## Conference = col_character(),
## Player = col_character(),
## Class = col_character(),
## Pos = col_character(),
## Height = col_character(),
## Hometown = col_character(),
## `High School` = col_character(),
## Summary = col_character()
## )
## i Use `spec()` for the full column specifications.
\end{verbatim}
The basic true shooting percentage formula is \texttt{(Points\ /\ (2*(FieldGoalAttempts\ +\ (.44\ *\ FreeThrowAttempts))))\ *\ 100}. Let's talk that through. Points divided by a lot. It's really field goal attempts plus 44 percent of the free throw attempts. Why? Because that's about what a free throw is worth, compared to other ways to score. After adding those things together, you double it. And after you divide points by that number, you multiply the whole lot by 100.
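Written as an equation, that same formula is:
\[
\text{True Shooting \%} = \frac{\text{PTS}}{2 \times \left(\text{FGA} + 0.44 \times \text{FTA}\right)} \times 100
\]
where PTS, FGA and FTA are the points, field goal attempts and free throw attempts columns in our data.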
In our data, we need to be able to find the fields so we can complete the formula. To do that, one way is to use the Environment tab in R Studio. In the Environment tab is a listing of all the data you've imported, and if you click the triangle next to it, it'll list all the field names, giving you a bit of information about each one.
\includegraphics[width=18.14in]{images/environment}
So what does True Shooting Percentage look like in code?
Let's think about this differently. Who had the best true shooting season last year?
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{players }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{mutate}\NormalTok{(}\AttributeTok{trueshooting =}\NormalTok{ (PTS}\SpecialCharTok{/}\NormalTok{(}\DecValTok{2}\SpecialCharTok{*}\NormalTok{(FGA }\SpecialCharTok{+}\NormalTok{ (.}\DecValTok{44}\SpecialCharTok{*}\NormalTok{FTA))))}\SpecialCharTok{*}\DecValTok{100}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(trueshooting))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 5,386 x 60
## X1 Team Conference Player `#` Class Pos Height Weight Hometown
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 579 Texa~ Big 12 Drayt~ 4 JR G 6-0 156 Austin,~
## 2 843 Ston~ AEC Nick ~ 42 FR F 6-7 240 Port Je~
## 3 1059 Sout~ Southland Patri~ 22 SO F 6-3 210 Folsom,~
## 4 4269 Dayt~ A-10 Camro~ 52 SO G 5-7 160 Country~
## 5 4681 Cali~ Pac-12 David~ 21 JR G 6-4 185 Newbury~
## 6 326 Virg~ ACC Grant~ 1 FR G <NA> NA Charlot~
## 7 410 Vand~ SEC Mac H~ 42 FR G 6-6 182 Chattan~
## 8 1390 Sain~ A-10 Jack ~ 31 JR G 6-6 205 Mattoon~
## 9 2230 NJIT~ A-Sun Patri~ 3 SO G 5-9 160 West Or~
## 10 266 Wash~ Pac-12 Reaga~ 34 FR F 6-6 225 Santa A~
## # ... with 5,376 more rows, and 50 more variables: `High School` <chr>,
## # Summary <chr>, Rk.x <dbl>, G <dbl>, GS <dbl>, MP <dbl>, FG <dbl>,
## # FGA <dbl>, `FG%` <dbl>, `2P` <dbl>, `2PA` <dbl>, `2P%` <dbl>, `3P` <dbl>,
## # `3PA` <dbl>, `3P%` <dbl>, FT <dbl>, FTA <dbl>, `FT%` <dbl>, ORB <dbl>,
## # DRB <dbl>, TRB <dbl>, AST <dbl>, STL <dbl>, BLK <dbl>, TOV <dbl>, PF <dbl>,
## # PTS <dbl>, Rk.y <dbl>, PER <dbl>, `TS%` <dbl>, `eFG%` <dbl>, `3PAr` <dbl>,