% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
%
\documentclass[
]{book}
\usepackage{lmodern}
\usepackage{amsmath}
\usepackage{ifxetex,ifluatex}
\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\usepackage{amssymb}
\else % if luatex or xetex
\usepackage{unicode-math}
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
\hypersetup{
pdftitle={Sports Data Analysis and Visualization},
pdfauthor={By Matt Waite},
hidelinks,
pdfcreator={LaTeX via pandoc}}
\urlstyle{same} % disable monospaced font for URLs
\usepackage{color}
\usepackage{fancyvrb}
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\usepackage{framed}
\definecolor{shadecolor}{RGB}{248,248,248}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\BuiltInTok}[1]{#1}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
\newcommand{\ExtensionTok}[1]{#1}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ImportTok}[1]{#1}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\NormalTok}[1]{#1}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\RegionMarkerTok}[1]{#1}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\usepackage{longtable,booktabs}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
\setlength{\emergencystretch}{3em} % prevent overfull lines
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{5}
\usepackage{booktabs}
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
\ifluatex
\usepackage{selnolig} % disable illegal ligatures
\fi
\usepackage[]{natbib}
\bibliographystyle{apalike}
\title{Sports Data Analysis and Visualization}
\usepackage{etoolbox}
\makeatletter
\providecommand{\subtitle}[1]{% add subtitle to \maketitle
\apptocmd{\@title}{\par {\large #1 \par}}{}{}
}
\makeatother
\subtitle{Code, data, visuals and the Tidyverse for journalists and other storytellers}
\author{By Matt Waite}
\date{July 29, 2019}
\begin{document}
\maketitle
{
\setcounter{tocdepth}{1}
\tableofcontents
}
\hypertarget{throwing-cold-water-on-hot-takes}{%
\chapter{Throwing cold water on hot takes}\label{throwing-cold-water-on-hot-takes}}
The 2018 season started out disastrously for the Nebraska Cornhuskers. The first game against a probably overmatched opponent? Called on account of an epic thunderstorm that plowed right over Memorial Stadium. The next game? Loss. The one following? Loss. The next four? All losses, after the fanbase was whipped into a hopeful frenzy by the hiring of Scott Frost, national title winning quarterback turned hot young coach come back home to save a mythical football program from the mediocrity it found itself mired in.
All that excitement lay in tatters.
On sports talk radio, on the sports pages and across social media and cafe conversations, one topic kept coming up again and again to explain why the team was struggling: Penalties. The team was just committing too many of them. In fact, six games and no wins into the season, they were dead last in the FBS in penalty yards.
Worse yet for this line of reasoning? Nebraska won game 7, against Minnesota, committing only six penalties for 43 yards, just about half their average over the season. Then they won game 8 against FCS patsy Bethune Cookman, committing only five penalties for 35 yards. That's a whopping 75 yards less than when they were losing. See? ``Cut the penalties, win games,'' screamed the radio show callers.
The problem? It's not true. Penalties might matter for a single drive. They may even throw a single game. But if you look at every top-level college football team since 2009, the number of penalty yards the team racks up means absolutely nothing to the total number of points they score. There's no relationship between them. Penalty yards have no discernible influence on points beyond just random noise.
Put this another way: If you were Scott Frost, and a major college football program was paying you \$5 million a year to make your team better, what should you focus on in practice? If you had growled at some press conference that you're going to work on penalties in practice until your team stops committing them, the results you'd get from all that wasted practice time would be impossible to separate from just random chance. You very well may reduce your penalty yards and still lose.
How do I know this? Simple statistics.
That's one of the three pillars of this book: Simple stats. The three pillars are:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Simple, easy to understand statistics \ldots{}
\item
\ldots{} extracted using simple code \ldots{}
\item
\ldots{} visualized simply to reveal new and interesting things in sports.
\end{enumerate}
Do you need to be a math whiz to read this book? No.~I'm not one either. What we're going to look at is pretty basic, but that's also why it's so powerful.
Do you need to be a computer science major to write code? Nope. I'm not one of those either. But anyone can think logically, and write simple code that is repeatable and replicable.
Do you need to be an artist to create compelling visuals? I think you see where this is going. No.~I can barely draw stick figures, but I've been paid to make graphics in my career. With a little graphic design know-how, you can create publication-worthy graphics with code.
\hypertarget{requirements-and-conventions}{%
\section{Requirements and Conventions}\label{requirements-and-conventions}}
This book is all in the R statistical language. To follow along, you'll do the following:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
Install the R language on your computer. Go to the \href{https://www.r-project.org/}{R Project website}, click download R and select a mirror closest to your location. Then download the version for your computer.
\item
Install \href{https://www.rstudio.com/products/rstudio/\#Desktop}{R Studio Desktop}. The free version is great.
\end{enumerate}
Going forward, you'll see passages like this:
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{install.packages}\NormalTok{(}\StringTok{"tidyverse"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
Don't do it now, but that is code that you'll need to run in your R Studio. When you see that, you'll know what to do.
\hypertarget{about-this-book}{%
\section{About this book}\label{about-this-book}}
This book is the collection of class materials for the author's Sports Data Analysis and Visualization class at the University of Nebraska-Lincoln's College of Journalism and Mass Communications. There are some things you should know about it:
\begin{itemize}
\tightlist
\item
It is free for students.
\item
The topics will remain the same but the text is going to be constantly tinkered with.
\item
Anything that is the work of the author is copyright Matt Waite, 2019.
\item
The text is \href{https://creativecommons.org/licenses/by-nc-sa/4.0/}{Attribution-NonCommercial-ShareAlike 4.0 International} Creative Commons licensed. That means you can share it and change it, but only if you share your changes with the same license and it cannot be used for commercial purposes. I'm not making money on this so you can't either.\\
\item
As such, the whole book -- authored in Bookdown -- is \href{https://github.com/mattwaite/sportsdatabook}{open sourced on Github}. Pull requests welcomed!
\end{itemize}
\hypertarget{the-very-basics}{%
\chapter{The very basics}\label{the-very-basics}}
R is a programming language, one specifically geared toward statistical analysis. Like all programming languages, it has certain built-in functions and you can interact with it in multiple ways. The first, and most basic, is the console.
\includegraphics[width=18.97in]{images/verybasics1}
Think of the console like talking directly to R. It's direct, but it has some drawbacks and some quirks we'll get into later. For now, try typing this into the console and hit enter:
\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{2}\SpecialCharTok{+}\DecValTok{2}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] 4
\end{verbatim}
Congrats, you've run some code. It's not very complex, and you knew the answer beforehand, but you get the idea. We can compute things. We can also store things. \textbf{In programming languages, these are called variables}. We can assign things to variables using \texttt{\textless{}-}. And then we can do things with them. \textbf{The \texttt{\textless{}-} is called an assignment operator}.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{number }\OtherTok{\textless{}{-}} \DecValTok{2}
\NormalTok{number }\SpecialCharTok{*}\NormalTok{ number}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] 4
\end{verbatim}
Now assign a different number to the variable \texttt{number}. Try running \texttt{number\ *\ number} again. Get what you expected?
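For example -- 3 here is just an arbitrary pick, any number works -- the console exchange might look like this:
\begin{verbatim}
number <- 3
number * number
## [1] 9
\end{verbatim}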
We can have as many variables as we can name. \textbf{We can even reuse them (but be careful that you know you're doing that, or you'll introduce errors)}. Try this in your console.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{firstnumber }\OtherTok{\textless{}{-}} \DecValTok{1}
\NormalTok{secondnumber }\OtherTok{\textless{}{-}}\DecValTok{2}
\NormalTok{(firstnumber }\SpecialCharTok{+}\NormalTok{ secondnumber) }\SpecialCharTok{*}\NormalTok{ secondnumber}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] 6
\end{verbatim}
\textbf{We can store anything in a variable}. A whole table. An array of numbers. A single word. A whole book. All the books of the 18th century. They're really powerful. We'll explore them at length.
\hypertarget{adding-libraries-part-1}{%
\section{Adding libraries, part 1}\label{adding-libraries-part-1}}
The real strength of any given programming language is the external libraries that power it. The base language can do a lot, but it's the external libraries that solve many specific problems -- even making the base language easier to use.
For this class, we're going to need several external libraries.
The first library we're going to use is called Swirl. So in the console, type \texttt{install.packages(\textquotesingle{}swirl\textquotesingle{})} and hit enter. That installs swirl.
Now, to use the library, type \texttt{library(swirl)} and hit enter. That loads swirl. Then type \texttt{swirl()} and hit enter. Now you're running swirl. Follow the directions on the screen. When you are asked, you want to install course 1 R Programming: The basics of programming in R. Then, when asked, you want to do option 1, R Programming, in that course.
When you are finished with the course -- it will take just a few minutes -- it will first ask you if you want credit on Coursera. You do not. Then type 0 to exit (it will not be very clear that's what you do when you are done).
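Put together, the console steps from this section look like this, run one line at a time:
\begin{verbatim}
install.packages('swirl')   # installs swirl
library(swirl)              # loads swirl
swirl()                     # starts swirl -- then follow the on-screen directions
\end{verbatim}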
\hypertarget{adding-libraries-part-2}{%
\section{Adding libraries, part 2}\label{adding-libraries-part-2}}
We'll mostly use two libraries for analysis -- \texttt{dplyr} and \texttt{ggplot2}. To get them, and several other useful libraries, we can install a single collection of libraries called the tidyverse. Type this into your console: \texttt{install.packages(\textquotesingle{}tidyverse\textquotesingle{})}
\textbf{NOTE}: This is a pattern. You should always install libraries in the console.
Then, to help us with learning and replication, we're going to use R Notebooks. So we need to install that library. Type this into your console: \texttt{install.packages(\textquotesingle{}rmarkdown\textquotesingle{})}
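So the two console commands for this section, side by side:
\begin{verbatim}
install.packages('tidyverse')   # dplyr, ggplot2, readr and friends
install.packages('rmarkdown')   # lets us work in R Notebooks
\end{verbatim}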
\hypertarget{notebooks}{%
\section{Notebooks}\label{notebooks}}
For the rest of the class, we're going to be working in notebooks. In notebooks, you will both run your code and explain each step, much as I am doing here.
To start a notebook, you click on the green plus in the top left corner and go down to R Notebook. Do that now.
\includegraphics[width=11.08in]{images/verybasics2}
You will see that the notebook adds a lot of text for you. It tells you how to work in notebooks -- and you should read it. The most important parts are these:
To add text, simply type. To add code, you can click on the \emph{Insert} button on the toolbar or press \emph{Cmd+Option+I} on Mac or \emph{Ctrl+Alt+I} on Windows.
Highlight all that text and delete it. You should have a blank document. This document is called an R Markdown file -- it's a special form of text, one that you can style and one that you can run R code in the middle of. Markdown is a simple markup format that you can use to create documents. So first things first, let's give our notebook a big headline. Add this:
\texttt{\#\ My\ awesome\ notebook}
Now, under that, without any markup, just type This is my awesome notebook.
Under that, you can make text bold by writing \texttt{It\ is\ **really**\ awesome}.
If you want it italics, just do this on the next line: \texttt{No,\ it\textquotesingle{}s\ \_really\_\ awesome.\ I\ swear.}
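Put together, the markdown in your notebook so far looks something like this:
\begin{verbatim}
# My awesome notebook

This is my awesome notebook.

It is **really** awesome.

No, it's _really_ awesome. I swear.
\end{verbatim}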
To see what it looks like without the markup, click the Preview or Knit button in the toolbar. That will turn your notebook into a webpage, with the formatting included.
Throughout this book, we're going to use this markdown to explain what we are doing and, more importantly, why we are doing it. Explaining your thinking is a vital part of understanding what you are doing.
That explanation, plus the code, is the real power of notebooks. To add a block of code, follow the instructions from above: click on the \emph{Insert} button on the toolbar or press \emph{Cmd+Option+I} on Mac or \emph{Ctrl+Alt+I} on Windows.
In that window, use some of the code from above and add two numbers together. To see it run, click the green triangle on the right. That runs the chunk. You should see the answer to your addition problem.
And that, just that, is the foundation you need to start this book.
\hypertarget{data-structures-and-types}{%
\chapter{Data, structures and types}\label{data-structures-and-types}}
Data are everywhere (and data is plural of datum, thus the use of are in that statement). It surrounds you. Every time you use your phone, you are creating data. Lots of it. Your online life. Any time you buy something. It's everywhere. Sports, like life, is no different. Sports is drowning in data, and more comes along all the time.
In sports, and in this class, we'll be dealing largely with two kinds of data: event level data and summary data. It's not hard to envision event level data in sports. A pitch in baseball. A hit. A play in football. A pass in soccer. They are the events that make up the game. Combine them together -- summarize them -- and you'll have some notion of how the game went. What we usually see is summary data -- who wants to scroll through 50 pitches to find out a player went 2-3 with a double and an RBI? Who wants to scroll through hundreds of pitches to figure out the Rays beat the Yankees?
To start with, we need to understand the shape of data.
\begin{quote}
EXERCISE: Try scoring a child's board game. For example, Chutes and Ladders. If you were placed in charge of analytics for the World Series of Chutes and Ladders, what is your event level data? What summary data do you keep? If you've got the game, try it.
\end{quote}
\hypertarget{rows-and-columns}{%
\section{Rows and columns}\label{rows-and-columns}}
Data, oversimplifying it a bit, is information organized. Generally speaking, it's organized into rows and columns. Rows, generally, are individual elements. A team. A player. A game. Columns, generally, are components of the data, sometimes called variables. So if each row is a player, the first column might be their name. The second is their position. The third is their batting average. And so on.
\includegraphics[width=22in]{images/data1}
One of the critical components of data analysis, especially for beginners, is having a mental picture of your data. What does each row mean? What does each column in each row signify? How many rows do you have? How many columns?
\hypertarget{types}{%
\section{Types}\label{types}}
There are scores of data types in the world, and R has them. In this class, we're primarily going to be dealing with data frames, and each element of our data frames will have a data type.
Typically, they'll be one of four types of data:
\begin{itemize}
\tightlist
\item
Numeric: a number, like the number of touchdown passes in a season or a batting average.
\item
Character: Text, like a name, a team, a conference.
\item
Date: Fully formed dates -- 2019-01-01 -- have a special date type. Elements of a date, like a year (ex. 2019) are not technically dates, so they'll appear as numeric data types.
\item
Logical: Rare, but every now and then we'll have a data type that's Yes or No, True or False, etc.
\end{itemize}
\textbf{Question:} Is a zip code a number? Is a jersey number a number? Trick question, because the answer is no. Numbers are things we do math on. If the thing you want is not something you're going to do math on -- can you add two phone numbers together? -- then make it a character type. If you don't, most every software system on the planet will drop leading zeros. For example, every zip code in Boston starts with 0. If you record that as a number, your zip code will become a four digit number, which isn't a zip code anymore.
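Here's a quick console illustration, using one Boston zip code, of what goes wrong if you let R treat it as a number:
\begin{verbatim}
as.numeric("02134")
## [1] 2134
zip <- "02134"   # stored as a character type, the leading zero survives
\end{verbatim}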
\hypertarget{a-simple-way-to-get-data}{%
\section{A simple way to get data}\label{a-simple-way-to-get-data}}
One good thing about sports is that there's lots of interest in it. And that means there are outlets that put sports data on the internet. Now I'm going to show you a trick for getting it easily.
The site sports-reference.com takes NCAA (and other league) stats and puts them online. For instance, \href{https://www.sports-reference.com/cbb/schools/nebraska/2019-gamelogs.html}{here's their page on Nebraska basketball's game logs}, which you should open now.
Now, in a new tab, log into Google Docs/Drive and open a new spreadsheet. In the first cell of the first row, copy and paste this formula in:
\begin{verbatim}
=IMPORTHTML("https://www.sports-reference.com/cbb/schools/nebraska/2019-gamelogs.html", "table", 1)
\end{verbatim}
If it worked right, you've got the data from that page in a spreadsheet.
\hypertarget{cleaning-the-data}{%
\section{Cleaning the data}\label{cleaning-the-data}}
The first thing we need to do is recognize that we don't have data, really. We have the results of a formula. You can tell by putting your cursor on that field, where you'll see the formula again. This is where you'd look:
\includegraphics[width=33.28in]{images/clean1}
The solution is easy:
Edit \textgreater{} Select All or type command/control A
Edit \textgreater{} Copy or type command/control c
Edit \textgreater{} Paste Special \textgreater{} Values Only or type command/control shift v
You can verify that it worked by looking in that same row 1 column A, where you'll see the formula is gone.
\includegraphics[width=36.81in]{images/clean2}
Now you have data, but your headers are all wrong. You want your headers to be one line -- not two, like the site has. And the header names repeat -- first for our team, then for theirs. So you have to rename each repeated header so the two are distinct -- for example, UsORB (or TeamORB) and OpponentORB instead of two columns both named ORB.
After you've done that, note that we have repeating header rows scattered through the data. There are two ways to deal with that -- you could just highlight them and go up to Edit \textgreater{} Delete Rows XX-XX, depending on what rows you highlighted. That's the easy way with our data.
But what if you had hundreds of repeating headers like that? Deleting them would take a long time.
You can use sorting to get rid of anything that's not data. So click on Data \textgreater{} Sort Range. You'll want to check the ``Data has header row'' field. Then hit Sort.
\includegraphics[width=21.61in]{images/clean3}
Now all you need to do is search through the data for where your junk data -- extra headers, blanks, etc. -- got sorted and delete it. After you've done that, you can export it for use in R. Go to File \textgreater{} Download as \textgreater{} Comma Separated Values. Remember to put it in the same directory as your R Notebook file so you can import the data easily.
\hypertarget{aggregates}{%
\chapter{Aggregates}\label{aggregates}}
R is a statistical programming language that is purpose built for data analysis.
Base R does a lot, but there are a mountain of external libraries that do things to make R better/easier/more fully featured. We already installed the tidyverse -- or you should have if you followed the instructions for the last assignment -- which isn't exactly a library, but a collection of libraries. Together, they make up the tidyverse. Individually, they are extraordinarily useful for what they do. We can load them all at once using the tidyverse name, or we can load them individually. Let's start with individually.
The two libraries we are going to need for this assignment are \texttt{readr} and \texttt{dplyr}. The library \texttt{readr} reads different types of data in as a dataframe. For this assignment, we're going to read in csv data or Comma Separated Values data. That's data that has a comma between each column of data.
Then we're going to use \texttt{dplyr} to analyze it.
To use a library, you need to import it. Good practice -- one I'm going to insist on -- is that you put all your library steps at the top of your notebooks.
That code looks like this:
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(readr)}
\end{Highlighting}
\end{Shaded}
To load them both, you need to run that code twice:
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(readr)}
\FunctionTok{library}\NormalTok{(dplyr)}
\end{Highlighting}
\end{Shaded}
You can keep doing that for as many libraries as you need. I've seen notebooks with 10 or more library imports.
But the tidyverse has a neat little trick. We can load most of the libraries we'll need for the whole semester with one line:
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(tidyverse)}
\end{Highlighting}
\end{Shaded}
\textbf{From now on, if that's not the first line of your notebook, you're probably doing it wrong.}
\hypertarget{basic-data-analysis-group-by-and-count}{%
\section{Basic data analysis: Group By and Count}\label{basic-data-analysis-group-by-and-count}}
The first thing we need to do is get some data to work with. We do that by reading it in. In our case, we're going to read data from a csv file -- a comma-separated values file.
The CSV file we're going to read from is a \href{https://www.basketball-reference.com/leagues/NBA_2020_advanced.html}{Basketball Reference} page of advanced metrics for NBA players this season. The Sports Reference sites are a godsend of data, a trove of stuff, and we're going to use it a lot in this class.
So step 2, after setting up our libraries, is most often going to be importing data. In order to analyze data, we need data, so it stands to reason that this would be something we'd do very early.
The code looks \emph{something} like this, but hold off copying it just yet:
\texttt{nbaplayers\ \textless{}-\ read\_csv("\textasciitilde{}/Box/SportsData/nbaadvancedplayers1920.csv")}
Let's unpack that.
The first part -- nbaplayers -- is the name of your variable. A variable is just a name of a thing that stores stuff. In this case, our variable is a data frame, which is R's way of storing data (technically it's a tibble, which is the tidyverse way of storing data, but the differences aren't important and people use them interchangeably). \textbf{We can call this whatever we want.} I always want to name data frames after what is in them. In this case, we're going to import a dataset of NBA players. Variable names, by convention, are one word, all lower case. You can end a variable with a number, but you can't start one with a number.
The \textless- bit is the variable assignment operator. It's how we know we're assigning something to a word. Think of the arrow as saying ``Take everything on the right of this arrow and stuff it into the thing on the left.'' So we're creating an empty vessel called \texttt{nbaplayers} and stuffing all this data into it.
The \texttt{read\_csv} bits are pretty obvious, except for one thing. What happens in the quote marks is the path to the data. In there, I have to tell R where it will find the data. The easiest thing to do, if you are confused about how to find your data, is to put your data in the same folder as your notebook (you'll have to save that notebook first). If you do that, then you just need to put the name of the file in there (nbaadvancedplayers1920.csv). In my case, I've got a folder called Box in my home directory (that's the \texttt{\textasciitilde{}} part), and in there is a folder called SportsData that has the file called nbaadvancedplayers1920.csv in it. Some people -- insane people -- leave the data in their downloads folder. The data path then would be \texttt{\textasciitilde{}/Downloads/nameofthedatafilehere.csv} on PC or Mac.
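For instance, depending on where you saved the file, that line might look like either of these -- the paths here are just illustrations, not the ones you should copy:
\begin{verbatim}
nbaplayers <- read_csv("nbaadvancedplayers1920.csv")             # file sits next to the notebook
nbaplayers <- read_csv("~/Downloads/nbaadvancedplayers1920.csv") # file left in the Downloads folder
\end{verbatim}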
\textbf{What you put in there will be different from mine}. So your first task is to import the data.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\OtherTok{\textless{}{-}} \FunctionTok{read\_csv}\NormalTok{(}\StringTok{"data/nbaadvancedplayers1920.csv"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Player = col_character(),
## Pos = col_character(),
## Tm = col_character()
## )
## i Use `spec()` for the full column specifications.
\end{verbatim}
Now we can inspect the data we imported. What does it look like? To do that, we use \texttt{head(nbaplayers)} to show the headers and \textbf{the first six rows of data}. If we wanted to see them all, we could simply enter \texttt{nbaplayers} and run it.
To get the number of records in our dataset, we run \texttt{nrow(nbaplayers)}
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{head}\NormalTok{(nbaplayers)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 6 x 27
## Rk Player Pos Age Tm G MP PER `TS%` `3PAr` FTr `ORB%`
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Steve~ C 26 OKC 63 1680 20.5 0.604 0.006 0.421 14
## 2 2 Bam A~ PF 22 MIA 72 2417 20.3 0.598 0.018 0.484 8.5
## 3 3 LaMar~ C 34 SAS 53 1754 19.7 0.571 0.198 0.241 6.3
## 4 4 Kyle ~ PF 23 MIA 2 13 4.7 0.5 0 0 17.9
## 5 5 Nicke~ SG 21 NOP 47 591 8.9 0.473 0.5 0.139 1.6
## 6 6 Grays~ SG 24 MEM 38 718 12 0.609 0.562 0.179 1.2
## # ... with 15 more variables: `DRB%` <dbl>, `TRB%` <dbl>, `AST%` <dbl>,
## # `STL%` <dbl>, `BLK%` <dbl>, `TOV%` <dbl>, `USG%` <dbl>, OWS <dbl>,
## # DWS <dbl>, WS <dbl>, `WS/48` <dbl>, OBPM <dbl>, DBPM <dbl>, BPM <dbl>,
## # VORP <dbl>
\end{verbatim}
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{nrow}\NormalTok{(nbaplayers)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] 651
\end{verbatim}
Another way to look at nrow -- we have 651 players from this season in our dataset.
What if we wanted to know how many players there were by position? To do that by hand, we'd have to take each of the 651 records and sort them into a pile. We'd put them in groups and then count them.
\texttt{dplyr} has a \textbf{group by} function in it that does just this. A massive amount of data analysis involves grouping like things together at some point. So it's a good place to start.
So to do this, we'll take our dataset and we'll introduce a new operator: \%\textgreater\%. The best way to read that operator, in my opinion, is to interpret that as ``and then do this.''
After we group them together, we need to count them. We do that first by saying we want to summarize our data (a count is a part of a summary). To get a summary, we have to tell it what we want. So in this case, we want a count. To get that, let's create a thing called \texttt{total} and set it equal to \texttt{n()}, which is \texttt{dplyr}'s way of counting something.
Here's the code:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{group\_by}\NormalTok{(Pos) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{total =} \FunctionTok{n}\NormalTok{()}
\NormalTok{ )}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}
\begin{verbatim}
## # A tibble: 9 x 2
## Pos total
## <chr> <int>
## 1 C 111
## 2 C-PF 2
## 3 PF 135
## 4 PF-C 2
## 5 PG 111
## 6 SF 113
## 7 SF-PF 4
## 8 SF-SG 3
## 9 SG 170
\end{verbatim}
So let's walk through that. We start with our dataset -- \texttt{nbaplayers} -- and then we tell it to group the data by a given field. To find the field names, you can look at the output of \texttt{head}, or you can look in the Environment tab, where you'll see \texttt{nbaplayers}.
In this case, we wanted to group together positions, signified by the field name Pos. After we group the data, we need to count them up. In dplyr, we use \texttt{summarize} \href{http://dplyr.tidyverse.org/reference/summarise.html}{which can do more than just count things}. Inside the parentheses in summarize, we set up the summaries we want. In this case, we just want a count of the positions: \texttt{total\ =\ n(),} says create a new field called \texttt{total} and set it equal to \texttt{n()}, which might look weird, but it's common in stats. The number of things in a dataset? Statisticians call it n.~There are n players in this dataset. So \texttt{n()} is a function that counts the number of things there are.
And when we run that, we get a list of positions with a count next to them. But it's not in any order. So we'll add another And Then Do This \%\textgreater\% and use \texttt{arrange}. Arrange does what you think it does -- it arranges data in order. By default, it's in ascending order -- smallest to largest. But if we want to know the position with the most players, we need to sort it in descending order. That looks like this:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{group\_by}\NormalTok{(Pos) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{total =} \FunctionTok{n}\NormalTok{()}
\NormalTok{ ) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(total))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}
\begin{verbatim}
## # A tibble: 9 x 2
## Pos total
## <chr> <int>
## 1 SG 170
## 2 PF 135
## 3 SF 113
## 4 C 111
## 5 PG 111
## 6 SF-PF 4
## 7 SF-SG 3
## 8 C-PF 2
## 9 PF-C 2
\end{verbatim}
So the most common position in the NBA? Shooting guard, followed by power forward.
We can, if we want, group by more than one thing. Which team has the most of a single position? To do that, we can group by the team -- called Tm in the data -- and position, or Pos in the data:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{group\_by}\NormalTok{(Tm, Pos) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{total =} \FunctionTok{n}\NormalTok{()}
\NormalTok{ ) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(total))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## `summarise()` regrouping output by 'Tm' (override with `.groups` argument)
\end{verbatim}
\begin{verbatim}
## # A tibble: 159 x 3
## # Groups: Tm [31]
## Tm Pos total
## <chr> <chr> <int>
## 1 TOT PF 13
## 2 TOT SG 13
## 3 SAC PF 9
## 4 TOT SF 9
## 5 BRK SG 8
## 6 LAL SG 8
## 7 TOT PG 8
## 8 ATL SG 7
## 9 BRK SF 7
## 10 DAL SG 7
## # ... with 149 more rows
\end{verbatim}
So wait, what team is TOT?
Valuable lesson: whoever collects the data has opinions on how to solve problems. In this case, Basketball Reference, when a player gets traded, records stats for the player's first team, their second team, and a combined season total for a team called TOT, meaning Total. Is there a team abbreviated TOT? No.~So ignore those rows here.
Sacramento has 9 power forwards. Brooklyn has 8 shooting guards, as do the Lakers. You can learn a bit about how a team is assembled by looking at these simple counts.
\hypertarget{other-aggregates-mean-and-median}{%
\section{Other aggregates: Mean and median}\label{other-aggregates-mean-and-median}}
In the last example, we grouped some data together and counted it up, but there's so much more you can do. You can do multiple measures in a single step as well.
Sticking with our NBA player data, we can calculate any number of measures inside summarize. Here, we'll use R's built-in mean and median functions to calculate \ldots{} well, you get the idea.
Let's look just at the number of minutes each position gets.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{group\_by}\NormalTok{(Pos) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{count =} \FunctionTok{n}\NormalTok{(),}
\AttributeTok{mean\_minutes =} \FunctionTok{mean}\NormalTok{(MP),}
\AttributeTok{median\_minutes =} \FunctionTok{median}\NormalTok{(MP)}
\NormalTok{ )}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}
\begin{verbatim}
## # A tibble: 9 x 4
## Pos count mean_minutes median_minutes
## <chr> <int> <dbl> <dbl>
## 1 C 111 891. 887
## 2 C-PF 2 316. 316.
## 3 PF 135 790. 567
## 4 PF-C 2 1548. 1548.
## 5 PG 111 944. 850
## 6 SF 113 877. 754
## 7 SF-PF 4 638. 286.
## 8 SF-SG 3 1211 1688
## 9 SG 170 843. 654.
\end{verbatim}
So there's 651 players in the data. Let's look at shooting guards. The average shooting guard plays 842 minutes and the median is 653.5 minutes.
Why?
Let's let sort help us.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(MP))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 651 x 27
## Rk Player Pos Age Tm G MP PER `TS%` `3PAr` FTr `ORB%`
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 323 CJ Mc~ SG 28 POR 70 2556 17 0.541 0.378 0.136 1.9
## 2 55 Devin~ SG 23 PHO 70 2512 20.6 0.618 0.31 0.397 1.3
## 3 198 James~ SG 30 HOU 68 2483 29.1 0.626 0.557 0.528 2.9
## 4 27 Harri~ PF 27 SAC 72 2482 13.3 0.574 0.338 0.337 3.4
## 5 297 Damia~ PG 29 POR 66 2474 26.9 0.627 0.5 0.384 1.4
## 6 204 Tobia~ PF 27 PHI 72 2469 17.2 0.556 0.304 0.184 3.1
## 7 479 P.J. ~ PF 34 HOU 72 2467 8.3 0.559 0.702 0.113 4.7
## 8 175 Shai ~ SG 21 OKC 70 2428 17.7 0.568 0.247 0.352 2.2
## 9 2 Bam A~ PF 22 MIA 72 2417 20.3 0.598 0.018 0.484 8.5
## 10 343 Donov~ SG 23 UTA 69 2364 18.8 0.558 0.352 0.24 2.6
## # ... with 641 more rows, and 15 more variables: `DRB%` <dbl>, `TRB%` <dbl>,
## # `AST%` <dbl>, `STL%` <dbl>, `BLK%` <dbl>, `TOV%` <dbl>, `USG%` <dbl>,
## # OWS <dbl>, DWS <dbl>, WS <dbl>, `WS/48` <dbl>, OBPM <dbl>, DBPM <dbl>,
## # BPM <dbl>, VORP <dbl>
\end{verbatim}
The player with the most minutes on the floor is a shooting guard. Shooting guard is the most common position, so that means there's CJ McCollum rolling up 2,556 minutes in a season, and then there's Cleveland Cavaliers' sensation J.P. Macura. Never heard of J.P. Macura? Might be because he logged one minute in one game this season.
That's a huge difference.
So when choosing a measure of the middle, you have to ask yourself -- could I have extremes? Because a median won't be sensitive to extremes. It will be the point at which half the numbers are above and half are below. The average or mean will be a measure of the middle, but if you have a bunch of pine riders and then one ironman superstar, the average will be wildly skewed.
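A tiny, made-up set of minutes shows the difference:
\begin{verbatim}
minutes <- c(1, 12, 25, 30, 2556)   # four bench players and one ironman

mean(minutes)
## [1] 524.8

median(minutes)
## [1] 25
\end{verbatim}
One extreme season drags the mean past 500, while the median stays down with the bench players.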
\hypertarget{even-more-aggregates}{%
\section{Even more aggregates}\label{even-more-aggregates}}
There's a ton of things we can do in summarize -- we'll work with more of them as the course progresses -- but here's a few other questions you can ask.
Which position in the NBA plays the most minutes? And what is the highest and lowest minute total for that position? And how wide is the spread between minutes? We can find that with \texttt{sum} to add up the minutes to get the total minutes, \texttt{min} to find the minimum minutes, \texttt{max} to find the maximum minutes and \texttt{sd} to find the standard deviation in the numbers.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nbaplayers }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{group\_by}\NormalTok{(Pos) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{total =} \FunctionTok{sum}\NormalTok{(MP), }
\AttributeTok{avgminutes =} \FunctionTok{mean}\NormalTok{(MP), }
\AttributeTok{minminutes =} \FunctionTok{min}\NormalTok{(MP),}
\AttributeTok{maxminutes =} \FunctionTok{max}\NormalTok{(MP),}
\AttributeTok{stdev =} \FunctionTok{sd}\NormalTok{(MP)) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(total))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}
\begin{verbatim}
## # A tibble: 9 x 6
## Pos total avgminutes minminutes maxminutes stdev
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SG 143229 843. 1 2556 735.
## 2 PF 106654 790. 5 2482 719.
## 3 PG 104745 944. 8 2474 727.
## 4 SF 99109 877. 11 2316 709.
## 5 C 98914 891. 3 2336 619.
## 6 SF-SG 3633 1211 87 1858 977.
## 7 PF-C 3097 1548. 960 2137 832.
## 8 SF-PF 2553 638. 46 1936 873.
## 9 C-PF 633 316. 256 377 85.6
\end{verbatim}
So again, no surprise, shooting guards spend the most minutes on the floor in the NBA. They average 842 minutes, but we noted why that's trouble. The minimum is the J.P. Macura Award, the max is the Trail Blazers failing at load management, and the standard deviation is a measure of how spread out the data is. In this case, not the highest spread among positions, but pretty high. So you know you've got some huge-minutes players and a bunch of bench players.
\hypertarget{mutating-data}{%
\chapter{Mutating data}\label{mutating-data}}
One of the most common data analysis techniques is to look at change over time. The most common way of comparing change over time is through percent change. The math behind calculating percent change is very simple, and you should know it off the top of your head. The easy way to remember it is:
\texttt{(new\ -\ old)\ /\ old}
Or new minus old divided by old. Your new number minus the old number, the result of which is divided by the old number. To do that in R, we can use \texttt{dplyr} and \texttt{mutate} to calculate new metrics in a new field using existing fields of data.
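As a quick worked example with made-up numbers: say a school drew 500,000 fans last season and 550,000 this season.
\begin{verbatim}
(550000 - 500000) / 500000   # (new - old) / old
## [1] 0.1
\end{verbatim}
That's 0.1 as a decimal -- 10 percent growth once you multiply by 100, a step we'll get to with real data below.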
So first we'll import the tidyverse so we can read in our data and begin to work with it.
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(tidyverse)}
\end{Highlighting}
\end{Shaded}
Now you'll need a common and simple dataset of total attendance at NCAA football games over the last few seasons.
You'll import it something like this.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{attendance }\OtherTok{\textless{}{-}} \FunctionTok{read\_csv}\NormalTok{(}\StringTok{\textquotesingle{}data/attendance.csv\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
##
## -- Column specification --------------------------------------------------------
## cols(
## Institution = col_character(),
## Conference = col_character(),
## `2013` = col_double(),
## `2014` = col_double(),
## `2015` = col_double(),
## `2016` = col_double(),
## `2017` = col_double(),
## `2018` = col_double()
## )
\end{verbatim}
If you want to see the first six rows -- handy to take a peek at your data -- you can use the function \texttt{head}.
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{head}\NormalTok{(attendance)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 6 x 8
## Institution Conference `2013` `2014` `2015` `2016` `2017` `2018`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Air Force MWC 228562 168967 156158 177519 174924 166205
## 2 Akron MAC 107101 55019 108588 62021 117416 92575
## 3 Alabama SEC 710538 710736 707786 712747 712053 710931
## 4 Appalachian St. FBS Independent 149366 NA NA NA NA NA
## 5 Appalachian St. Sun Belt NA 138995 128755 156916 154722 131716
## 6 Arizona Pac-12 285713 354973 308355 338017 255791 318051
\end{verbatim}
The code to calculate percent change is pretty simple. Remember, with \texttt{summarize}, we used \texttt{n()} to count things. With \texttt{mutate}, we use very similar syntax to calculate a new value using other values in our dataset. So in this case, we're trying to do (new-old)/old, but we're doing it with fields. If we look at what we got when we did \texttt{head}, you'll see there's \texttt{2018} as the new data, and we'll use \texttt{2017} as the old data. So we're looking at one year. Then, to help us, we'll use \texttt{arrange} again to sort it, so we get the fastest-growing school over one year.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{attendance }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{mutate}\NormalTok{(}
\AttributeTok{change =}\NormalTok{ (}\StringTok{\textasciigrave{}}\AttributeTok{2018}\StringTok{\textasciigrave{}} \SpecialCharTok{{-}} \StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}\NormalTok{)}\SpecialCharTok{/}\StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}
\NormalTok{) }
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 150 x 9
## Institution Conference `2013` `2014` `2015` `2016` `2017` `2018` change
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Air Force MWC 228562 168967 156158 177519 174924 166205 -0.0498
## 2 Akron MAC 107101 55019 108588 62021 117416 92575 -0.212
## 3 Alabama SEC 710538 710736 707786 712747 712053 710931 -0.00158
## 4 Appalachian ~ FBS Indepen~ 149366 NA NA NA NA NA NA
## 5 Appalachian ~ Sun Belt NA 138995 128755 156916 154722 131716 -0.149
## 6 Arizona Pac-12 285713 354973 308355 338017 255791 318051 0.243
## 7 Arizona St. Pac-12 501509 343073 368985 286417 359660 291091 -0.191
## 8 Arkansas SEC 431174 399124 471279 487067 442569 367748 -0.169
## 9 Arkansas St. Sun Belt 149477 149163 138043 136200 119538 119001 -0.00449
## 10 Army West Po~ FBS Indepen~ 169781 171310 185946 163267 185543 190156 0.0249
## # ... with 140 more rows
\end{verbatim}
What do we see right away? Do those numbers look like we expect them to? No.~They're percentages expressed as decimals. So let's fix that by multiplying by 100.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{attendance }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{mutate}\NormalTok{(}
\AttributeTok{change =}\NormalTok{ ((}\StringTok{\textasciigrave{}}\AttributeTok{2018}\StringTok{\textasciigrave{}} \SpecialCharTok{{-}} \StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}\NormalTok{)}\SpecialCharTok{/}\StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}\NormalTok{)}\SpecialCharTok{*}\DecValTok{100}
\NormalTok{) }
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 150 x 9
## Institution Conference `2013` `2014` `2015` `2016` `2017` `2018` change
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Air Force MWC 228562 168967 156158 177519 174924 166205 -4.98
## 2 Akron MAC 107101 55019 108588 62021 117416 92575 -21.2
## 3 Alabama SEC 710538 710736 707786 712747 712053 710931 -0.158
## 4 Appalachian S~ FBS Indepen~ 149366 NA NA NA NA NA NA
## 5 Appalachian S~ Sun Belt NA 138995 128755 156916 154722 131716 -14.9
## 6 Arizona Pac-12 285713 354973 308355 338017 255791 318051 24.3
## 7 Arizona St. Pac-12 501509 343073 368985 286417 359660 291091 -19.1
## 8 Arkansas SEC 431174 399124 471279 487067 442569 367748 -16.9
## 9 Arkansas St. Sun Belt 149477 149163 138043 136200 119538 119001 -0.449
## 10 Army West Poi~ FBS Indepen~ 169781 171310 185946 163267 185543 190156 2.49
## # ... with 140 more rows
\end{verbatim}
Now, does this ordering do anything for us? No.~Let's fix that with arrange.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{attendance }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{mutate}\NormalTok{(}
\AttributeTok{change =}\NormalTok{ ((}\StringTok{\textasciigrave{}}\AttributeTok{2018}\StringTok{\textasciigrave{}} \SpecialCharTok{{-}} \StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}\NormalTok{)}\SpecialCharTok{/}\StringTok{\textasciigrave{}}\AttributeTok{2017}\StringTok{\textasciigrave{}}\NormalTok{)}\SpecialCharTok{*}\DecValTok{100}
\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(change))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 150 x 9
## Institution Conference `2013` `2014` `2015` `2016` `2017` `2018` change
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Ga. Southern Sun Belt NA 105510 124681 104095 61031 100814 65.2
## 2 La.-Monroe Sun Belt 85177 90540 58659 67057 49640 71048 43.1
## 3 Louisiana Sun Belt 129878 154652 129577 121346 78754 111303 41.3
## 4 Hawaii MWC 185931 192159 164031 170299 145463 205455 41.2
## 5 Buffalo MAC 136418 122418 110743 104957 80102 110280 37.7
## 6 California Pac-12 345303 286051 292797 279769 219290 300061 36.8
## 7 UCF AAC 252505 226869 180388 214814 257924 352148 36.5
## 8 UTSA C-USA 175282 165458 138048 138226 114104 148257 29.9
## 9 Eastern Mich. MAC 20255 75127 29381 106064 73649 95632 29.8
## 10 Louisville ACC NA 317829 294413 324391 276957 351755 27.0
## # ... with 140 more rows
\end{verbatim}
So who had the most growth last year from the year before? Something going on at Georgia Southern.
\hypertarget{a-more-complex-example}{%
\section{A more complex example}\label{a-more-complex-example}}
There's a metric in basketball that's easy to understand -- shooting percentage. It's the number of shots made divided by the number of shots attempted. Simple, right? Except it's a little too simple. Because what about three-point shooters? They tend to be more valuable because the three-point shot is worth more. What about players who get to the line? In shooting percentage, free throws are nowhere to be found.
Basketball nerds, because of these weaknesses, have created a new metric called \href{https://en.wikipedia.org/wiki/True_shooting_percentage}{True Shooting Percentage}. True shooting percentage takes into account all aspects of a player's shooting to determine who the real shooters are.
Using \texttt{dplyr} and \texttt{mutate}, we can calculate true shooting percentage. So let's look at a new dataset, one of every college basketball player's season stats in the 2018-19 season. It's a dataset of 5,386 players, and we've got 59 variables -- one of them is True Shooting Percentage, but we're going to ignore that.
Import it like this:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{players }\OtherTok{\textless{}{-}} \FunctionTok{read\_csv}\NormalTok{(}\StringTok{"data/players19.csv"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## Warning: Missing column names filled in: 'X1' [1]
\end{verbatim}
\begin{verbatim}
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Team = col_character(),
## Conference = col_character(),
## Player = col_character(),
## Class = col_character(),
## Pos = col_character(),
## Height = col_character(),
## Hometown = col_character(),
## `High School` = col_character(),
## Summary = col_character()
## )
## i Use `spec()` for the full column specifications.
\end{verbatim}
The basic true shooting percentage formula is \texttt{(Points\ /\ (2*(FieldGoalAttempts\ +\ (.44\ *\ FreeThrowAttempts))))\ *\ 100}. Let's talk that through. Points divided by a lot. It's really field goal attempts plus 44 percent of the free throw attempts. Why? Because that's about what a free throw is worth, compared to other ways to score. After adding those things together, you double it. And after you divide points by that number, you multiply the whole lot by 100.
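Written as an equation, that same formula is:
\[
\text{True Shooting \%} = \frac{\text{PTS}}{2 \times \left(\text{FGA} + 0.44 \times \text{FTA}\right)} \times 100
\]
where PTS, FGA and FTA are the points, field goal attempts and free throw attempts columns in our data.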
In our data, we need to be able to find the fields so we can complete the formula. To do that, one way is to use the Environment tab in R Studio. In the Environment tab is a listing of all the data you've imported, and if you click the triangle next to it, it'll list all the field names, giving you a bit of information about each one.
\includegraphics[width=18.14in]{images/environment}
So what does True Shooting Percentage look like in code?
Let's think about this differently. Who had the best true shooting season last year?
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{players }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{mutate}\NormalTok{(}\AttributeTok{trueshooting =}\NormalTok{ (PTS}\SpecialCharTok{/}\NormalTok{(}\DecValTok{2}\SpecialCharTok{*}\NormalTok{(FGA }\SpecialCharTok{+}\NormalTok{ (.}\DecValTok{44}\SpecialCharTok{*}\NormalTok{FTA))))}\SpecialCharTok{*}\DecValTok{100}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{arrange}\NormalTok{(}\FunctionTok{desc}\NormalTok{(trueshooting))}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 5,386 x 60
## X1 Team Conference Player `#` Class Pos Height Weight Hometown
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 579 Texa~ Big 12 Drayt~ 4 JR G 6-0 156 Austin,~
## 2 843 Ston~ AEC Nick ~ 42 FR F 6-7 240 Port Je~
## 3 1059 Sout~ Southland Patri~ 22 SO F 6-3 210 Folsom,~
## 4 4269 Dayt~ A-10 Camro~ 52 SO G 5-7 160 Country~
## 5 4681 Cali~ Pac-12 David~ 21 JR G 6-4 185 Newbury~
## 6 326 Virg~ ACC Grant~ 1 FR G <NA> NA Charlot~
## 7 410 Vand~ SEC Mac H~ 42 FR G 6-6 182 Chattan~
## 8 1390 Sain~ A-10 Jack ~ 31 JR G 6-6 205 Mattoon~
## 9 2230 NJIT~ A-Sun Patri~ 3 SO G 5-9 160 West Or~
## 10 266 Wash~ Pac-12 Reaga~ 34 FR F 6-6 225 Santa A~
## # ... with 5,376 more rows, and 50 more variables: `High School` <chr>,
## # Summary <chr>, Rk.x <dbl>, G <dbl>, GS <dbl>, MP <dbl>, FG <dbl>,
## # FGA <dbl>, `FG%` <dbl>, `2P` <dbl>, `2PA` <dbl>, `2P%` <dbl>, `3P` <dbl>,
## # `3PA` <dbl>, `3P%` <dbl>, FT <dbl>, FTA <dbl>, `FT%` <dbl>, ORB <dbl>,
## # DRB <dbl>, TRB <dbl>, AST <dbl>, STL <dbl>, BLK <dbl>, TOV <dbl>, PF <dbl>,
## # PTS <dbl>, Rk.y <dbl>, PER <dbl>, `TS%` <dbl>, `eFG%` <dbl>, `3PAr` <dbl>,