-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOverview_dplyr.Rmd
231 lines (166 loc) · 4.83 KB
/
Overview_dplyr.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
---
title: "Overview_dplyr"
author: "Ellie Honan"
date: "2023-03-01"
output: html_document
---
```{r setup, include=FALSE}
# Set echo = TRUE as the default knitr chunk option for the whole document,
# so each chunk's source code is shown in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
```
## Load Libraries
```{r}
library(dplyr)
library(tidyr)
library(readr)
```
## Load Data
```{r}
# Read the CSV at this URL into catch_original. The connection is opened
# through url() with the libcurl method named explicitly.
catch_original <- read.csv(
  file = url(
    description = "https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.302.1",
    method = "libcurl"
  )
)
```
The *select* command from the dplyr package selects **columns**.

The *filter* command from the dplyr package filters **rows**.
## Clean Data
```{r}
# Keep only the region, year, and per-species columns. This is how the All
# and notesRegCode columns are removed from the catch_original data frame.
catch_data <- catch_original %>%
  select(
    Region, Year,
    Chinook, Sockeye, Coho, Pink, Chum
  )

# Alternative: name the columns to drop instead of the ones to keep.
# catch_data <- catch_original %>%
#   select(-All, -notesRegCode)

# Inspect the result: first rows, then every column with its type.
head(catch_data)
glimpse(catch_data)
```
```{r}
# First attempt: convert Chinook from character to numeric. This raises a
# coercion warning because one value cannot be parsed and becomes NA.
catch_clean <- catch_data %>%
  mutate(Chinook = as.numeric(Chinook))

# Locate the row position(s) where the conversion produced NA, and print them.
i <- which(is.na(catch_clean$Chinook))
i

# Show the same row(s) before conversion: the value is the capital letter "I"
# where the digit 1 was meant.
catch_data[i, ]

# Redo the cleaning from catch_data: replace the stray "I" with "1", then
# convert the column to integer. Expressions inside one mutate() run in
# order, so the second sees the result of the first.
catch_clean <- catch_data %>%
  mutate(
    Chinook = if_else(Chinook == "I", "1", Chinook),
    Chinook = as.integer(Chinook)
  )
glimpse(catch_clean)
```
## Change shape of data using *pivot_longer* and *pivot_wider*
```{r}
# pivot_longer practice: every column except Region and Year is stacked into
# two new columns.
#   names_to  - new column that receives the old column names ("species")
#   values_to - new column that receives the values from those columns
#               ("catch")
catch_long <- catch_clean %>%
  pivot_longer(
    cols = -c(Region, Year),
    names_to = "species",
    values_to = "catch"
  )
head(catch_long)
# View() opens a data viewer and needs an interactive session, so it is only
# called when running the chunk by hand, never while knitting.
if (interactive()) {
  View(catch_long)
}

# pivot_wider practice: spread species back out into one column per species,
# filled from catch. This reverts catch_long to the wide layout.
catch_wide <- catch_long %>%
  pivot_wider(names_from = species, values_from = catch)
head(catch_wide)
if (interactive()) {
  View(catch_wide)
}
```
## Practice *renaming* columns
```{r}
# Rename the catch column to catch_thousands (new name on the left, old name
# on the right), then preview the result.
catch_long <- catch_long %>%
  rename(catch_thousands = catch)

head(catch_long)
```
## Add columns with *mutate* function
```{r}
# Add a catch column computed as catch_thousands * 1000.
catch_long <- catch_long %>%
  mutate(catch = catch_thousands * 1000)

head(catch_long)

# Same computation written as one pipeline that also drops catch_thousands
# once catch has been derived from it.
catch_long <- catch_long %>%
  mutate(catch = catch_thousands * 1000) %>%
  select(-catch_thousands)

head(catch_long)
```
## Practice using *group_by* and *summarise*
```{r}
# Mean catch for each region
mean_region <- catch_long %>%
  group_by(Region) %>%
  summarise(catch_mean = mean(catch))

head(mean_region)

# Per-region summary with two columns: a row count and the summed catch
n_region <- catch_long %>%
  group_by(Region) %>%
  summarise(
    number_of_observations = n(), # number of rows in each group
    total_catch = sum(catch)
  )

head(n_region)
```
## Lesson of *filtering*
```{r}
# Keep only the rows whose Region is "ALU", then report how many there are.
ALU_catch <- catch_long %>%
  filter(Region == "ALU")

nrow(ALU_catch)
```
## Arrange some rows with *arrange*
```{r}
# Mean catch per region, sorted by that mean. arrange() sorts ascending
# (smallest to largest) unless told otherwise.
mean_region_arrange <- catch_long %>%
  group_by(Region) %>%
  summarise(mean_catch = mean(catch)) %>%
  arrange(mean_catch)

# For largest to smallest, wrap the column in desc():
#   arrange(desc(mean_catch))

head(mean_region_arrange)
```
## Practicing *joins*
```{r}
# Load the region definitions CSV and keep only the two columns needed for
# the join: code and mgmtArea.
# The URL is opened through url(..., method = "libcurl"), the same way the
# catch data is loaded earlier in this document, instead of passing the bare
# URL string to read.csv() (which the original note flagged as only working
# on a Mac).
region_defs <- read.csv(
  url(
    "https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.303.1",
    method = "libcurl"
  )
) %>%
  select(code, mgmtArea)

head(region_defs)
```
## Join our data
```{r}
# Left join: keep every row of catch_long and attach the region_defs columns
# where catch_long$Region matches region_defs$code.
catch_joined <- catch_long %>%
  left_join(region_defs, by = c("Region" = "code"))

head(catch_joined)
```
```{r}
# Alternative: rename the columns in region_defs first, so both tables share
# a key column called Region and the join can use a single name.
region_defs <- region_defs %>%
  rename(
    Region = code,
    Region_Name = mgmtArea
  )

catch_joined <- catch_long %>%
  left_join(region_defs, by = "Region")

head(catch_joined)
```
## Practice with *separate()* and *unite()*
```{r}
# Small example table of site codes shaped like "<island>-<number>"
sites_df <- data.frame(
  site = c("HAW-101", "HAW-103", "OAH-320", "OAH-219", "MAI-039")
)

# Split site at the "-" into two columns: island and site_number.
# The result is printed, not stored.
sites_df %>%
  separate(site, into = c("island", "site_number"), sep = "-")
```
```{r}
# unite() is the reverse of separate(): it pastes several columns together
# into one.
dates_df <- data.frame(
  year = rep("1930", 3),
  month = rep("12", 3),
  day = c("14", "15", "16")
)

head(dates_df)

# Combine year, month and day into a single date column joined by "-".
# Another separator such as "/" would work the same way.
dates_df %>%
  unite(col = date, year, month, day, sep = "-")
```
## Exercise: *Split the city column in the following data.frame into city and state_code columns*
```{r}
# Exercise data: one column holding "<city> <state code>" strings
cities_df <- data.frame(
  city = c("Juneau AK", "Sitka AK", "Anchorage AK")
)

head(cities_df)

# Split city at the space into city and state_code. The result is stored in
# a new object so it can be inspected: separate() returns a new data frame
# and leaves cities_df itself unchanged. The column is named state_code (no
# space), as the exercise asks.
cities_split <- cities_df %>%
  separate(city, into = c("city", "state_code"), sep = " ")

head(cities_split)
```