-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata-analysis.Rmd
422 lines (361 loc) · 14.2 KB
/
data-analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
---
title: "SOCS-SOC"
author: "Stephen Wood"
date: "8/6/2019"
output: md_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
# LOAD PACKAGES
library(tidyverse) # For data manipulation
library(readxl) # For reading in .xls files
library(VSURF) # For random forest model selection
library(doBy)
library(lme4) # For mixed-effects regression
library(lmerTest) # For p-value calculations of lmer models
library(MuMIn)
library(sjstats)
library(jtools) # For effect plots
```
## Salinas Organic Cropping Systems Experiment
### Read in the data files
The data are stored in separate files. usda\_soil\_data has the soil
carbon fractionation data analyzed by Wood, icp has the data analyzed by
UC Davis, and the other site-level descriptors are in the other files.
```{r}
# READ IN ALL DATA FILES
usda_soil_data <- read_excel("data/usda-soil-data.xlsx") %>%
mutate(
MAOM_CN = `MAOM C` / `MAOM N`,
POM_CN = `POM C` / `POM N`
)
bd <- read_excel("data/site-data.xlsx",
sheet = "Bulk Density Data",
col_types = c("skip", "skip", "numeric", "numeric", "text", "text", "numeric", "numeric", "numeric")
) %>%
filter(calyr == "2011") %>%
mutate(
rep = paste0("Rep ", rep)
)
inputs <- read_excel("data/site-data.xlsx",
sheet = "Org Matter and N inputs Summary",
col_types = c(
"skip", "skip", "text", "numeric", "numeric", "numeric",
"numeric", "numeric", "numeric", "numeric", "numeric",
"numeric", "numeric", "numeric", "numeric", "numeric",
"numeric", "numeric", "numeric", "numeric"
)
) %>%
mutate(
rep = paste0("Rep ", rep)
)
icp <- read_excel("data/icp.xlsx")
```
### Data manipulation
Now we need to do some slight manipulations to rename variables, merge
the data sets together, and select the variables that we'll want to used
for analysis.
```{r}
inputsLabels <- colnames(inputs)
colnames(inputs)[4:18] <- c(
"total_compost",
"annual_cc_shoot",
"total_cc_shoot",
"annual_legume_cc_shoot",
"total_legume_cc_shoot",
"n_fixation",
"compost_n",
"fertilizer_n",
"total_n",
"perc_n_from_fixation",
"total_veg_residue_shoot",
"total_om",
"fresh_om",
"fresh_om_perc",
"total_C_no_roots_no_exudates"
)
inputs$RepTreat <- paste(inputs$trt, inputs$rep)
usda_soil_data$RepTreat <- paste(usda_soil_data$Trt_id, usda_soil_data$Replicate)
bd$RepTreat <- paste(bd$trt, bd$rep)
icp$RepTreat <- paste(icp$Trt_id, icp$Replicate)
all_data <- merge(usda_soil_data, bd, by = "RepTreat") %>%
select(RepTreat, System, Replicate, Trt_id, cover_crop, Compost:POM_CN, blkden) %>%
mutate(
pom.stock = (`POM C`*blkden*30)/10,
pom.n.stock = (`POM N`*blkden*30)/10,
maom.stock = (`MAOM C`*blkden*30)/10,
maom.n.stock = (`MAOM N`*blkden*30)/10,
totC.stock = pom.stock + maom.stock
)
all_data <- all_data %>%
merge(inputs, by = "RepTreat") %>%
select(RepTreat,System,Replicate:totC.stock, total_compost:total_C_no_roots_no_exudates)
all_data$CC <- recode(all_data$Trt_id,
leg3x="Legume-rye every year",
mus1x="Mustard every year",
nocc="Legume-rye every four years",
noccnocp="Legume-rye every four years",
rye1x="Rye every year")
all_data <- all_data %>%
merge(icp %>% select(IC,EC:RepTreat), by="RepTreat")
rm(bd); rm(inputs); rm(usda_soil_data); rm(icp)
```
### Models of management effect on soil fractions
Now that we have the data, let's look through some models. We are going
to used hierarchical models with a random intercept for replicate block.
This is to capture random differences among plots within a block.
Let's start with a series of models, using all of the data, and looking
at the impact on particulate and mineral-associated C and microbial
biomass.
```{r}
# Particulate carbon
lmer(pom.stock ~ Compost + Cover_crop_freq + cover_crop + (1|Replicate),
data = all_data
) -> pom.model
summary(pom.model)
r.squaredGLMM(pom.model)
rm(pom.model)
# Mineral-associated carbon
lmer(maom.stock ~ Compost + Cover_crop_freq + cover_crop + (1|Replicate),
data = all_data
) -> maom.model
summary(maom.model)
r.squaredGLMM(maom.model)
rm(maom.model)
# Microbial biomass
lmer(substrate_induced_respiration ~ Compost + Cover_crop_freq + cover_crop + (1|Replicate),
data = all_data
) -> sir.model
summary(sir.model)
r.squaredGLMM(sir.model)
rm(sir.model)
```
Looking at the model estimates, there does not seem to be a relationship
between cover crop type and our variables of interest. We could continue
by dropping that variable from the model and re-running, but that would
take the average effect of the other variables across systems with
different cover crops. Even though cover crop type does not have an
effect in our statistical model, for the remaining comparisons we felt
that it makes more sense to only look at the variables that we know have
the same cover crop type.
Let's filter out observations that are not legume rye, and re-run the
models. We'll also select a smaller set of predictor variables that
might be of special interest.
```{r}
leg_rye <- all_data %>%
filter(cover_crop=="legume-rye") %>%
select(Replicate, cover_crop, Compost, Cover_crop_freq, organic_matter,
water_holding_capacity,substrate_induced_respiration:POM_CN,pom.stock,
pom.n.stock, maom.stock, maom.n.stock,
total_C_no_roots_no_exudates, CEC, pH, `X-Ca`
)
# Organic matter
lmer(organic_matter ~ Compost + Cover_crop_freq + (1|Replicate),
data = leg_rye
) -> om.model
summary(om.model)
r.squaredGLMM(om.model)
rm(om.model)
# Particulate C
lmer(pom.stock ~ Compost + Cover_crop_freq + (1|Replicate),
data = leg_rye
) -> pom.model
summary(pom.model)
r.squaredGLMM(pom.model)
rm(pom.model)
# Particulate N
lmer(pom.n.stock ~ Compost + Cover_crop_freq + (1|Replicate),
data = leg_rye
) -> pom.n.model
summary(pom.n.model)
r.squaredGLMM(pom.n.model)
rm(pom.n.model)
# Mineral-associated C
lmer(maom.stock ~ Compost + Cover_crop_freq + (1|Replicate),
data = leg_rye
) -> maom.model
summary(maom.model)
r.squaredGLMM(maom.model)
rm(maom.model)
# Mineral-associated N
lmer(maom.n.stock ~ Compost + Cover_crop_freq + (1|Replicate),
data = leg_rye
) -> maom.n.model
summary(maom.n.model)
r.squaredGLMM(maom.n.model)
rm(maom.n.model)
```
For our model of microbial biomass--measured by substrate induced
respiration--we're going to take a slightly different approach. Whereas
for the previous models we were mainly interested in how soil fractions
respond to management, now we think that microbial biomass could depend
on the stock sizes themselves, which we might want to control for. So we
can look at including stocks as predictor variables. For this we're
going to actually keep all of the data, rather than drop cover crop
type, because visual inspection suggested that microbial biomass might
be slightly higher with leguminous cover crops.
```{r}
# SIR
lmer(substrate_induced_respiration ~ cover_crop + Compost + Cover_crop_freq + (1|Replicate),
data = all_data
) -> sir.model
summary(sir.model)
r.squaredGLMM(sir.model)
rm(sir.model)
# SIR, depending on N input to microbes
lmer(substrate_induced_respiration ~ cover_crop + Compost + Cover_crop_freq + pom.n.stock + (1|Replicate),
data = all_data
) -> sir.model.cn
car::vif(sir.model.cn)
summary(sir.model.cn)
r.squaredGLMM(sir.model.cn)
rm(sir.model.cn)
```
### Predicting soil fractions by continuous inputs
There is a certain feedback between some of the treatments and the
drivers of soil fraction. For instance, cover crops increase soil carbon
pools partially because of root exudation. More cover crop biomass can
lead to more root inputs. On plots with compost, cover crop biomass can
be greater, which can mean different plots have different amounts of
carbon going into the soil through cover crops. In other words, cover
crop frequency is not a perfect categorical variable to capture the
amount of carbon going belowground.
We estimated the quantitative amount of nitrogen and carbon inputs going
into the system. Let's fit some models to explore if any of these
variables predict soil fractions.
First, we're going to use a machine learning algorithm called Random
Forest to see which of the quantitative input variables are the most
closely related to the fractions that we're interested in. We're going
to create a list of all of the different Random Forest models for all of
our outcomes of interest
```{r}
rf.res <- list(
VSURF(
y = all_data$organic_matter,
x = all_data %>%
select(total_compost:total_C_no_roots_no_exudates),
parallel = TRUE
),
VSURF(
y = all_data$pom.stock,
x = all_data %>%
select(total_compost:total_C_no_roots_no_exudates),
parallel = TRUE
),
VSURF(
y = all_data$maom.stock,
x = all_data %>%
select(total_compost:total_C_no_roots_no_exudates),
parallel = TRUE
),
VSURF(
y = all_data$substrate_induced_respiration,
x = all_data %>%
select(total_compost:total_C_no_roots_no_exudates),
parallel = TRUE
)
)
```
Now we can look at each of these model results and see which variables
are the most relevant. This Random Forest procedure generates lists of
variables most suitable for both interpretatoin and prediction. Briefly,
prediction is more strict in the number of variables. We're not using
our data for predictive modeling, so we'll look at the variables
identified for interpretation.
```{r}
for(i in 1:length(rf.res)){
vars = rf.res[[i]]$varselect.interp
# Print the response variable
print(rf.res[[i]]$call$y)
# Print the predictor variables selected
all_data %>%
select(total_compost:total_C_no_roots_no_exudates) %>%
select(vars) %>% names() %>% print()
}
```
Let's fit these models. Note that some of these variables are highly correlated with each other so interpretation of the individual coefficients would be invalid for models with multiple predictor variables. Also, we will use OLS to estimates these models rather than ME because the random effects do not add any explanatory power, as you'll see with the example of the first model.
```{r}
# Organic matter
lm.om <- lmer(organic_matter ~ total_C_no_roots_no_exudates + total_om + total_cc_shoot + fresh_om_perc + (1|Replicate), data=all_data)
summary(lm.om)
r.squaredGLMM(lm.om)
lm.om <- lm(organic_matter ~ total_C_no_roots_no_exudates + total_om + total_cc_shoot + fresh_om_perc, data=all_data)
summary(lm.om)
car::vif(lm.om)
# Particulate
lm.pom <- lm(pom.stock ~ fresh_om, data=all_data)
summary(lm.pom)
# Mineral
lm.maom <- lm(maom.stock ~ total_veg_residue_shoot + annual_legume_cc_shoot, data=all_data)
summary(lm.maom)
# SIR
lm.sir <- lm(substrate_induced_respiration ~ annual_cc_shoot, data=all_data)
summary(lm.sir)
```
A surprising part of these models is how poorly the fixed effects
predict mineral-associated C, while how well the random effects do. This
says that there's an important component of among-replicate variation
that we are not capturing with fixed effects. Since mineral-associated C
is known to be maintained by physical and chemical properties of the
soil, there might be some soil properties that explain this variation.
We would not expect soil properties to be as important to particulate C,
because that C is mainly in the form of partially broken down plant
material and it is not as reactive with soil surfaces.
To explore the potential for soil properties to impact
mineral-associated carbon we used a tool called two-stage least squares
regression. This approach, which is more commonly used by economists,
first models the effect of the management practices on
mineral-associated carbon (we've already done this). Then we want to
know how much of the left-over variation we can explain with other soil
properties. To do that, we extract the residuals from the first
regression and regress those against soil properties we think might be
important.
Let's start with the management model, even though we've already done
this above. First we're going to use Random Forest to identify the most
important management properties for mineral-associated C. Then we'll
used those predictors in a linear model. We're not using random effects
because we want to try to explain the variance that the random effects
were capturing with new fixed effects.
```{r}
# Determine best management predictors of MAOM
mgmt <- VSURF(
y = all_data$maom.stock,
x = all_data %>%
select(total_compost:total_C_no_roots_no_exudates),
parallel = TRUE
)
# Fit model with best management predictors
mgmt.model <- lm(maom.stock ~ Compost + Cover_crop_freq + total_veg_residue_shoot + annual_legume_cc_shoot, data = all_data)
mgmt.model %>% summary()
```
Now that we have our management model, we can use a Random Forest
approach to identify the variables related to the residuals of the
previous model. We can then use those in a model. We'll then plot the
effect of each of the final selected variables on the residuals. This
will show which variables are the most important to explaining the
variation in mineral-associated C that is not explained by management.
We will also look at model fit statistics (like R2) to see how well this
model does.
```{r}
# Find best non-management predictors of model residuals
soil.prop <- VSURF(
y = mgmt.model$residuals,
x = all_data %>%
select(perc_sand:perc_clay, EC:`Fe (DTPA)`),
parallel = TRUE
)
```
```{r}
# Create data frame of resulting variables, also including clay
resid.data <- data.frame(mgmt.model$residuals, all_data$perc_clay, all_data$`X-K`, all_data$`Ca (SP)`, all_data$`Mn (DTPA)`)
names(resid.data) <- c("model_residuals", "perc_clay", "K", "Ca", "Mn")
resid.model <- lm(model_residuals ~ perc_clay + K + Ca + Mn, data = resid.data)
resid.model %>% summary()
jtools::effect_plot(model = resid.model, pred = Ca, interval = TRUE, rug = TRUE, plot.points = TRUE, partial.residuals = TRUE)
jtools::effect_plot(model = resid.model, pred = K, interval = TRUE, rug = TRUE, plot.points = TRUE, partial.residuals = TRUE)
jtools::effect_plot(model = resid.model, pred = perc_clay, interval = TRUE, rug = TRUE, plot.points = TRUE, partial.residuals = TRUE)
jtools::effect_plot(model = resid.model, pred = Mn, interval = TRUE, rug = TRUE, plot.points = TRUE, partial.residuals = TRUE)
```
The adjusted R2 of this model is 0.3! That's a big improvement
over the management model, which was barely predicting 10% of the
variation.