---
title: "NCSU ST 503 HW 8"
subtitle: "Problems 8.5, 9.4, 9.5, 9.6, Faraway, Julian J. Linear Models with R, Second Edition, Chapman & Hall / CRC Press."
author: "Bruce Campbell"
date: "`r format(Sys.time(), '%d %B, %Y')`"
fontsize: 12pt
header-includes:
- \usepackage{bbm}
output: pdf_document
---
```{r setup, include=FALSE,echo=FALSE}
rm(list = ls())
knitr::opts_chunk$set(echo = FALSE)
knitr::opts_chunk$set(dev = 'pdf')
knitr::opts_chunk$set(cache=TRUE)
knitr::opts_chunk$set(tidy=TRUE)
knitr::opts_chunk$set(prompt=FALSE)
knitr::opts_chunk$set(fig.height=5)
knitr::opts_chunk$set(fig.width=7)
knitr::opts_chunk$set(warning=FALSE)
knitr::opts_chunk$set(message=FALSE)
knitr::opts_knit$set(root.dir = ".")
library(latex2exp)
library(pander)
library(ggplot2)
library(GGally)
library(broom)
library(printr)
library(faraway)
```
## 8.5 Comparing model fitting methods with the stackloss data
Using the stackloss data, fit a model with stack.loss as the response and the other three variables as predictors using the following methods:
### (a) Least squares
```{r, echo = FALSE}
data(stackloss, package="faraway")
lm.fit <- lm(stack.loss ~ . , data=stackloss)
summary(lm.fit)
plot(fitted(lm.fit),residuals(lm.fit),xlab="Fitted",ylab="Residuals")
abline(h=0)
```
We see there may be an association between the variance of the residuals and the value of the response.
### (b) Least absolute deviations
We use the ```quantreg::rq``` method for $L^1$ regression; with the default $\tau = 0.5$ it fits the conditional median, which is the least absolute deviations estimate. It's worth reading the details of the algorithmic methods used to compute the fit. Also worthy of note is that ```quantreg::rq``` provides a lasso option for sparse regression.
```{r}
require(quantreg)
lm.ell1 <- rq(stack.loss ~ ., data= stackloss)
summary(lm.ell1)
```
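As a side note, a minimal sketch of the lasso option mentioned above; the penalty value ```lambda = 1``` is purely illustrative, not tuned:
```{r}
# Lasso-penalized L1 fit; lambda here is an illustrative choice, not tuned.
rq(stack.loss ~ ., data = stackloss, method = "lasso", lambda = 1)
```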
### (c) Huber method
We use the MASS::rlm() function to fit the model with the Huber loss.
```{r}
require(MASS)
lm.mestimator <- rlm(stack.loss ~ . ,data=stackloss)
summary(lm.mestimator)
```
```{r}
mest.weights <- lm.mestimator$w
names(mest.weights) <- row.names(stackloss)
head(sort(mest.weights),10)
```
We see that observations ```21```, ```4```, and ```3``` have weights less than 1. We will investigate these points in our diagnostics later.
### (d) Least trimmed squares
```{r}
least.trimmed.sq.fit <- ltsreg(stack.loss ~ . , data=stackloss)
coef(least.trimmed.sq.fit)
```
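Comparing the results, we can stack the coefficients from the four methods fitted above side by side:
```{r}
# Coefficients from all four fitting methods, stacked for comparison.
coef.compare <- rbind(ls = coef(lm.fit),
                      lad = coef(lm.ell1),
                      huber = coef(lm.mestimator),
                      lts = coef(least.trimmed.sq.fit))
pander(coef.compare, caption = "Coefficients by fitting method")
```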
### Now use diagnostic methods to detect any outliers or influential points. Remove these points and then use least squares. Compare the results.
#### Check Leverage
```{r}
df <- stackloss
numPredictors <- ncol(df) - 1
hatv <- hatvalues(lm.fit)
lev.cut <- 2 * (numPredictors + 1) / nrow(df)  # rule of thumb: 2p/n
high.leverage <- df[hatv > lev.cut, ]
pander(high.leverage, caption = "High Leverage Data Elements")
```
We've used the rule of thumb that points with leverage greater than $\frac{2p}{n}$, where $p$ is the number of model parameters and $n$ the number of observations, warrant a closer look.
#### Check for outliers.
```{r}
studentized.residuals <- rstudent(lm.fit)
max.residual <- studentized.residuals[which.max(abs(studentized.residuals))]
range.residuals <- range(studentized.residuals)
names(range.residuals) <- c("left", "right")
pander(data.frame(range.residuals=t(range.residuals)), caption="Range of Studentized residuals")
p<-numPredictors+1
n<-nrow(df)
t.val.alpha <- qt(.05/(n*2),n-p-1)
pander(data.frame(t.val.alpha = t.val.alpha), caption = "Bonferroni corrected t-value")
outlier.index <- abs(studentized.residuals) > abs(t.val.alpha)
outliers <- df[outlier.index==TRUE,]
if(nrow(outliers)>=1)
{
pander(outliers, caption = "outliers")
}
```
Here we flag studentized residuals that fall outside the interval defined by the Bonferroni-corrected critical t-value.
#### Check for influential points.
We plot Cook's distances and the residual-leverage plot with level-set contours of Cook's distance.
```{r}
plot(lm.fit,which =4)
plot(lm.fit,which = 5)
```
#### Check for structure in the model.
##### Plot residuals versus predictors
```{r}
predictors <- names(lm.fit$coefficients)[-1]  # drop the intercept
for (predictor in predictors) {
  plot(df[, predictor], residuals(lm.fit), xlab = predictor, ylab = "Residuals",
       main = paste(predictor, " versus residuals", sep = ''))
}
```
#### Perform partial regression
```{r}
predictors <- names(lm.fit$coefficients)[-1]  # drop the intercept
response <- formula(lm.fit)[[2]]
for (predictor in predictors) {
  others <- setdiff(predictors, predictor)
  # Regress both the response and the chosen predictor on the remaining
  # predictors, then plot the two sets of residuals against each other.
  d.formula <- formula(paste(response, "~", paste(others, collapse = " + ")))
  m.formula <- formula(paste(predictor, "~", paste(others, collapse = " + ")))
  d <- residuals(lm(d.formula, df))
  m <- residuals(lm(m.formula, df))
  plot(m, d, xlab = paste(predictor, " residuals", sep = ''),
       ylab = "response residuals",
       main = paste("Partial regression plot for ", predictor, sep = ''))
}
```
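Finally, we remove the flagged points and refit by least squares. As a sketch, we drop the cases down-weighted by the Huber fit above (```3```, ```4```, and ```21```); the exact set of points to remove is a judgment call based on all of the diagnostics:
```{r}
# Least squares refit after dropping the cases flagged by the diagnostics.
flagged <- c(3, 4, 21)
lm.fit.cleaned <- lm(stack.loss ~ ., data = stackloss[-flagged, ])
summary(lm.fit.cleaned)
```
Comparing with the original least squares fit indicates how sensitive the coefficient estimates are to these few points.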
## 9.4 Using transformations in model of pressure data
Use the pressure data to fit a model with pressure as the response and temperature as the predictor using transformations to obtain a good fit.
```{r}
rm(list = ls())
data(pressure, package="faraway")
lm.fit <- lm(pressure ~ . , data=pressure)
summary(lm.fit)
plot(fitted(lm.fit),residuals(lm.fit),xlab="Fitted",ylab="Residuals", main = "fitted versus residuals for pressure ~ temperature")
abline(h=0)
plot(pressure$temperature,log(pressure$pressure), main = "temperature versus log(pressure) ")
```
Based on the plots above we look into fitting a series of models of the form $\log(\text{pressure}) \sim \sum_i \beta_i \, \text{temperature}^i$.
We note this data looks highly regular and appears to originate from a physical process, so there is clearly some functional relationship between these variables; knowing it may help our modelling, and $PV=nRT$ is a good place to start! We also note that there are only 19 observations in this data set, so we should not fit too many models or add too many predictors in looking for a good fit.
```{r}
lm.fit.polynomial <- lm(log(pressure) ~ temperature + I(temperature^2), data=pressure)
summary(lm.fit.polynomial)
plot(fitted(lm.fit.polynomial),residuals(lm.fit.polynomial),xlab="Fitted",ylab="Residuals")
abline(h=0)
```
```{r}
lm.fit.polynomial <- lm(log(pressure) ~ temperature + I(temperature^2)+ I(temperature^3), data=pressure)
summary(lm.fit.polynomial)
plot(fitted(lm.fit.polynomial),residuals(lm.fit.polynomial),xlab="Fitted",ylab="Residuals")
abline(h=0)
```
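The quadratic and cubic fits can be compared with a formal F-test. A minimal sketch, refitting with ```poly()``` so the polynomial terms are orthogonal:
```{r}
# F-test of cubic vs. quadratic; poly() gives orthogonal polynomial terms.
fit2 <- lm(log(pressure) ~ poly(temperature, 2), data = pressure)
fit3 <- lm(log(pressure) ~ poly(temperature, 3), data = pressure)
anova(fit2, fit3)
```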
## 9.5 Use transformations to find a good model for volume in terms of girth and height using the trees data.
```{r}
rm(list = ls())
data(trees, package="faraway")
lm.fit <- lm(sqrt(Volume) ~ Girth + Height , data=trees)
summary(lm.fit)
plot(fitted(lm.fit),residuals(lm.fit),xlab="Fitted",ylab="Residuals", main = "fitted versus residuals for sqrt(Volume) ~ Girth + Height")
abline(h=0)
plot(trees$Girth,trees$Volume)
plot(trees$Height,trees$Volume)
```
We chose a square-root transformation of the response after seeing a quadratic relationship between the fitted values and the residuals. Now we use the Box-Cox method to validate our choice.
```{r}
lm.fit.notransform <- lm(Volume ~ Girth + Height , data=trees)
boxcox(lm.fit.notransform, plotit=T)
boxcox(lm.fit.notransform, plotit=T, lambda=seq(0.2,.5,by=0.01))
```
We see that Box-Cox suggests $\lambda \approx 0.3$.
```{r}
lm.fit <- lm(Volume^0.3 ~ Girth + Height , data=trees)
summary(lm.fit)
```
Indeed the fit looks better, though we note the residual standard errors are not directly comparable since the two responses are on different scales. A $\lambda$ near $1/3$ is also physically sensible: volume scales as the cube of a length.
## 9.6 Response surface for odor data
### (a) Fit a second order response surface for the odor response using the other three variables as predictors. How many parameters does this model use and how many degrees of freedom are left?
A full second-order response surface in three predictors has $\binom{3+2}{2} = 10$ parameters: one intercept, three linear terms, three squared terms, and three interaction terms.
```{r}
rm(list = ls())
data(odor, package="faraway")
df <- odor
lm.fit <- lm(odor ~ polym(temp,gas,pack,degree=2),odor)
summary(lm.fit)
```
As expected there are 9 predictors plus the intercept, leaving only $15 - 10 = 5$ residual degrees of freedom. Any model with this many parameters and so few degrees of freedom is dubious.
### (b) Fit a model for the same response but now excluding any interaction terms but including linear and quadratic terms in all three predictors. Compare this model to the previous one. Is this simplification justified?
```{r}
lm.fit.nointeractions <- lm(odor ~ temp + gas + pack + I(temp^2) + I(gas^2) + I(pack^2), data=odor)
summary(lm.fit.nointeractions)
```
Based on the adjusted $R^2$ the simplification is justified. Since the no-interaction model is nested in the full second-order model, we can also test this formally, as shown below.
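A minimal sketch of the F-test comparing the two nested fits:
```{r}
# F-test: does dropping the interaction terms significantly worsen the fit?
anova(lm.fit.nointeractions, lm.fit)
```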
### (c) Use the previous model to determine the values of the predictors which result in the minimum predicted odor.
```{r}
yhat <- lm.fit.nointeractions$fitted.values
df <- as.data.frame(cbind(df, yhat))
index <- which.min(df$yhat)  # row with the minimum fitted odor
row.in.df <- df[index, ]
pander(data.frame(row.in.df), caption = "Predictor values resulting in minimum fitted value")
```
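Since the fitted surface is additive and quadratic in each predictor, we can also locate its stationary point analytically: setting each partial derivative to zero gives $x_i = -b_i / (2 b_{ii})$, a minimum whenever the quadratic coefficients $b_{ii}$ are positive. A minimal sketch:
```{r}
# Analytic stationary point of the additive quadratic surface:
# each coordinate solves b_i + 2 * b_ii * x_i = 0.
b <- coef(lm.fit.nointeractions)
stationary <- c(temp = unname(-b["temp"] / (2 * b["I(temp^2)"])),
                gas  = unname(-b["gas"]  / (2 * b["I(gas^2)"])),
                pack = unname(-b["pack"] / (2 * b["I(pack^2)"])))
pander(data.frame(t(stationary)), caption = "Analytic minimizer of the fitted surface")
```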