-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathLecture 9 Lab.Rmd
302 lines (245 loc) · 8.86 KB
/
Lecture 9 Lab.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
---
title: "Lab 9"
author: "Nadir Nibras"
date: "March 7, 2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Lab 9
### Loading datasets
```{r}
library(ISLR)
x=model.matrix (Salary~.,Hitters )[,-1]
y=Hitters$Salary
y= y[complete.cases(y)]
```
The model.matrix() function is particularly useful for creating x; not only does it produce a matrix corresponding to the 19 predictors but it also automatically transforms any qualita tive variables into dummy variables.
The latter property is important because glmnet() can only take numerical, quantitative inputs.
## Ridge regression
alpha = 0 means ridge regresion model is fit
```{r}
library (glmnet)
grid =10^ seq (10,-2, length =100)
ridge.mod =glmnet(x,y,alpha =0, lambda =grid)
```
-The glmnet() function performs ridge regression for an automatically selected range of ?? values
-We have chosen to implement the function over a grid of values ranging from ?? = 10^10 to ?? = 10^???2,
B-y default, the glmnet() function standardizes the variables so that they are on the same scale. To turn off this default setting, use the argument standardize=FALSE.
Coef will give us the weights + coefficient for all 100 values of ?? that we trained on
```{r}
dim(coef(ridge.mod ))
# Note that coefficients get smaller as ?? gets bigger
# coef(ridge.mod)
```
### Comparing Ridge vs Lasso regression
#### creating train and test sets
```{r}
set.seed (1)
train=sample (1: nrow(x), nrow(x)/2)
test=(- train )
y.test=y[test]
```
#### Observing the MSE for lambda or s= 4
```{r}
ridge.mod =glmnet(x[train,],y[train],alpha =0, lambda =grid ,thresh =1e-12)
ridge.pred=predict (ridge.mod ,s=4, newx=x[test ,])
mean(( ridge.pred -y.test)^2)
```
#### Using large value of lambda or s= 1e10, we see a larger MSE
```{r}
ridge.pred=predict (ridge.mod ,s=1e10 ,newx=x[test ,])
mean(( ridge.pred -y.test)^2)
```
#### This is the same as setting all weights to 0 and just using the intercept/average of the training outputs to predict the test outputs
```{r}
mean(( mean(y[train ])-y.test)^2)
```
Now to check predictions for when lambda= 0
```{r}
ridge.pred=predict(ridge.mod ,s=0, newx=x[test ,])
mean((ridge.pred -y.test)^2)
lm(y???x, subset =train)
predict (ridge.mod ,s=0, type="coefficients") [1:20 ,]
```
#### Using cross-validation to pick the best lambda
```{r}
set.seed (1)
cv.out =cv.glmnet (x[train ,],y[train],alpha =0)
plot(cv.out)
bestlam =cv.out$lambda.min
bestlam
```
#### Finding the best MSE with the best lambda
```{r}
ridge.pred=predict (ridge.mod ,s=bestlam ,newx=x[test ,])
mean(( ridge.pred -y.test)^2)
```
### Refitting ridge regression model using the best lambda
```{r}
out=glmnet (x,y,alpha =0)
predict(out ,type="coefficients",s=bestlam )[1:20 ,]
```
As expected, none of the coefficients are 0
## 6.6.2 Lasso regression
```{r}
lasso.mod =glmnet (x[train ,],y[train],alpha =1, lambda =grid)
plot(lasso.mod)
```
#### Finding MSE using the best lambda
```{r}
set.seed (1)
cv.out =cv.glmnet (x[train ,],y[train],alpha =1)
plot(cv.out)
bestlam =cv.out$lambda.min
lasso.pred=predict (lasso.mod ,s=bestlam ,newx=x[test ,])
mean(( lasso.pred -y.test)^2)
```
Similar MSE but do note that 12 of the 19 initial variables are now 0
```{r}
out=glmnet (x,y,alpha =1, lambda =grid)
lasso.coef=predict (out ,type ="coefficients",s=bestlam )[1:20 ,]
lasso.coef
```
## Lab 6.5.1 Best Subset selection
```{r}
library (ISLR)
# fix(Hitters )
names(Hitters )
dim(Hitters )
sum(is.na(Hitters$Salary))
```
Removing null values
```{r}
Hitters =na.omit(Hitters )
dim(Hitters )
```
```{r}
library (leaps)
regfit.full=regsubsets (Salary???.,Hitters )
summary (regfit.full)
```
the nvmax option can be used in order to return as many variables as are desired. Here we fit up to a 19-variable model.
```{r}
regfit.full=regsubsets (Salary???.,data=Hitters ,nvmax =19)
reg.summary =summary (regfit.full)
# Checking the variables we have for evaluating our results
names(reg.summary)
```
```{r}
reg.summary$rsq
```
Plotting RSS, adjusted R2, Cp, and BIC for all of the models at once will help us decide which model to select. Note the type="l" option tells R to connect the plotted points with lines.
```{r}
par(mfrow =c(2,2))
plot(reg.summary$rss ,xlab="Number of Variables ",ylab=" RSS", type="l")
plot(reg.summary$adjr2 ,xlab ="Number of Variables ", ylab=" Adjusted RSq",type="l")
# The red point indicates the best Rsquared value
which.max(reg.summary$adjr2)
points(11, reg.summary$adjr2[11], col ="red",cex =2, pch =20)
```
Repeating for Cp and BIC models
```{r}
plot(reg.summary$cp ,xlab ="Number of Variables ",ylab="Cp", type='l')
which.min (reg.summary$cp )
points (10, reg.summary$cp [10], col ="red",cex =2, pch =20)
which.min (reg.summary$bic )
plot(reg.summary$bic ,xlab=" Number of Variables ",ylab=" BIC",type='l')
points(6, reg.summary$bic[6], col ="red",cex =2, pch =20)
```
The regsubsets() function has a built-in plot() command which can be used to display the selected variables for the best model with a given number of predictors, ranked according to the BIC, Cp, adjusted R2, or AIC. To find out more about this function, type ?plot.regsubsets
```{r}
plot(regfit.full ,scale ="r2")
plot(regfit.full ,scale ="adjr2")
plot(regfit.full ,scale ="Cp")
plot(regfit.full ,scale ="bic")
```
We can use the coef() function to see the coefficient estimates associated with this model.
```{r}
coef(regfit.full ,6)
```
## 6.5.2 Forward and Backward Stepwise Selection
We can also use the regsubsets() function to perform forward stepwise or backward stepwise selection, using the argument method="forward" or method="backward".
```{r}
regfit.fwd=regsubsets(Salary~.,data=Hitters ,nvmax =19,method ="forward")
summary (regfit.fwd )
regfit.bwd=regsubsets (Salary~.,data=Hitters ,nvmax =19,method ="backward")
summary (regfit.bwd )
```
We see that using forward stepwise selection, the best onevariable model contains only CRBI, and the best two-variable model additionally includes Hits. For this data, the best one-variable through sixvariable models are each identical for best subset and forward selection.
However, the best seven-variable models identified by forward stepwise selection, backward stepwise selection, and best subset selection are different:
```{r}
coef(regfit.full ,7)
coef(regfit.fwd ,7)
coef(regfit.bwd ,7)
```
#### Separating train and test sets
```{r}
set.seed (1)
train=sample (c(TRUE ,FALSE), nrow(Hitters ),rep=TRUE)
test =(!train )
```
Training our mdoel on the train sets
```{r}
regfit.best=regsubsets (Salary~.,data=Hitters [train ,], nvmax =19)
# The model.matrix() function is used in many regression packages for buildmodel. ing an "X" matrix from data.
test.mat=model.matrix (Salary???.,data=Hitters [test ,])
```
Now we run a loop, and for each size i, we matrix() extract the coefficients from regfit.best for the best model of that size, multiply them into the appropriate columns of the test model matrix to form the predictions, and compute the test MSE.
```{r}
val.errors =rep(NA ,19)
for(i in 1:19){
coefi=coef(regfit.best ,id=i)
pred=test.mat [,names(coefi)]%*% coefi
val.errors [i]= mean(( Hitters$Salary[test]-pred)^2)
}
```
The vector val.errors is storing the MSE(s) for different number of varialbes
```{r}
val.errors
which.min (val.errors )
```
This was a little tedious, partly because there is no predict() method for regsubsets(). Since we will be using this function again, we can capture our steps above and write our own predict method.
```{r}
predict.regsubsets =function (object ,newdata ,id ,...){
form=as.formula (object$call [[2]])
mat=model.matrix(form ,newdata )
coefi =coef(object ,id=id)
xvars =names (coefi )
mat[,xvars ]%*% coefi
}
```
Now we find the best subsets using the full data-set
```{r}
regfit.best=regsubsets (Salary~.,data=Hitters ,nvmax =19)
coef(regfit.best ,10)
```
Using cross-validation to choose between models of different sizes
```{r}
k=10
set.seed (1)
folds=sample (1:k,nrow(Hitters ),replace =TRUE)
cv.errors =matrix (NA ,k,19, dimnames =list(NULL , paste (1:19) ))
```
```{r}
for(j in 1:k){
best.fit =regsubsets (Salary???.,data=Hitters [folds !=j,], nvmax =19)
for(i in 1:19) {
pred=predict.regsubsets(best.fit ,Hitters [folds ==j,], id=i)
cv.errors [j,i]=mean( (Hitters$Salary[folds ==j]-pred)^2)
}
}
```
This has given us a 10×19 matrix, of which the (i, j)th element corresponds to the test MSE for the ith cross-validation fold for the best j-variable model. We use the apply() function to average over the columns of this matrix in order to obtain a vector for which the jth element is the crossvalidation error for the j-variable model.
```{r}
mean.cv.errors =apply(cv.errors ,2, mean)
mean.cv.errors
par(mfrow =c(1,1))
plot(mean.cv.errors ,type= 'b')
```
We see that cross-validation selects an 11-variable model. We now perform best subset selection on the full data set in order to obtain the 11-variable model.
```{r}
reg.best=regsubsets (Salary???.,data=Hitters , nvmax =19)
coef(reg.best ,11)
```