library(plyr)
library(dplyr)
library(xgboost)
library(tidyr)
library(dummies)
### SECTION ONE - COMBINING ###
#Combine test and train into one feature set
test <- read.csv("test.csv", stringsAsFactors = FALSE)
train <- read.csv("train.csv", stringsAsFactors = FALSE)
df <- rbind.fill(train, test)
rm(test, train) #Remove to save memory
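#A possible alternative, if you'd rather avoid the plyr dependency:
#dplyr::bind_rows() does the same NA-filling row bind
# df <- bind_rows(train, test)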
### SECTION TWO - FEATURE ENGINEERING & CLEANING ###
#Feature Engineering
#We need to combine year and month to get a better date attribute
#Use the lazy way of adding the month as a fraction of the year attribute
df$better_date <- df$yearOfRegistration + (df$monthOfRegistration / 12)
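#Optional sanity check on the new attribute (not part of the original workflow):
#a car registered in June should map to year + 0.5
head(df[, c("yearOfRegistration", "monthOfRegistration", "better_date")])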
#Let's keep things simple:
#convert the categorical variables "brand" and "notRepairedDamage" into dummy variables
#The dummies package is a lifesaver for this conversion process
df <- df %>% dummy.data.frame(names = c("brand", "notRepairedDamage"))
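#If the dummies package is unavailable (it has been archived on CRAN), a rough
#base-R sketch of the same one-hot encoding uses stats::model.matrix; note it
#differs slightly in NA handling and column naming
# dummy_cols <- model.matrix(~ brand + notRepairedDamage - 1, data = df)
# df <- cbind(df[, !(names(df) %in% c("brand", "notRepairedDamage"))], dummy_cols)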
### SECTION THREE - MODELLING ###
#Get the features we want along with the target variable (price)
#For this simple model, we use the vehicle brand, whether it has had repairs, kilometers driven and power to predict price
train_df <- df %>%
  select(-yearOfRegistration, -monthOfRegistration, -postalCode, -gearbox, -fuelType, -vehicleType, -model, -id) %>%
  filter(!is.na(price)) %>%
  apply(2, as.numeric) %>% #xgboost needs a numeric matrix, so convert every column to numeric
  as.data.frame()
#convert to xgb matrix
xgb_mat <- xgb.DMatrix(data = as.matrix(select(train_df, -price)),
                       label = train_df$price)
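#Optional shape check: xgboost defines dim() for xgb.DMatrix objects,
#so we can confirm the matrix has the expected rows and columns
dim(xgb_mat)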
#Create our parameter input
#These are the default values; experiment and see how the xgb.cv score changes!
param_list = list(eta = 0.3,
                  gamma = 0,
                  max_depth = 6,
                  min_child_weight = 1,
                  max_delta_step = 0,
                  subsample = 1,
                  colsample_bytree = 1,
                  colsample_bylevel = 1)
set.seed(117)
#Do cross-validation to select the best model/parameters
cv_model <- xgb.cv(data = xgb_mat,
                   objective = "reg:linear", #Linear regression
                   eval_metric = "rmse", #Root mean square error
                   params = param_list,
                   nrounds = 10000, #Set high; early_stopping_rounds terminates the CV for us
                   nfold = 4, #4-fold CV
                   early_stopping_rounds = 20) #Stop after 20 rounds without an improved test score
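#Optional: inspect the CV results - the chosen iteration and the tail of the
#evaluation log (both fields are produced by xgb.cv and used again below)
cv_model$best_iteration
tail(cv_model$evaluation_log)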
#With default parameters this yields a test score of...
rmse_score <- as.integer(cv_model$evaluation_log$test_rmse_mean[cv_model$best_iteration])
#Train the actual xgb model
#Use the same parameters and the best number of iterations from our cross-validation model
xgb_model <- xgb.train(data = xgb_mat,
                       objective = "reg:linear",
                       eval_metric = "rmse",
                       params = param_list,
                       nrounds = cv_model$best_iteration)
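#To watch training RMSE while the model fits (optional, not in the original
#script), xgb.train accepts a watchlist; a sketch:
# xgb.train(data = xgb_mat, params = param_list, nrounds = cv_model$best_iteration,
#           objective = "reg:linear", eval_metric = "rmse",
#           watchlist = list(train = xgb_mat), print_every_n = 10)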
#Look at the most important attributes of the model
xgb.importance(feature_names = colnames(select(train_df, -price)), model = xgb_model)
#With default parameters and our chosen features, date, power and kilometers are the top 3 most important features.
#Porsche has the most gain among brands and is more "important" than the repair status.
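#To visualise that table, xgb.plot.importance() from the xgboost package draws
#a bar chart of the same information; e.g. the top 10 features:
importance_matrix <- xgb.importance(feature_names = colnames(select(train_df, -price)), model = xgb_model)
xgb.plot.importance(importance_matrix, top_n = 10)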
### SECTION FOUR - PREDICTIONS ###
#Create predictions.
#We want all the observations with NA as price
predict_df <- df %>%
  select(-yearOfRegistration, -monthOfRegistration, -postalCode, -gearbox, -fuelType, -vehicleType, -model, -id) %>%
  filter(is.na(price)) %>%
  select(-price)
#Get the vector of predictions and convert it into a data frame
predictions <- predict(xgb_model, as.matrix(predict_df))
upload <- data.frame(id = seq_along(predictions), price = predictions)
write.csv(upload, paste0("upload1_starterscript_", rmse_score, ".csv"), row.names = FALSE)
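#Optionally persist the fitted model so it can be reloaded for later scoring
#sessions (not part of the original script)
# xgb.save(xgb_model, "xgb_model.model")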