-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreateModel.R
160 lines (112 loc) · 4.99 KB
/
createModel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Intro Comments ----------------------------------------------------------
# This script creates the model to be accessed by the API later.
# It is intended to be run only once and is not needed for production.
# The resulting API and model can be tested locally from within R from
# `runAPI.R`.
# Created by George Kampolis
# Load packages -----------------------------------------------------------
# Ensure correct path:
# here::here() # if `here` is installed
# Typically, double-clicking on the .Rproj file
# should set the directory correctly.
getwd()
# Load packages.
source("scripts/pkgs.R")
# Load data ---------------------------------------------------------------
titanic <- read_csv("data/titanic.csv")
names(titanic) <- c("survived",
"class",
"name",
"sex",
"age",
"siblingsSpouses",
"parentsChildren",
"fare"
)
# Specify types when needed, read_csv assigns only characters and doubles.
titanic %<>% mutate(
survived=as.logical(survived),
class=as.factor(class),
sex=as.factor(sex),
siblingsSpouses=as.integer(siblingsSpouses),
parentsChildren=as.integer(parentsChildren)
)
# Create Features ---------------------------------------------------------
# Family on board
titanic %<>% mutate(familyOnBoard = siblingsSpouses + parentsChildren)
# Finalize Data Set For `mlr` ---------------------------------------------
# The names column only complicates things. For our purposes, there's no need
# to retrieve name information later, so no need to keep the name column.
# Furthermore, drop features replaced by the "familyOnBoard".
titanic %<>% select(- name, - siblingsSpouses, - parentsChildren)
# Build And Validate Model ------------------------------------------------
set.seed(2018)
## Create classficiation task
titanicTask <- makeClassifTask(data = as.data.frame(titanic),
target = "survived",
positive = "TRUE"
)
## Specify Naive Bayes, set a threshold of 50%.
## The Laplace operator is used automatically.
titanicLearner <- makeLearner("classif.naiveBayes",
predict.type = "prob",
predict.threshold = 0.5)
## Naive Bayes predictions will not change unless the features available or the
## data change. Therefore the repeated CV serves only to provide context for the
## aforementioned threshold.
titanicCV <- makeResampleDesc("RepCV",
folds = 10,
reps = 20,
stratify = TRUE
)
titanicModel <- resample(titanicLearner,
titanicTask,
titanicCV,
list(acc, # accuracy
ppv, # precision
tpr, # recall/sensitivity
auc, # Area Under the Curve
kappa,
f1, # F1 score / F-measure
timetrain
),
show.info = FALSE)
calculateConfusionMatrix(titanicModel$pred)
# predicted
# true FALSE TRUE -err.-
# FALSE 10052 848 848
# TRUE 3051 3789 3051
# -err.- 3051 848 3899
titanicModel$aggr # Show measures
# acc.test.mean ppv.test.mean
# 0.7802483 0.8196398
# tpr.test.mean auc.test.mean
# 0.5540714 0.8353770
# kappa.test.mean f1.test.mean
# 0.5056464 0.6573248
# timetrain.test.mean
# 0.0035500
# High AUC but relativily low F1 shows we might be able to do better with
# differet threshold or different classifier. Good precision is reassuring in
# that the cases which were identified as positive (i.e. survived) the
# classifier got many of them right. Accuracy isn't to be heavily relied upon,
# due to the somewhat unbalanced data set. Somewhat poor recall is expected due
# to unbalanced data sets (obvious in the confusion matrix and high observed
# false negatives).
# However the aim of this project is not to go for best results but to build an
# API. This classifier will do for our purposes as a proof of concept.
## Train final model on all available data
titanicModel <- train(learner = titanicLearner,
task = titanicTask
)
# Export Model ------------------------------------------------------------
saveRDS(titanicModel, file = "model/titanicModel.rds")
## For convenience, export one row of the data set, which includes all
## information regarding factors etc.
titanicNewData <- titanic %>% select(-survived) %>% head(n = 1) %>% as.data.frame()
saveRDS(titanicNewData, file = "model/titanicNewData.rds")
# Clean Up ----------------------------------------------------------------
rm(titanic, titanicNewData,
titanicCV, titanicLearner,
titanicModel, titanicTask
)