-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.R
331 lines (243 loc) · 16.1 KB
/
script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# Librerie da importare ed installare se non presenti
library(ggplot2)
library(dplyr)
library(caret)
library(rpart.plot)
library(ROCR)
library(magrittr)
# ---- Dataset import ----
# `FALSE` is spelled out everywhere: `F` is an ordinary variable that can be
# reassigned, which would silently turn strings into factors.

# All Pokemon with their characteristics.
pokemon <- read.csv("./pokemon.csv", sep = ",", stringsAsFactors = FALSE)
# Battles with their outcome; used to train and test the model.
combats <- read.csv("./combats.csv", sep = ",", stringsAsFactors = FALSE)
# Attack relation between Pokemon types (attacking type vs. defending type).
pokemonTypeComp <- read.csv("./pokemonTypeComp.csv", sep = ",",
                            stringsAsFactors = FALSE)
# Battles without an outcome; used as a "real" application of the model.
real_test <- read.csv("./tests.csv", sep = ",", stringsAsFactors = FALSE)

# Give the columns of `pokemon` syntactic names (the file is assumed to have
# exactly these 12 columns, in this order).
colnames(pokemon) <- c(
  "id", "Name", "Type.1", "Type.2", "HP", "Attack", "Defense",
  "Sp.Atk", "Sp.Def", "Speed", "Generation", "Legendary"
)

# id -> name lookup table.
# NOTE: this variable shares its name with base::names(); calls such as
# names(x) still resolve to the function.
names <- pokemon %>% select(id, Name)
# ---- Data-quality measures on the individual (non-integrated) datasets ----

# Number of missing entries in every column of `df`.
# An entry is missing when it is NA or the empty string. The previous test
# (`y == ""` inside which()) dropped NA cells, so they were never counted.
missing_count <- function(df) {
  vapply(df, function(col) sum(is.na(col) | col == ""), integer(1))
}

# Number of distinct values in every column of `df`.
distinct_count <- function(df) {
  vapply(df, function(col) length(unique(col)), integer(1))
}

# -- Pokemon dataset --
# Share of missing entries per attribute (the variable is called
# "Completness", but it holds the missing share, not the filled share).
na_count_pokemon <- missing_count(pokemon)
Completness_pokemon <- na_count_pokemon / nrow(pokemon)
# Optional debugging aid to find which Pokemon has no name:
# na_index <- sapply(pokemon, function(y) which(y == ""))
# na_1 <- sapply(combats, function(y) which(y == 63))
# Share of distinct values per attribute.
unique_count_pokemon <- distinct_count(pokemon)
Uniqueness_pokemon <- unique_count_pokemon / nrow(pokemon)

# -- Combats dataset --
# Share of missing entries per attribute.
na_count_combats <- missing_count(combats)
Completness_combats <- na_count_combats / nrow(combats)
# Share of distinct values per attribute (of limited use here).
unique_count_combats <- distinct_count(combats)
Uniqueness_combats <- unique_count_combats / nrow(combats)
# Share of distinct (first, second) opponent pairs.
# unique(a, b) would treat `b` as the `incomparables` argument rather than as
# a second key, so the pairs are de-duplicated as rows of a two-column frame.
unique_versus <- nrow(unique(combats[, c("First_pokemon", "Second_pokemon")]))
Uniqueness_versus <- unique_versus / nrow(combats)

# -- pokemonTypeComp dataset --
# Share of missing entries per attribute.
na_count_type <- missing_count(pokemonTypeComp)
Completness_type <- na_count_type / nrow(pokemonTypeComp)
# Share of distinct values per attribute (of limited use here).
unique_count_type <- distinct_count(pokemonTypeComp)
Uniqueness_type <- unique_count_type / nrow(pokemonTypeComp)

# -- Tests dataset --
# Share of missing entries per attribute.
na_count_real_test <- missing_count(real_test)
Completness_real_test <- na_count_real_test / nrow(real_test)
# Share of distinct values per attribute (of limited use here).
unique_count_real_test <- distinct_count(real_test)
Uniqueness_real_test <- unique_count_real_test / nrow(real_test)
# ---- Integrate the Pokemon attributes into `combats` ----
# Row of `pokemon` for each opponent, resolved once through the `id` key.
# match() is vectorised, so no per-row sapply() is needed, and looking up by
# id (instead of going id -> name -> row) stays correct when a name is empty
# or duplicated. Ids without a match produce NA in every derived column.
first_idx <- match(combats$First_pokemon, pokemon$id)
second_idx <- match(combats$Second_pokemon, pokemon$id)

# Opponent names.
combats$First_pokemon_name <- pokemon$Name[first_idx]
combats$Second_pokemon_name <- pokemon$Name[second_idx]

# Battle statistics of both opponents and their difference (first - second).
# The two vectors are parallel: suffix of the new column / source column in
# `pokemon`. Their order fixes the column order of the integrated table.
stat_suffix <- c("attack", "defense", "sp_defense", "sp_attack", "speed", "HP")
stat_source <- c("Attack", "Defense", "Sp.Def", "Sp.Atk", "Speed", "HP")
for (i in seq_along(stat_suffix)) {
  first_col <- paste0("First_pokemon_", stat_suffix[i])
  second_col <- paste0("Second_pokemon_", stat_suffix[i])
  combats[[first_col]] <- pokemon[[stat_source[i]]][first_idx]
  combats[[second_col]] <- pokemon[[stat_source[i]]][second_idx]
  combats[[paste0("Diff_", stat_suffix[i])]] <-
    combats[[first_col]] - combats[[second_col]]
}

# Primary type and legendary flag of both opponents.
combats$First_pokemon_type <- pokemon$Type.1[first_idx]
combats$Second_pokemon_type <- pokemon$Type.1[second_idx]
combats$First_pokemon_legendary <- pokemon$Legendary[first_idx]
combats$Second_pokemon_legendary <- pokemon$Legendary[second_idx]

# Outcome label: "yes" when the first Pokemon is the winner, "no" otherwise.
combats$winner_first_label <- ifelse(
  combats$Winner == combats$First_pokemon, "yes", "no"
)
# Describe how effective an attack of `type_1` is against `type_2`, using the
# multiplier stored in the global `pokemonTypeComp` table (one row per
# attacking type in column `Attacking`, one column per defending type).
#
# Arguments:
#   type_1  attacking type (single string)
#   type_2  defending type (single string, must be a column name of the table)
#
# Returns a single string: "no effect" (0), "not too effective" (0.5),
# "normal" (1) or "effective" (2). Returns NA_character_ when either type is
# unknown, the multiplier is missing, or it is not one of the four values.
# This keeps the result a length-one character in every case, so
# mapply()/vapply() callers always get a plain character vector instead of an
# error or a list with NULL entries.
makeAdvantage2 <- function(type_1, type_2) {
  # First matching row only: a duplicated attacking type must not produce a
  # length > 1 condition below.
  row_idx <- match(type_1, pokemonTypeComp$Attacking)
  if (is.na(row_idx) || !(type_2 %in% colnames(pokemonTypeComp))) {
    return(NA_character_)
  }

  val <- pokemonTypeComp[row_idx, type_2]
  if (is.na(val)) {
    return(NA_character_)
  }

  if (val == 0) {
    "no effect"
  } else if (val == 0.5) {
    "not too effective"
  } else if (val == 1) {
    "normal"
  } else if (val == 2) {
    "effective"
  } else {
    NA_character_
  }
}
# Label every battle with the effectiveness of the first Pokemon's primary
# type against the second Pokemon's primary type.
combats$advantage <- with(
  combats,
  mapply(makeAdvantage2, First_pokemon_type, Second_pokemon_type)
)
# Persist the integrated battle table.
write.csv(combats, file = "integrated.csv")
# ---- Data-quality measures on the integrated dataset ----
# Share of missing entries per attribute. After integration a failed lookup
# shows up as NA, which the former `y == ""` test never counted, so NA and the
# empty string are both treated as missing here. (The variable is called
# "Completness", but it holds the missing share.)
na_count_combats_integrated <- vapply(
  combats,
  function(col) sum(is.na(col) | col == ""),
  integer(1)
)
Completness_combats_integrated <- na_count_combats_integrated / nrow(combats)
# Share of distinct values per attribute.
unique_count_combats_integrated <- vapply(
  combats,
  function(col) length(unique(col)),
  integer(1)
)
Uniqueness_combats_integrated <- unique_count_combats_integrated / nrow(combats)
# Show, for each advantage level, how the battles split between the first
# Pokemon winning and losing (absolute count and share within the level).
combats[, c("advantage", "winner_first_label")] %>%
  dplyr::group_by(advantage, winner_first_label) %>%
  dplyr::summarise(count = dplyr::n()) %>%
  dplyr::group_by(advantage) %>%
  dplyr::mutate(perc = count / sum(count))
# Modelling table: the outcome label, the six stat differences and the two
# legendary flags. An earlier variant that also used `advantage` is disabled.
model_columns <- c(
  "winner_first_label",
  "Diff_attack", "Diff_defense", "Diff_sp_defense",
  "Diff_sp_attack", "Diff_speed", "Diff_HP",
  "First_pokemon_legendary", "Second_pokemon_legendary"
)
temp <- data.frame(combats[, model_columns])

# Standardise every numeric column (zero mean, unit standard deviation).
ind <- vapply(temp, is.numeric, logical(1))
temp[ind] <- lapply(temp[ind], scale)

# Fix the seed so the partition and the cross-validation are reproducible.
set.seed(2345)

# 75% / 25% split into training and testing rows, stratified on the label.
split <- createDataPartition(
  y = temp$winner_first_label,
  p = 0.75,
  list = FALSE
)
train <- temp[split, ]
test <- temp[-split, ]

# Fit a decision tree (rpart) with 10-fold cross-validation.
res.tree <- train(
  winner_first_label ~ Diff_attack + Diff_defense + Diff_sp_defense +
    Diff_sp_attack + Diff_speed + Diff_HP +
    First_pokemon_legendary + Second_pokemon_legendary,
  data = train,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)
# Score the held-out battles: class probabilities from the tree, with the
# true outcome label attached as column `winner_first`.
probs <- data.frame(
  cbind(
    predict(res.tree, newdata = test, type = "prob"),
    winner_first = test$winner_first_label
  )
)
# Numeric version of the true outcome: 0 when the first Pokemon did not win,
# 1 otherwise.
probs$winner_first_num <- as.numeric(probs$winner_first != "no")
# Draw the fitted decision tree.
rpart.plot(res.tree[["finalModel"]])
# Plot the importance of each feature.
plot(caret::varImp(object = res.tree))
# Cross-validated confusion matrix of the fitted model.
cm <- caret::confusionMatrix(res.tree)
# Quality metrics, taking the first class level as the positive class.
# caret lays the table out with predictions in rows and reference classes in
# columns, so:
#   precision = TP / everything predicted positive = [1, 1] / sum(row 1)
#   recall    = TP / everything actually positive  = [1, 1] / sum(column 1)
# (The two formulas were previously swapped; the F-measure is symmetric in
# precision and recall, so its value is unchanged.)
precision <- cm$table[1, 1] / sum(cm$table[1, ])
recall <- cm$table[1, 1] / sum(cm$table[, 1])
f <- (2 * precision * recall) / (precision + recall)
# Confusion matrix of the predictions on the held-out test set.
# The predicted class is obtained from the model itself; comparing the true
# labels with a copy of themselves would always yield a perfect matrix.
predicted_label <- predict(res.tree, newdata = test)
reference_label <- factor(
  test$winner_first_label,
  levels = levels(predicted_label)
)
cm_pred <- caret::confusionMatrix(predicted_label, reference_label)
# Quality metrics, taking the first class level as the positive class.
# Predictions are in rows and reference classes in columns, so precision
# divides by the first row total and recall by the first column total.
precision_pred <- cm_pred$table[1, 1] / sum(cm_pred$table[1, ])
recall_pred <- cm_pred$table[1, 1] / sum(cm_pred$table[, 1])
f_pred <- (2 * precision_pred * recall_pred) / (precision_pred + recall_pred)
# ---- Real test: integrate the Pokemon attributes into `real_test` ----
# Row of `pokemon` for each opponent, resolved once through the `id` key.
# match() is vectorised, so no per-row sapply() is needed, and looking up by
# id (instead of going id -> name -> row) stays correct when a name is empty
# or duplicated. Ids without a match produce NA in every derived column.
first_idx <- match(real_test$First_pokemon, pokemon$id)
second_idx <- match(real_test$Second_pokemon, pokemon$id)

# Opponent names.
real_test$First_pokemon_name <- pokemon$Name[first_idx]
real_test$Second_pokemon_name <- pokemon$Name[second_idx]

# Battle statistics of both opponents and their difference (first - second).
# The two vectors are parallel: suffix of the new column / source column in
# `pokemon`. No outcome label is derived here (presumably because tests.csv
# carries no winner column).
stat_suffix <- c("attack", "defense", "sp_defense", "sp_attack", "speed", "HP")
stat_source <- c("Attack", "Defense", "Sp.Def", "Sp.Atk", "Speed", "HP")
for (i in seq_along(stat_suffix)) {
  first_col <- paste0("First_pokemon_", stat_suffix[i])
  second_col <- paste0("Second_pokemon_", stat_suffix[i])
  real_test[[first_col]] <- pokemon[[stat_source[i]]][first_idx]
  real_test[[second_col]] <- pokemon[[stat_source[i]]][second_idx]
  real_test[[paste0("Diff_", stat_suffix[i])]] <-
    real_test[[first_col]] - real_test[[second_col]]
}

# Primary type and legendary flag of both opponents.
real_test$First_pokemon_type <- pokemon$Type.1[first_idx]
real_test$Second_pokemon_type <- pokemon$Type.1[second_idx]
real_test$First_pokemon_legendary <- pokemon$Legendary[first_idx]
real_test$Second_pokemon_legendary <- pokemon$Legendary[second_idx]
# Effectiveness of the first Pokemon's primary type against the second's.
# (Computed for completeness; the active model does not use it.)
real_test$advantage <- mapply(
  makeAdvantage2,
  real_test$First_pokemon_type,
  real_test$Second_pokemon_type
)

# Feature table with the same predictors the model was trained on.
temp_real_test <- data.frame(
  real_test %>%
    dplyr::select(
      Diff_attack, Diff_defense, Diff_sp_defense, Diff_sp_attack,
      Diff_speed, Diff_HP, First_pokemon_legendary, Second_pokemon_legendary
    )
)

# Numeric feature columns.
ind <- vapply(temp_real_test, is.numeric, logical(1))

# Standardise the numeric features with the mean and standard deviation of
# the corresponding `combats` columns, i.e. the statistics the training table
# was standardised with. Re-deriving them from `real_test` itself would put
# the new data on a different scale than the one the tree's split thresholds
# were learned on.
numeric_cols <- colnames(temp_real_test)[ind]
temp_real_test[ind] <- Map(
  function(values, reference) {
    scale(
      values,
      center = mean(reference, na.rm = TRUE),
      scale = stats::sd(reference, na.rm = TRUE)
    )
  },
  temp_real_test[numeric_cols],
  combats[numeric_cols]
)

# Predicted class probabilities for the unlabeled battles. Despite its name,
# `winner_first_label` receives the whole probability table returned by
# predict(type = "prob"), presumably one column per class.
test_real_pred <- predict(res.tree, newdata = temp_real_test, type = "prob")
real_test$winner_first_label <- test_real_pred

# Save the opponents and their predicted probabilities.
write.csv(
  real_test[, c("First_pokemon", "Second_pokemon", "winner_first_label")],
  file = "real_tests_result.csv"
)
# Print the evaluation results: first the cross-validated confusion matrix
# with its precision, recall and F-measure, then the same four for the
# held-out test set.
for (metric in list(cm, precision, recall, f,
                    cm_pred, precision_pred, recall_pred, f_pred)) {
  print(metric)
}
# ---- ROC curve ----
# Score = predicted probability of "yes"; label = numeric true outcome.
pred <- ROCR::prediction(probs$yes, probs$winner_first_num)
perf <- ROCR::performance(pred, "tpr", "fpr")
# Points of the curve as a data frame for ggplot.
roc.data <- data.frame(
  fpr = unlist(perf@x.values),
  tpr = unlist(perf@y.values),
  model = "tree1"
)
# Area under the curve, rounded to three decimals for the plot title.
auc <- round(performance(pred, measure = "auc")@y.values[[1]], 3)
# Shaded ROC curve plus the red diagonal of a random classifier.
g <- ggplot(roc.data, aes(x = fpr, ymin = 0, ymax = tpr)) +
  geom_ribbon(alpha = 0.2) +
  geom_line(aes(y = tpr)) +
  labs(title = paste0("ROC CURVE AUC = ", auc)) +
  geom_segment(x = 0, y = 0, xend = 1, yend = 1, colour = "red")
g
# ---- Print the data-quality measures computed earlier in the script ----
# Each bare expression below is auto-printed when the script runs at top
# level. NOTE: the "Completness_*" objects hold the share of MISSING entries
# per column (missing count divided by the number of rows), not the share of
# filled entries.
# Quality measures of the individual, non-integrated datasets
# Quality measures of the Pokemon dataset
# Share of missing values per attribute
Completness_pokemon
# Share of distinct values per attribute
Uniqueness_pokemon
# Quality measures of the Combats dataset
# Share of missing values per attribute
Completness_combats
# Share of distinct values per attribute (of limited use here)
Uniqueness_combats
# Share of distinct opponent pairs
Uniqueness_versus
# Quality measures of the pokemonTypeComp dataset
# Share of missing values per attribute
Completness_type
# Share of distinct values per attribute (of limited use here)
Uniqueness_type
# Quality measures of the Tests dataset (possibly to be removed)
# Share of missing values per attribute
Completness_real_test
# Share of distinct values per attribute (of limited use here)
Uniqueness_real_test
# Quality measures of the integrated dataset
# Share of missing values per attribute
Completness_combats_integrated
# Share of distinct values per attribute
Uniqueness_combats_integrated