-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLinear Discriminant Analysis - FINAL.R
104 lines (83 loc) · 3.44 KB
/
Linear Discriminant Analysis - FINAL.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
library(tidyverse)
library(ISLR)
library(MASS)
library(gtools)
# Import the data
df <- diamonds
# Clean the data
df %>% dplyr::select(df, carat, cut, color, depth, table, price, x, y, z, clarity)
df <- df <- dplyr::select(df, carat, cut, color, depth, table, price, x, y, z, clarity)
# Name the last column as "Last", we will use it later in the analysis
names(df)[names(df)==names(df[ncol(df)])] = 'last'
df <- as.data.frame(df)
#### randomize the rows ######
randomnumbers <- runif(nrow(df))
df <- df[order(randomnumbers),]
# Set up our set of true values, based on the randomized rows
last <- df$last
#### Initialize variables ####
accuracy <- 0
accuracytemp <- 0
results <- data.frame(accuracy)
sumdiagtmp <- 0
#### Make basic confusion matrix, based on the dimensions of the feature we are studying ####
x =nlevels(df$last)
data1 <- seq(1:x^2)
newmatrix <- matrix()
newmatrix <- matrix(data = data1, nrow = x)
rownames(newmatrix) <- c(levels(df$last))
colnames(newmatrix) <- c(levels(df$last))
table1.df <- as.table(newmatrix)
table1.df <- data.frame(unclass(table1.df))
results.table <- data.frame(table1.df)
#### Analysis starts here ####
for (i in 1:1){ # accuracy increases as the 2nd number is raised: 1:10 gives higher accuracy results than 1:1 or 1:2, but this increases run time.
for (j in 1:ncol(df)){ # this creates all possible permutations of the columns
combin <- combinations(n = ncol(df), r = j, repeats.allowed = FALSE) ### remember to change this!!!
for (k in 1:nrow(combin)){# Loop to create all possible data sets by column, then we can do the analysis of each one
print(i)
print(j)
print(k)
colvals <- c(combin[k,])
newdf <- data.frame(df[,colvals])
newdf <- cbind(newdf, last)
# Break the data set into random amounts of test and train
ratio <- round(runif(1, 0.05, 0.95),2)
train <- sort(sample(nrow(df), nrow(df)*ratio))
df.train <- as.data.frame(df[train,])
df.test <- as.data.frame(df[-train,])
#### Run linear discriminant analysis ####
df.lda <- lda(last~., data = df.train)
df.pred <- predict(df.lda, df.test)
# Make the predictions based on unseen (new) data
pred = predict(df.lda, df.test)
Pred.Class <- pred$class
# Output results table
Table1 <- table(Pred.Class, df.test$last)
table1.df <- data.frame(unclass(Table1))
# Calculate accuracy
accuracy <- sum(diag(Table1)) / sum(Table1)
# Save accuracy data, only add rows if accuracy increases
if(sum(diag(Table1))>sumdiagtmp){
if(accuracy>=accuracytemp){
results.table <- rbind(results.table, table1.df)
accuracytemp <- accuracy
sumdiagtmp <- sum(diag(Table1))
results <- rbind(results, accuracy)
saveRDS(df.lda, file = "/tmp/lda.max.accuracy.rda")
} # closing for if(sum(diag(Table1))>sumdiagtemp)
} # closing for if (accuracy>= accuracytemp)
} # closing for k
} # closing for j
} # closing for i
######## ------- display results to the user ------------ ###############
results <- results[2:nrow(results),]
results.table[(nrow(results.table)-(nlevels(df$last))+1):nrow(results.table),]
min(results)
mean(results)
max(results)
results <- as.data.frame(results)
ggplot(data = results,mapping = aes(x = results)) +
geom_histogram(binwidth = .001)
max.accuracy <- readRDS(file = "/tmp/lda.max.accuracy.rda")
max.accuracy