-
Notifications
You must be signed in to change notification settings - Fork 5
/
Logistic Regression.R
119 lines (89 loc) · 3.38 KB
/
Logistic Regression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#### Probability, odds, and logit
# Both plots suppress the default x-axis (xaxt = "n") and redraw it with
# axis(1, pos = 0) so that it sits on the line y = 0.

## odds = p / (1 - p)
curve(p / (1 - p), from = 0, to = 1, type = "l", xname = "p", las = 1,
      xlab = "Probability of success", ylab = "odds", xaxt = "n")
axis(1, pos = 0)

## logit = log(odds) = log(p / (1 - p))
curve(log(p / (1 - p)), from = 0, to = 1, type = "l", xname = "p", las = 1,
      xlab = "Probability of success", ylab = "logit", xaxt = "n")
axis(1, pos = 0)
# Read the first sheet of a workbook picked interactively by the user.
# NOTE(review): read.xlsx() is not defined in the visible part of this file;
# presumably a spreadsheet package is attached earlier -- confirm.
df <- read.xlsx(file.choose(), 1, header = TRUE)

# Discard columns in which every value is missing.
df <- df[, !apply(is.na(df), 2, all)]
str(df)

# Keep a copy of the data as loaded, before any column is dropped or recoded.
dfb <- df

# Drop the fifth column, then store Promoffer and Online as factors.
df <- df[, -5]
df$Promoffer <- as.factor(df$Promoffer)
df$Online <- as.factor(df$Online)
## Partitioning (60%:40%)
# Draw 60% of the row indices, without replacement, for the training set;
# every remaining row goes to the test set. No seed is set here, so the split
# differs from run to run.
n_rows <- nrow(df)
partidx <- sample.int(n_rows, size = floor(0.6 * n_rows), replace = FALSE)
dftrain <- df[partidx, ]
# Select the test rows with a logical mask instead of df[-partidx, ]: a
# negative subscript built from an empty index vector selects no rows at all,
# which would leave the test set empty whenever the training sample is empty.
dftest <- df[!(seq_len(n_rows) %in% partidx), ]
# Logistic regression of Promoffer on Income alone, fitted on the training rows.
mod <- glm(
  formula = Promoffer ~ Income,
  family = binomial(link = "logit"),
  data = dftrain
)
summary(mod)

# Intercept (b0) and Income slope (b1), with the coefficient names removed.
b0 <- unname(mod$coefficients[1])
b1 <- unname(mod$coefficients[2])

# Fitted model: P(Promoffer = Yes | Income = x) = 1 / (1 + exp(-(b0 + b1 * x)))
range(dftrain$Income)

# Observed outcomes as points; the factor is converted back to its numeric
# labels through as.character() so the level text, not the level index, is
# plotted.
plot(
  dftrain$Income,
  as.numeric(as.character(dftrain$Promoffer)),
  type = "p", xlab = "Income", ylab = "Promoffer"
)

# Overlay the fitted probability curve for Income between 0 and 250.
curve(
  1 / (1 + exp(-(b0 + b1 * x))),
  xlim = c(0, 250), type = "l", xname = "x", add = TRUE
)
# Logistic regression of Promoffer on every other column of the training data.
mod1 <- glm(
  formula = Promoffer ~ .,
  family = binomial(link = "logit"),
  data = dftrain
)
summary(mod1)
# Probability as a function of the odds: P = odds / (1 + odds)
curve(
  odds / (1 + odds),
  from = 0, to = 100, type = "l", xname = "odds",
  xlab = "Odds", ylab = "Probability of Success"
)

# Probability as a function of the logit: P = exp(logit) / (1 + exp(logit))
curve(
  exp(logit) / (1 + exp(logit)),
  from = -100, to = 100, type = "l", xname = "logit",
  xlab = "logit", ylab = "Probability of Success"
)
# Score the held-out rows with the full model (mod1).
# The predictors are selected by name, so the response column is excluded
# wherever it sits in the data frame; drop = FALSE keeps a data frame even if
# only one predictor remains.
dftestx <- dftest[, names(dftest) != "Promoffer", drop = FALSE]

### type = "response" returns probabilities
modtest <- predict(mod1, dftestx, type = "response")
### type = "link" returns logit values
modtestl <- predict(mod1, dftestx, type = "link")

# Predicted class: 1 when the predicted probability exceeds 0.5, else 0.
modtestc <- ifelse(modtest > 0.5, 1, 0)

# Confusion matrix of actual against predicted class on the test set.
table("Actual value" = dftest$Promoffer, "Predicted" = modtestc)

# Accuracy and misclassification rate. The predictions are compared with the
# test-set labels (dftest), which have the same length and row order as
# modtestc; comparing against the full data set would pair predictions with
# unrelated rows.
mean(modtestc == dftest$Promoffer)
mean(modtestc != dftest$Promoffer)

# First few test rows with predicted class, actual class, probability and
# log-odds shown next to the predictors.
head(data.frame(
  "Predicted class" = modtestc,
  "ACtual class" = dftest$Promoffer,
  "Prob for 1(success)" = modtest,
  "Log odds" = modtestl,
  dftestx, check.names = FALSE
))
# Cumulative Lift Curve
# Pair each test row's predicted probability with its actual class (converted
# from factor labels to numbers), then sort by probability, highest first.
dflift <- data.frame(
  "Probabilty of class 1" = modtest,
  "Actual class" = as.numeric(as.character(dftest$Promoffer)),
  check.names = FALSE
)
dflift <- dflift[order(dflift[, 1], decreasing = TRUE), ]

# Running total of the actual class values in that sorted order.
CumACtualClass <- cumsum(dflift[, 2])
dflift <- cbind(dflift, CumACtualClass)
head(dflift)

# NOTE(review): the axis limits and the legend position are hard-coded;
# adjust them if the test set has a different size.
plot(seq_len(nrow(dflift)), dflift$CumACtualClass, "l",
     xlab = "# cases", ylab = "cumulative", xlim = c(0, 2100),
     ylim = c(0, 210))

# Reference line for the second legend entry: a straight line from the origin
# to (number of cases, total of the actual class column).
segments(0, 0, nrow(dflift), sum(dflift[, 2]), lty = 2)

legend(800, 70, inset = 0.005,
       c("Cumulative Personal Loan when sorted using predicted values",
         "Cumulative Personal Loan using average"),
       lty = c(1, 2), bty = "n", cex = 0.7, x.intersp = 0.3, y.intersp = 0.5)
################ Flight Details ################
# Read the first sheet of a workbook picked interactively, discard columns in
# which every value is missing, and keep a copy of the loaded data in dfb.
df <- read.xlsx(file.choose(), 1, header = TRUE)
df <- df[, !apply(is.na(df), 2, all)]
str(df)
dfb <- df

# Reduce each schedule column to its time of day: format the value as
# HH:MM:SS text, then parse that text back with strptime(). Because the text
# carries no date, strptime() fills in the current date for every value.
clock_fmt <- "%H:%M:%S"
as_clock_time <- function(x) strptime(format(x, clock_fmt), clock_fmt)

df$STD <- as_clock_time(df$STD)
df$ATD <- as_clock_time(df$ATD)
df$STA <- as_clock_time(df$STA)
df$ATA <- as_clock_time(df$ATA)
# Bin edges every six hours from 00:00:00 through 24:00:00 (five edges, four
# intervals), one label per interval.
breaks <- seq(
  strptime("00:00:00", "%H:%M:%S"),
  strptime("24:00:00", "%H:%M:%S"),
  by = "6 hours"
)
labelsv <- c("0-6", "6-12", "12-18", "18-24")

# Assign each ATD value to its six-hour interval; right = FALSE makes each
# interval closed on the left and open on the right.
DEPT <- cut(df$ATD, breaks = breaks, labels = labelsv, right = FALSE)

# Append the interval as a new column named DEPT.
df1 <- cbind(df, DEPT)
# Store Day as a factor and show its current levels.
df1$Day <- as.factor(df1$Day)
levels(df1$Day)

# Relabel the factor levels. The replacement form levels(x) <- value is
# required; writing the assignment inside the call, levels(x = value), is a
# syntax error that prevents the script from parsing.
# NOTE(review): assumes Day has exactly two levels and that their existing
# order corresponds to Sunday, then Monday -- confirm against the data.
levels(df1$Day) <- c("Sunday", "Monday")

# Convert FLTIME to a difftime by way of its character representation.
# NOTE(review): as.difftime() is called with its default format and units;
# confirm these match how FLTIME is stored in the workbook.
df1$FLTIME <- as.difftime(as.character(df1$FLTIME))
str(df1)
head(df1)

# Keep a copy with all columns, then drop columns 1, 3 and 5 through 8.
dfb1 <- df1
df1 <- df1[, -c(1, 3, 5:8)]
str(df1)
head(df1)