-
Notifications
You must be signed in to change notification settings - Fork 0
/
week1_assi1_sol2.R
242 lines (178 loc) · 9.24 KB
/
week1_assi1_sol2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# # -*- coding: utf-8 -*-
# """Week1 Assi1 Sol2.ipynb
#
# Automatically generated by Colaboratory.
#
# Original file is located at
# https://colab.research.google.com/drive/1sloDXcEBLZDNisk0usmXCOtsFb1c8DgI
# """
###########################################################################
## Week-1, Homework-1, Sol-2
## Sreya Dhar
## Created: Feb 09, 2021
## Edited: Feb 13, 2021
###########################################################################
# NOTE: the former `rm(list = ls())` call was removed on purpose. When the
# script is sourced it silently deleted every object in the caller's global
# environment, yet it never produced a clean session (attached packages,
# options and the working directory all survive it). Restart R to run this
# script from a clean state.
## installing all the libraries in R kernel
# install.packages("ISLR")
# install.packages("Hmisc")
# install.packages("funModeling")
# install.packages("PerformanceAnalytics")
# install.packages("corrplot")
# install.packages("repr")
# install.packages("psych")
## importing the libraries in R kernel
library(ISLR)
library(Hmisc)
library(ggplot2)
library(dplyr)
library(funModeling)
library(tidyverse)
library(tidyr)
library(PerformanceAnalytics)
library(corrplot)
library(repr)
library(ggstatsplot)
library(psych)
library("gplots")
## set directory ##
# The homework folder below is a hard-coded absolute path. Switch to it when
# it is present; otherwise keep the current working directory so the script
# still runs (and writes its output) instead of aborting with
# "cannot change working directory".
hw_dir <- "C:/File G/EAS 507 Statistical Mining II/Week 1/HW-1"
if (dir.exists(hw_dir)) {
  setwd(hw_dir)
} else {
  message("Directory '", hw_dir, "' not found; keeping ", getwd())
}
## """**Data Processing or Exploratory Data Analysis on 'Auto' Dataset**"""
# List the variables (column names) available in the Auto dataset
names(Auto)
### a) Check against missing values? ###
# Three views of the same NA mask over the whole data frame:
which(is.na(Auto))  # positions of the missing cells
sum(is.na(Auto))    # number of missing cells
mean(is.na(Auto))   # share of missing cells
### b) What variables are numerical (continuous) or factors (categorical)?
# Compact overall view of every column in Auto
glimpse(Auto)
# Class of each column
sapply(Auto, class)
# Per-column summary
summary(Auto)
# Copy of Auto that the later steps work from
Auto_C <- Auto
# Frequency of each distinct value of cylinders and of origin
table(Auto_C[["cylinders"]])
table(Auto_C[["origin"]])
### c) Report the mean and standard deviation for each continuous variable in the data
# Metric table of the variables (includes range, mean, standard deviation
# and variation)
profiling_num(Auto_C)
# Named logical vector: TRUE for every numeric column of Auto_C
Auto_1 <- vapply(Auto_C, is.numeric, logical(1))
Auto_1
# d) Remove the 5th through 55th observation. What is the range, mean and
# standard deviation?
# Per-column statistics: min, max, mean and sd, rounded to 2 decimals
col_stats <- function(x) {
  round(c(range(x), mean(x), sd(x)), 2)
}
# Drop rows 5..55 and keep only the columns flagged as numeric in Auto_1
Auto_2 <- sapply(Auto[-(5:55), Auto_1], col_stats)
rownames(Auto_2) <- c("min", "max", "mean", "sd")
Auto_2
# e) In the full Auto dataset, are there any variables you would consider removing,
# or representing differently? Why?
# Several statistical parameters per variable, including datatype and number
# of unique values
status(Auto)
# Drop the 9th column (the 'name' variable)
Auto_p1 <- Auto_C[-9]
# Represent cylinders and origin as labelled factors
Auto_p <- mutate(
  Auto_p1,
  cylinders = factor(
    cylinders,
    labels = c("cyl_3", "cyl_4", "cyl_5", "cyl_6", "cyl_8")
  ),
  origin = factor(origin, labels = c("American", "European", "Japanese"))
)
# f) In the full Auto dataset, graphically explore the relationships between the
# variables in the data set.
# Histograms of the variables, to visualise how the data are distributed
options(repr.plot.width = 7, repr.plot.height = 7, repr.plot.res = 200)
plot_num(Auto_p)
# Description of the overall dataset
describe(Auto_p)
# Statistical parameters of all the variables
summary(Auto_p)
# First 6 rows of the dataset converted to a numeric matrix
head(data.matrix(Auto_p))
# Pairs panel of every column except the 8th (origin); points are coloured
# by origin
options(repr.plot.width = 9, repr.plot.height = 9, repr.plot.res = 200)
pairs.panels(
  Auto_p[-8],
  main = "Pairs plot on Auto dataset, unclassed on Origin",
  pch = 21,
  bg = c("blue", "green", "yellow")[unclass(Auto_p$origin)],
  hist.col = "red"
)
# pairs(Auto_p, main = "Pairwise plot") ## pairwise plot between any two variables
# Correlation matrix of the dataset without the 'name' column; reused by the
# corrplot() calls
M <- cor(Auto_p1)
## Chart matrix combining correlation values with histograms and scatter
## plots of the different features
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 200)
chart.Correlation(Auto_p1, histogram = TRUE, pch = 15)
## Figure width, height and resolution for the plot below
options(repr.plot.width = 5, repr.plot.height = 5, repr.plot.res = 200)
## Lower-triangular correlation plot: intensity is shown by the colour range
corrplot(M, method = "circle", type = "lower")
## Pull the rquery_cormat script into the session for further exploration
# NOTE(review): this downloads and executes remote code over plain HTTP on
# every run; consider vendoring a reviewed copy of the script instead.
source("http://www.sthda.com/upload/rquery_cormat.r")
## Heatmap of the correlation matrix of the modified dataset
options(repr.plot.width = 5, repr.plot.height = 5, repr.plot.res = 200)
rquery.cormat(Auto_p1, graphType = "heatmap")
# Same correlation matrix drawn as coloured tiles
corrplot(M, method = "color")
## heatmap ##
options(repr.plot.width = 7, repr.plot.height = 7, repr.plot.res = 200)
# Centre and scale every column (scale() does both by default)
Auto_S <- as.data.frame(scale(Auto_p1))
heatmap.2(
  as.matrix(Auto_S),
  scale = "none",  # data are already standardised above
  col = bluered(100),
  trace = "none",
  density.info = "none"
)
## Correlation and p values between any two variables, as a flat table
rquery.cormat(Auto_p1, type = "flatten", graph = FALSE)
## Overall summary of the data without the 'name' variable
summary(Auto_p1)
# Labelled factor versions of origin and cylinders, used as the x variable of
# the two plots below
origin_C <- factor(Auto$origin, labels = c("American", "European", "Japanese"))
cylinders_C <- factor(
  Auto$cylinders,
  labels = c("c_3", "c_4", "c_5", "c_6", "c_8")
)
# Violin + boxplot showing mpg broken down by origin.
# Built with ggplot() because qplot() is deprecated in current ggplot2.
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 230)
ggplot(Auto_C, aes(x = origin_C, y = mpg)) +
  geom_violin() +
  geom_boxplot(width = 0.1, color = "blue") +
  xlab("Different Origin") +
  theme_bw()
# Violin + boxplot showing mpg broken down by number of cylinders
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 230)
ggplot(Auto_C, aes(x = cylinders_C, y = mpg)) +
  geom_violin() +
  geom_boxplot(width = 0.1, color = "red") +
  xlab("Number of cylinders") +
  theme_bw()
# Distribution of horsepower in three side-by-side panels: a boxplot, a
# density-scaled histogram with a kernel density overlay, and a histogram of
# log(horsepower).
par(mfrow = c(1, 3))
dens <- density(Auto_p$horsepower)
options(repr.plot.width = 9, repr.plot.height = 3, repr.plot.res = 350)
boxplot(Auto_p$horsepower,
  main = "Detecting horsepower outliers",
  xlab = "horsepower ", ylab = "value"
)
# probability = TRUE puts the histogram on the density scale so the kernel
# density curve can be drawn on top of it (TRUE rather than the reassignable T)
hist(Auto_p$horsepower,
  breaks = 15, xlab = "horsepower distribution ",
  main = "Histogram of horsepower", probability = TRUE
)
lines(dens)
hist(log(Auto_p$horsepower),
  breaks = 15, xlab = "horsepower log distribution ",
  main = "Histogram of horsepower in log scale"
)
## identifying the outliers from horsepower variable
# outl_hp  : the outlying horsepower VALUES reported by boxplot.stats()
# outl_hpw : the ROW POSITIONS in Auto_p that hold those values
outl_hp <- boxplot.stats(Auto_p$horsepower)$out
outl_hpw <- which(Auto_p$horsepower %in% outl_hp)
outl_hpw
outl_hp
## replacing outliers by the median of horsepower in Auto_p dataset
# Index by row position. Matching horsepower values against row positions, or
# indexing rows with a short logical vector, would overwrite the wrong rows.
# The right-hand side is evaluated first, so the median is taken from the
# data before any value is replaced.
Auto_p[outl_hpw, "horsepower"] <- median(Auto_p$horsepower)
## Relationship between mpg and year. Author's reading of the plot: as the
## years pass, performance (mpg) of the vehicles increases.
par(mfrow = c(1, 2))
options(repr.plot.width = 14, repr.plot.height = 5, repr.plot.res = 200)
# Left panel: scatter plot of mpg against year
plot(mpg ~ year, data = Auto_p, xlab = "year", ylab = "Performance in mpg")
# Smooth lowess trend line on top of the scatter plot to verify the pattern
lines(lowess(x = Auto_p$year, y = Auto_p$mpg), lwd = 2, col = "red")
# Right panel: one mpg boxplot per year
boxplot(mpg ~ year, data = Auto_p, xlab = "Year", ylab = "Performance in mpg")
# mpg by origin for two model years (70 and 82), as dodged boxplots
options(repr.plot.width = 4, repr.plot.height = 3, repr.plot.res = 300)
# %in% keeps every row from either year. `year == c(70, 82)` would recycle
# the two values down the column and drop roughly half of the matching rows.
fil_Auto_C1 <- filter(Auto_p, year %in% c(70, 82))
# Explicit levels tie each label to the year it stands for
fil_Auto_C1$year <- factor(
  fil_Auto_C1$year,
  levels = c(70, 82),
  labels = c("1970", "1982")
)
# origin is already a labelled factor in Auto_p, so it is used as is
comb_plot <- ggplot(fil_Auto_C1, aes(x = origin, y = mpg, fill = year)) +
  geom_boxplot(
    aes(color = year),
    width = 0.07,
    position = position_dodge(0.65)
  ) +
  theme_bw() +
  xlab("Origin") +
  ylab("mpg")
comb_plot
################## exploring weight variable from normal scale to logscale ########################
par(mfrow = c(1, 2))
options(repr.plot.width = 14, repr.plot.height = 5, repr.plot.res = 200)
# Left panel: mpg against weight on the original scale, with a lowess trend
plot(mpg ~ weight, data = Auto_C, xlab = "weight", ylab = "Performance in mpg")
lines(lowess(x = Auto_p$weight, y = Auto_p$mpg), lwd = 2, col = "red")
# Right panel: the same relationship on the log-log scale
plot(log(mpg) ~ log(weight),
  data = Auto_C,
  xlab = "log of weight", ylab = "log of mpg"
)
lines(
  lowess(x = log(Auto_p$weight), y = log(Auto_p$mpg)),
  lwd = 2, col = "red"
)
# g) In the full Auto dataset, consider the variable mpg. You are going to
# create a new categorical variable for MPG, which has the categories:
# {low, med, high}. Call this variable "my_mpg", and create a new_Auto dataset,
# which contains all of the Auto variables, and your new variable "my_mpg".
new_Auto <- Auto_p
# Cut points are the 33% and 66% quantiles of mpg, taken from the data
# instead of being typed in by hand
mpg_cuts <- quantile(Auto$mpg, probs = c(0.33, 0.66))
mpg_cuts
# cut() assigns every observation to exactly one bin (a value equal to a cut
# point goes to the lower bin) and returns a factor ordered low < med < high,
# so tables and plots list the categories in that order
new_Auto$my_mpg <- cut(
  Auto$mpg,
  breaks = c(-Inf, mpg_cuts, Inf),
  labels = c("low", "med", "high")
)
table(my_mpg = new_Auto$my_mpg)
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
# color is set as a fixed value outside aes(). Inside aes() the string "blue"
# would be treated as data, so the points would not be blue and a legend
# would be created.
ggplot(data = new_Auto, mapping = aes(x = my_mpg, y = horsepower)) +
  geom_jitter(color = "blue", alpha = 0.9) +
  geom_boxplot(fill = "bisque", color = "black", alpha = 0.5) +
  labs(x = "mpg", y = "Horsepower") +
  theme_bw()
# (g) Save the dataset as an *.RData file and submit it with your assignment.
# save() writes R's own serialised format, so the file can be restored with
# load("new_Auto.Rdata") and column types such as factors are kept.
# write.table() would only produce a plain-text table under an .Rdata name.
save(new_Auto, file = "new_Auto.Rdata")