# week1_assi1_sol2.R

# # -*- coding: utf-8 -*-
# """Week1 Assi1 Sol2.ipynb
# 
# Automatically generated by Colaboratory.
# 
# Original file is located at
#     https://colab.research.google.com/drive/1sloDXcEBLZDNisk0usmXCOtsFb1c8DgI
# """

###########################################################################
## Week-1, Homework-1, Sol-2 
## Sreya Dhar 
## Created: Feb 09, 2021
## Edited: Feb 13, 2021
###########################################################################

# The global environment is deliberately NOT cleared here. `rm(list = ls())`
# deletes every object in the session of whoever sources this script, and it
# does not detach packages or reset options, so it gives no real clean slate.
# Run the script in a fresh R session when isolation is needed.

## installing all the libraries in the R kernel

# install.packages("ISLR")
# install.packages("Hmisc")
# install.packages("funModeling")
# install.packages("PerformanceAnalytics")
# install.packages("corrplot")
# install.packages("repr")
# install.packages("psych")


## importing the libraries in R kernel
library(ISLR)
library(Hmisc)
library(ggplot2)
library(dplyr)
library(funModeling) 
library(tidyverse)
library(tidyr)
library(PerformanceAnalytics)
library(corrplot)
library(repr)
library(ggstatsplot)
library(psych)
library("gplots")

## set directory ##
# Switch to the author's homework folder only when it exists. An unconditional
# setwd() on this absolute path stops the script with an error on any machine
# that lacks the folder; otherwise the current working directory is kept.
if (dir.exists("C:/File G/EAS 507 Statistical Mining II/Week 1/HW-1")) {
  setwd("C:/File G/EAS 507 Statistical Mining II/Week 1/HW-1")
}

## Data Processing or Exploratory Data Analysis on 'Auto' Dataset

# List the variables (columns) available in the Auto dataset
names(Auto)

### a) Check against missing values? ###

# Linear positions of missing cells (an empty result means there are none),
# followed by the total count and the overall proportion of missing cells.
# is.na() is applied to the data frame directly; comparing the data with TRUE
# first only added a needless element-wise coercion before the NA test.
which(is.na(Auto))
sum(is.na(Auto))
mean(is.na(Auto))

### b) What variables are numerical (continuous) or factors (categorical)?

# Compact structural overview of the Auto dataset
glimpse(Auto)

# Class of every column
sapply(Auto, class)

# Per-column summary
summary(Auto)

# Working copy used by the following steps
Auto_C <- Auto

# Frequency tables for the two columns that are recoded as factors further on
table(Auto_C[["cylinders"]])

table(Auto_C[["origin"]])

### c) Report the mean and standard deviation for each continuous variable in the data

# Metric table for the numeric variables (funModeling::profiling_num)
profiling_num(Auto_C)

# Named logical vector flagging which columns are numeric
Auto_1 <- vapply(Auto_C, is.numeric, logical(1))
Auto_1

# d) Remove the 5th through 55th observation. What is the range, mean and
# standard deviation?
# Rows 5..55 are dropped and only the numeric columns are kept; each column is
# then reduced to min, max, mean and sd, rounded to two decimals.
Auto_2 <- sapply(
  Auto[-(5:55), Auto_1],
  function(col) round(c(min(col), max(col), mean(col), sd(col)), 2)
)
rownames(Auto_2) <- c("min", "max", "mean", "sd")
Auto_2

# e) In the full Auto dataset, are there any variables you would consider removing,
# or representing differently? Why?

# Per-variable status table (funModeling::status)
status(Auto)

# Numeric-only copy: the 9th column is dropped (described further down in this
# script as the 'name' variable)
Auto_p1 <- Auto_C[, -9]

# Same data with cylinders and origin recoded as labelled factors; labels are
# attached to the sorted distinct values of each column
Auto_p <- mutate(
  Auto_p1,
  cylinders = factor(
    cylinders,
    labels = c("cyl_3", "cyl_4", "cyl_5", "cyl_6", "cyl_8")
  ),
  origin = factor(
    origin,
    labels = c("American", "European", "Japanese")
  )
)

# f) In the full Auto dataset, graphically explore the relationships between the
# variables in the data set.

## Histograms of the numeric variables, for a view of their distributions
options(repr.plot.width = 7, repr.plot.height = 7, repr.plot.res = 200)
plot_num(Auto_p)

## Description of the overall dataset, including lowest and highest values and
## value frequencies. Both Hmisc and psych export describe(), and psych is
## attached after Hmisc, so an unqualified call resolves to psych::describe().
## The call is namespaced to get the Hmisc report described here.
Hmisc::describe(Auto_p)

summary(Auto_p) ## statistical summary of all the variables

head(data.matrix(Auto_p)) ## first 6 rows of the data converted to a numeric matrix



## Pairs plot of every variable except column 8 of Auto_p, with the points
## filled by the origin factor's integer code
options(repr.plot.width = 9, repr.plot.height = 9, repr.plot.res = 200)
pairs.panels(
  Auto_p[, -8],
  main = "Pairs plot on Auto dataset, unclassed on Origin",
  pch = 21,
  bg = c("blue", "green", "yellow")[as.integer(Auto_p$origin)],
  hist.col = "red"
)

# pairs(Auto_p, main = "Pairwise plot") ## pairwise plot between any two variables

## Correlation values on a chart matrix, combined with histograms and scatter
## plots of the different features
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 200)
chart.Correlation(Auto_p1, histogram = TRUE, pch = 15)


## Figure width, height and resolution for the next plot
options(repr.plot.width = 5, repr.plot.height = 5, repr.plot.res = 200)

## Correlation matrix drawn as circles in the lower triangle; the colour range
## conveys the strength of each correlation
M <- cor(Auto_p1)
corrplot(M, method = "circle", type = "lower")


## Pull rquery_cormat into the session for further exploration of the data.
## NOTE(review): this executes code downloaded over plain HTTP at run time;
## consider vendoring a reviewed copy of the script instead.
source("http://www.sthda.com/upload/rquery_cormat.r")

## Heatmap of the correlation matrix, followed by a colour-tile corrplot
options(repr.plot.width = 5, repr.plot.height = 5, repr.plot.res = 200)
rquery.cormat(Auto_p1, graphType = "heatmap")
corrplot(M, method = "color")

## Heatmap of the standardised data (scale() centres and scales by default)
options(repr.plot.width = 7, repr.plot.height = 7, repr.plot.res = 200)
Auto_S <- as.data.frame(scale(Auto_p1))
heatmap.2(
  as.matrix(Auto_S),
  scale = "none",
  col = bluered(100),
  trace = "none",
  density.info = "none"
)


## Correlation and p-values between any two variables, without a plot
rquery.cormat(Auto_p1, type = "flatten", graph = FALSE)

## Overall summary of the data with the 'name' variable removed
summary(Auto_p1)

# Origin as a labelled factor; labels are attached to the sorted distinct
# values of Auto$origin
origin_C <- factor(Auto$origin, labels = c("American", "European", "Japanese"))

# Violin + box plot of mpg broken down by origin.
# ggplot() replaces the deprecated qplot() shortcut (which was also called with
# a stray empty argument); the layers and axis labels are the same.
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 230)
ggplot(Auto_C, aes(x = origin_C, y = mpg)) +
  geom_violin() +
  geom_boxplot(width = 0.1, color = "blue") +
  xlab("Different Origin") +
  theme_bw()

# Cylinder count as a labelled factor
cylinders_C <- factor(Auto$cylinders, labels = c("c_3", "c_4", "c_5", "c_6", "c_8"))

# Violin + box plot of mpg broken down by number of cylinders
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 230)
ggplot(Auto_C, aes(x = cylinders_C, y = mpg)) +
  geom_violin() +
  geom_boxplot(width = 0.1, color = "red") +
  xlab("Number of cylinders") +
  theme_bw()

# Horsepower distribution in three side-by-side panels: box plot, histogram
# with a kernel density overlay, and histogram of log(horsepower)
par(mfrow = c(1, 3))
dens <- density(Auto_p$horsepower)
options(repr.plot.width = 9, repr.plot.height = 3, repr.plot.res = 350)
boxplot(Auto_p$horsepower, main = "Detecting horsepower outliers",
        xlab = "horsepower ", ylab = "value")
# probability = TRUE draws the histogram on the density scale so the density
# curve can be overlaid; TRUE is spelled out because `T` can be reassigned
hist(Auto_p$horsepower, breaks = 15, xlab = "horsepower distribution ",
     main = "Histogram of horsepower", probability = TRUE)
lines(dens)
hist(log(Auto_p$horsepower), breaks = 15, xlab = "horsepower log distribution ",
     main = "Histogram of horsepower in log scale")

## Identify the outliers of the horsepower variable
# outl_hp  : the outlying horsepower VALUES reported by boxplot.stats()
# outl_hpw : the ROW POSITIONS in Auto_p that hold those values
outl_hp <- boxplot.stats(Auto_p$horsepower)$out
outl_hpw <- which(Auto_p$horsepower %in% outl_hp)
outl_hpw
outl_hp

## Replace the outliers by the median of horsepower in the Auto_p dataset.
# The median is taken once, before any replacement. Rows are selected by their
# positions (outl_hpw). Matching horsepower values against row positions, or
# using `outl_hpw >= 200` as a recycled row mask, overwrites unrelated rows and
# can leave the real outliers untouched.
hp_median <- median(Auto_p$horsepower)
Auto_p[outl_hpw, "horsepower"] <- hp_median

## Relationship between mpg and the year variable; the author's reading of the
## plots is that performance (mpg) increases as the years pass.
par(mfrow = c(1, 2))
options(repr.plot.width = 14, repr.plot.height = 5, repr.plot.res = 200)

# Left panel: scatter plot of mpg against year with a lowess trend line on top
plot(mpg ~ year, data = Auto_p, xlab = "year", ylab = "Performance in mpg")
lines(lowess(Auto_p$year, Auto_p$mpg), col = "red", lwd = 2)

# Right panel: one box of mpg per year
boxplot(mpg ~ year, data = Auto_p, xlab = "Year", ylab = "Performance in mpg")

options(repr.plot.width = 4, repr.plot.height = 3, repr.plot.res = 300)

# Keep only the observations from years 70 and 82.
# %in% tests every row against both years. `year == c(70, 82)` would recycle
# the pair along the column (odd rows vs 70, even rows vs 82) and silently
# drop about half of the matching rows.
fil_Auto_C1 <- filter(Auto_p, year %in% c(70, 82))

# Year as a factor whose labels match the filtered values (70 -> 1970,
# 82 -> 1982). origin is left as is: it is already a labelled factor in Auto_p.
fil_Auto_C1$year <- factor(
  fil_Auto_C1$year,
  levels = c(70, 82),
  labels = c("1970", "1982")
)

# Box plots of mpg per origin, dodged and coloured by year
comb_plot <- ggplot(fil_Auto_C1, aes(x = origin, y = mpg, fill = year)) +
  geom_boxplot(aes(color = year), width = 0.07, position = position_dodge(0.65)) +
  theme_bw() +
  xlab("Origin") +
  ylab("mpg")

comb_plot

################## exploring weight variable from normal scale to logscale ########################
par(mfrow = c(1, 2))
options(repr.plot.width = 14, repr.plot.height = 5, repr.plot.res = 200)

# Left panel: mpg against weight on the original scale, with a lowess trend
plot(mpg ~ weight, data = Auto_C, xlab = "weight", ylab = "Performance in mpg")
lines(lowess(Auto_p$weight, Auto_p$mpg), col = "red", lwd = 2)

# Right panel: the same relationship after taking logs of both variables
plot(log(mpg) ~ log(weight), data = Auto_C,
     xlab = "log of weight", ylab = "log of mpg")
lines(lowess(log(Auto_p$weight), log(Auto_p$mpg)), col = "red", lwd = 2)

# g) In the full Auto dataset, consider the variable mpg. You are going to
# create a new categorical variable for MPG, which has the categories:
# {low, med, high}. Call this variable "my_mpg", and create a new_Auto dataset,
# which contains all of the Auto variables, and your new variable "my_mpg".

# Cut points: the 33rd and 66th percentiles of mpg, printed for reference
mpg_cuts <- quantile(Auto$mpg, probs = c(0.33, 0.66))
mpg_cuts

# new_Auto starts from the processed data (Auto_p) and gains my_mpg.
# cut() bins on the computed quantiles rather than hand-copied rounded
# thresholds, assigns every observation (boundary values included) to exactly
# one bin, and returns a factor whose levels are ordered low < med < high.
new_Auto <- Auto_p
new_Auto$my_mpg <- cut(
  Auto$mpg,
  breaks = c(-Inf, unname(mpg_cuts), Inf),
  labels = c("low", "med", "high")
)
table(my_mpg = new_Auto$my_mpg)

# Horsepower per my_mpg category: jittered points with a translucent box plot
# drawn on top
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
ggplot(data = new_Auto, mapping = aes(x = my_mpg, y = horsepower)) +
  # color is set outside aes() so the points are actually drawn in blue.
  # Inside aes() the string "blue" is treated as a data value, which yields the
  # default palette colour plus a legend that then has to be hidden.
  geom_jitter(color = "blue", alpha = 0.9) +
  geom_boxplot(fill = "bisque", color = "black", alpha = 0.5) +
  labs(x = "mpg", y = "Horsepower") +
  theme_bw()

# (g) Save the dataset as an *.RData file and submit it with your assignment.
# save() writes R's own serialized format, which load() restores as the object
# `new_Auto`. write.table() would instead produce a plain-text table that
# merely carries an .Rdata file name.
save(new_Auto, file = "new_Auto.Rdata")