-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_cleaning.py
70 lines (52 loc) · 2.35 KB
/
data_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 20 12:15:48 2021
@author: he
"""
#importing the libraries
import pandas as pd
import numpy as np
# Load the raw train and test splits.
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

# Columns whose missing values are replaced with the most frequent value
# (mode) of that column.
_MODE_IMPUTED_COLUMNS = (
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Credit_History',
    'Loan_Amount_Term',
)


def _impute_missing(frame):
    """Fill missing values of *frame* in place and return it.

    Every column in ``_MODE_IMPUTED_COLUMNS`` is filled with that column's
    own mode; ``LoanAmount`` is filled with its own median, because the mean
    is affected by outliers.

    The filled column is assigned back explicitly instead of calling
    ``frame[col].fillna(..., inplace=True)``: the chained in-place form
    operates on a temporary object under pandas Copy-on-Write and would
    leave *frame* unchanged.

    Raises whatever ``Series.mode()[0]`` raises when a column has no
    non-missing value to take the mode from.
    """
    for column in _MODE_IMPUTED_COLUMNS:
        most_frequent = frame[column].mode()[0]
        frame[column] = frame[column].fillna(most_frequent)

    frame['LoanAmount'] = frame['LoanAmount'].fillna(
        frame['LoanAmount'].median()
    )
    return frame


# Impute each split from its own statistics.
# To inspect the remaining gaps interactively, run `train.isnull().sum()`
# or `train.Loan_Amount_Term.value_counts()` in a console.
train = _impute_missing(train)
test = _impute_missing(test)

# Outlier treatment.
# Log-transform LoanAmount to reduce the right skewness brought by outliers.
# NOTE(review): assumes LoanAmount is strictly positive; np.log yields
# -inf / NaN for zero or negative values -- confirm against the data.
train['LoanAmount_log'] = np.log(train['LoanAmount'])
test['LoanAmount_log'] = np.log(test['LoanAmount'])

# Visualize the transformed distribution.
train['LoanAmount_log'].hist(bins=20)

# Persist the cleaned splits without the index column.
train.to_csv('train_new.csv', index=False)
test.to_csv('test_new.csv', index=False)