-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_preprocessing_final.py
executable file
·230 lines (155 loc) · 8.38 KB
/
data_preprocessing_final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
"""
This file imports the csv file with the Lending Club data and does the
following:
1) High level cleaning as far as fixing data types, E.g. 19.29% is
converted to 0.1929
2) Subsets the data
3) Adds some basic measures E.g. debt to income after the loan is added,
current monthly debt payments, calculates net gain as loan amount - received
interest, principal and late fees. Net gain was calculated in this fashion
as it was a straight forward measure that didn't require using anticipated
principle + interest, as a customer who paid their loan back early would
"technically" be counted as a loss, since 100% of principle would be paid
back, but less than the anticipated interest would've been paid.
4) Fills in 0 for NaN in scenarios where it's a simple fix and doesn't
require a lot of analysis. E.g. NaNs for customers without a credit
card balance
Overall this file does a lot of the heavy lifting for subsetting and
cleaning the data in order to make the next level of feature engineering,
cleaning, EDA and machine learning simpler. The subsetted columns are based
on a preliminary analysis of the data, focusing on the columns that Lending
Club would've when the customer first applied for the loan and getting rid of
the columns that contained data gathered over the course of the customer paying
back the loan.
"""
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
def import_data(file_name: str) -> object:
# function that imports data file with Lending Club approved loan data
lending = pd.read_csv(file_name, low_memory=False)
return (lending)
def subset_data(df: pd.DataFrame) -> pd.Dataframe:
lending_subset = df[['funded_amnt', 'term', 'int_rate', 'installment',
'grade', 'emp_length', 'home_ownership', 'annual_inc',
'verification_status', 'issue_d', 'loan_status',
'purpose', 'addr_state', 'dti', 'delinq_2yrs',
'earliest_cr_line', 'fico_range_low',
'inq_last_6mths', 'mths_since_last_delinq',
'open_acc', 'pub_rec', 'pub_rec_bankruptcies',
'revol_bal', 'revol_util', 'total_acc', 'total_pymnt',
'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
'total_rec_late_fee', 'collections_12_mths_ex_med',
'application_type', 'acc_now_delinq',
'chargeoff_within_12_mths', 'acc_open_past_24mths',
'avg_cur_bal', 'delinq_amnt', 'mo_sin_old_rev_tl_op',
'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',
'mths_since_recent_inq',
'mths_since_recent_revol_delinq',
'num_accts_ever_120_pd', 'num_il_tl',
'num_tl_120dpd_2m', 'pct_tl_nvr_dlq', 'tot_coll_amt',
'tot_cur_bal', 'total_bal_ex_mort']]
return (lending_subset)
def fix_data_types(data: pd.DataFrame) -> pd.DataFrame:
# Fix columns that are the wrong data type or have characters we
# don't need like '%'
# fix interest rate
data.loc[:, 'int_rate'] = data.loc[:, 'int_rate'].\
str.strip('%').astype('float64')
data.loc[:, 'int_rate'] = data.loc[:, 'int_rate'] / 100
# fix revolving utilization
data.loc[:, 'revol_util'] = data.loc[:, 'revol_util'].\
str.strip('%').astype('float64')
data.loc[:, 'revol_util'] = data.loc[:, 'revol_util'] / 100
# extract digit from employment length and
# create column with employment in months
data['emp_length'] = data.emp_length.str.extract(r'(\d+)')
# got errors prior, so we'll drop the rows with "NaN" in
# emp_length as it's critical to credit scoring
data.dropna(subset=['emp_length'], inplace=True)
# reset the data frame index since we deleted rows
data.reset_index()
# convert values in employment length column to integers
data['emp_length'] = data['emp_length'].astype(int)
# create new column with employment length in months
data.loc[:, 'emp_length_months'] = data.loc[:, 'emp_length'] * 12
# convert the DTI column to a decimal
data.loc[:, 'dti_dec'] = data.loc[:, 'dti'] / 100
# convert issue date and earliest credit line columns to dates
# used dt.floor to just extract the date due to receiving out
# of bounds errors via the typical pandas date conversion methods
data['issue_d'] = pd.to_datetime(data['issue_d'],
format='%b-%y',
errors='coerce').dt.floor('D')
data['earliest_cr_line'] = pd.to_datetime(data['earliest_cr_line'],
format='%b-%y',
errors='coerce').dt.floor('D')
# create length of credit history column in months
data['length_of_credit_history'] = ((data['issue_d'] -
data['earliest_cr_line'])
/ np.timedelta64(1, 'M'))
data['length_of_credit_history'] =\
data['length_of_credit_history'].astype(int)
return data
def fix_dates(data: pd.DataFrame) -> pd.DataFrame:
# common for dates to mistakenly get entered in as
# "future dates" for earliest credit line this function removes those rows
data = data[(data['length_of_credit_history'] >= 0)]
return (data)
def measures(data: pd.DataFrame) -> pd.DataFrame:
# calculate monthly income
data.loc[:, 'monthly_income'] = (data.loc[:, 'annual_inc'] / 12)
# calculate monthly debt payments
data.loc[:, 'monthly_debt_payments'] = (data.loc[:, 'dti_dec'] *
data.loc[:, 'monthly_income'])
# monthly debt payments cost loan
data.loc[:, 'updated_monthly_debt_payments'] = (data.loc[:,
'monthly_debt_payments']
+ data.loc[:,
'installment'])
# calculate lost principle (defaulted loans)
data.loc[:, 'lost_principle'] = (data.loc[:, 'funded_amnt']
- data.loc[:, 'total_rec_prncp'])
# calculate # of payments received per loan
data.loc[:, 'total_payments'] = (data.loc[:, 'total_pymnt']
/ data.loc[:, 'installment'])
# updated DTI
data.loc[:, 'post_loan_dti'] = (data.loc[:,
'updated_monthly_debt_payments']
/ data.loc[:, 'monthly_income'])
data.loc[:, 'net_gain'] = ((data.loc[:, 'total_rec_prncp'] +
data.loc[:, 'total_rec_int'] +
data.loc[:, 'total_rec_late_fee']) -
data.loc[:, 'funded_amnt'])
return data
def fix_na(data: pd.DataFrame) -> pd.DataFrame:
# fill in NaNs with zeroes for those columns as they won't require
# data transforms E.g. situations like months since last delinquency
# where having a zero is better than but 30 is better 1, but not as
# good as having had zero delinquencies.
# the zeros are customers without credit card balances
data.loc[:, 'revol_util'].fillna(0, inplace=True)
# similar to the above, if there are no inquiries, when the
# credit is pulled no data comes across so a NaN occurs, at
# least that's the working presumption
data.loc[:, 'mths_since_recent_inq'].fillna(0, inplace=True)
# if you have had no significant past dues, there would be nothing
# to enter, hence the NaNs. This was verified by digging through the data,
# customers with NaNs here had higher FICO scores and zeroes in
# related columns
data.loc[:, 'num_tl_120dpd_2m'].fillna(0, inplace=True)
return data
def write_file(data: pd.DataFrame, filename: str) -> str:
data.to_csv(filename, date_format='%Y-%m-%d', index=False)
return (print('file processing complete'))
def main():
lending = import_data('data/LC_2015_Data.csv')
subset = subset_data(lending)
fixed_data = fix_data_types(subset)
fixed_data = fix_dates(fixed_data)
final_data = measures(fixed_data)
final_data = fix_na(final_data)
write_file(final_data, 'data/processed_2015_loan_data_april22.csv')
if __name__ == '__main__':
main()