-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare.py
96 lines (59 loc) · 3.25 KB
/
prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# prepare file
import os
import pandas as pd
import numpy as np
from scipy import stats
from env import host, user, password
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
def remove_outlier(df, columns=("sqft", "baths", "beds", "tax_value"), threshold=3):
    """Return the rows of *df* that are not outliers in any of *columns*.

    A row is kept only when the absolute z-score of every listed column is
    strictly below *threshold*. Z-scores are computed per column over the
    full, unfiltered *df*, so the result does not depend on column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain every name in *columns*. Not modified.
    columns : iterable of str, optional
        Columns to screen for outliers.
    threshold : float, optional
        Rows with ``abs(zscore) >= threshold`` in any column are dropped.

    Returns
    -------
    pandas.DataFrame
        The subset of *df* passing every filter, original index preserved.

    Notes
    -----
    A NaN z-score (missing value, or a column with zero variance) compares
    as False against the threshold, so such rows are dropped.
    """
    # Start by keeping everything, then AND in one mask per column. The
    # earlier version reassigned the result for each column, so only the
    # last column's filter ever took effect.
    keep = np.ones(len(df), dtype=bool)
    for col in columns:
        z_scores = np.asarray(stats.zscore(df[col]))
        keep &= np.abs(z_scores) < threshold
    return df[keep]
def train_validate_test(df, target):
    """Split *df* into train/validate/test sets and their X/y parts.

    The frame is split twice with a fixed ``random_state`` of 123:
    20% is held out as test, then 30% of the remainder becomes validate
    (so roughly 56% train, 24% validate, 20% test).

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set, including the *target* column.
    target : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(train, validate, test, X_train, y_train, X_validate,
        y_validate, X_test, y_test)`` where each ``X_*`` is the matching
        frame without *target* and each ``y_*`` is the *target* column.
    """
    # Hold out the test set first, then carve validate out of what is left.
    remainder, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(remainder, test_size=.3, random_state=123)

    def _features_and_label(frame):
        # X keeps every column except the target; y is the target alone.
        return frame.drop(columns=[target]), frame[target]

    X_train, y_train = _features_and_label(train)
    X_validate, y_validate = _features_and_label(validate)
    X_test, y_test = _features_and_label(test)

    return (
        train, validate, test,
        X_train, y_train,
        X_validate, y_validate,
        X_test, y_test,
    )
def get_object_cols(df):
    """Return the names of the columns in *df* whose dtype is ``object``.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list
        Column labels, in their original order.
    """
    # Boolean flag per column: True where the dtype compares equal to "object".
    is_object = (df.dtypes == "object").to_numpy()
    return df.columns[is_object].tolist()
def get_numeric_X_cols(X_train, object_cols):
    """Return the columns of *X_train* that are not listed in *object_cols*.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame whose column labels are filtered.
    object_cols : list
        Column labels to exclude.

    Returns
    -------
    list
        Remaining column labels, in their original order.
    """
    numeric_cols = []
    for name in X_train.columns.values:
        if name in object_cols:
            continue
        numeric_cols.append(name)
    return numeric_cols
def min_max_scale(X_train, X_validate, X_test, numeric_cols):
    """Min-max scale *numeric_cols* of the three splits.

    The scaler is fitted on ``X_train[numeric_cols]`` only, and that same
    fitted scaler is then used to transform all three frames.

    Parameters
    ----------
    X_train, X_validate, X_test : pandas.DataFrame
        Feature frames; each must contain every name in *numeric_cols*.
    numeric_cols : list
        Columns to scale.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_validate_scaled, X_test_scaled)``, each holding
        only *numeric_cols* and indexed by the source frame's index values.
    """
    # copy=True makes the scaler work on a copy rather than scaling in place.
    scaler = MinMaxScaler(copy=True)
    scaler.fit(X_train[numeric_cols])

    def _scaled_frame(source):
        # transform() returns a bare array; wrap it back into a DataFrame
        # and restore the source frame's index values.
        scaled_values = scaler.transform(source[numeric_cols])
        frame = pd.DataFrame(scaled_values, columns=numeric_cols)
        return frame.set_index([source.index.values])

    X_train_scaled = _scaled_frame(X_train)
    X_validate_scaled = _scaled_frame(X_validate)
    X_test_scaled = _scaled_frame(X_test)
    return X_train_scaled, X_validate_scaled, X_test_scaled