-
Notifications
You must be signed in to change notification settings - Fork 47
/
Copy pathpreprocessing.py
49 lines (41 loc) · 1.49 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import numpy as np
import pandas as pd
def loadDataset(filename, delimiter=';'):
    """Load a delimited text file and split it into features and target.

    The last column of the file is used as the target; all preceding
    columns are used as features.

    Args:
        filename: Path of the file to read. It is passed unchanged to
            ``pandas.read_csv``.
        delimiter: Field separator used in the file. Defaults to ``';'``,
            the separator this function previously hard-coded.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays, where ``X`` contains every
        column except the last and ``y`` contains the last column.
    """
    baseDeDados = pd.read_csv(filename, delimiter=delimiter)
    X = baseDeDados.iloc[:, :-1].values
    y = baseDeDados.iloc[:, -1].values
    return X, y
def fillMissingData(X, inicioColuna, fimColuna):
    """Impute NaN values in a contiguous range of columns of ``X``.

    The selected columns are run through scikit-learn's ``SimpleImputer``
    configured with ``missing_values=np.nan`` and ``strategy='median'``,
    and the result is written back into ``X``.

    Args:
        X: 2-D array to impute. It is modified in place.
        inicioColuna: Index of the first column to impute.
        fimColuna: Index of the last column to impute (inclusive).

    Returns:
        The same ``X`` object that was passed in, after the update.
    """
    from sklearn.impute import SimpleImputer

    # fimColuna is inclusive, so the slice has to end one position later.
    target_columns = slice(inicioColuna, fimColuna + 1)
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, target_columns] = median_imputer.fit_transform(X[:, target_columns])
    return X
# Works for any column index i; columns before and after i are preserved.
def computeCategorization(X, i):
    """One-hot encode column ``i`` of ``X`` and drop one dummy column.

    Column ``i`` is label-encoded, expanded into dummy (one-hot) columns,
    and the dummy of the first category is discarded to avoid the dummy
    variable trap. The remaining dummies take the place of column ``i``;
    all other columns keep their relative order.

    Ordering of the dummy columns is kept from the previous implementation
    so existing callers see the same layout: ascending category order when
    ``i == 0``, descending category order otherwise.

    Args:
        X: 2-D NumPy array. It is not modified.
        i: Index of the categorical column. Negative values count from
            the last column.

    Returns:
        A new 2-D array with column ``i`` replaced by its dummy columns
        (one fewer than the number of categories), cast to ``X.dtype``.

    Raises:
        IndexError: If ``i`` is out of range for the columns of ``X``.
    """
    from sklearn.preprocessing import LabelEncoder

    # Normalise a negative index so the i == 0 test and the slices below
    # always refer to the real column position.
    if i < 0:
        i += X.shape[1]

    # Encode into a separate array instead of writing the labels back into
    # the caller's X.
    codes = LabelEncoder().fit_transform(X[:, i])

    # One-hot encoding: one column per category, in ascending label order.
    D = pd.get_dummies(codes).values

    # Avoid the dummy variable trap by dropping the first category's dummy.
    if i == 0:
        dummies = D[:, 1:]
    else:
        # Reverse order, stopping before column 0 (legacy column layout).
        dummies = D[:, :0:-1]

    # Match X's dtype explicitly, as np.insert did in the old code.
    dummies = dummies.astype(X.dtype, copy=False)
    return np.concatenate((X[:, :i], dummies, X[:, i + 1:]), axis=1)
def splitTrainTestSets(X, y, testSize):
    """Split features and target into train and test subsets.

    Delegates to scikit-learn's ``train_test_split``.

    Args:
        X: Feature data.
        y: Target data.
        testSize: Value forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        A tuple ``(XTrain, XTest, yTrain, yTest)``.
    """
    from sklearn.model_selection import train_test_split

    # train_test_split returns a list; callers of this function get a tuple.
    subsets = train_test_split(X, y, test_size=testSize)
    return tuple(subsets)
def computeScaling(train, test):
    """Standardize train and test data with a scaler fitted on train only.

    A ``StandardScaler`` is fitted on ``train`` and the same fitted scaler
    is then applied to ``test``, so both sets are scaled with identical
    parameters.

    Args:
        train: Training data used to fit the scaler.
        test: Test data, transformed with the parameters learned from
            ``train``.

    Returns:
        A tuple ``(train, test)`` holding the scaled versions of the inputs.
    """
    from sklearn.preprocessing import StandardScaler

    scaleX = StandardScaler()
    train = scaleX.fit_transform(train)
    # Only transform here: refitting on the test set would scale it with
    # different parameters than the training set and leak test statistics.
    test = scaleX.transform(test)
    return train, test