-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlinear-regression.py
72 lines (54 loc) · 2.33 KB
/
linear-regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model, metrics
import csv
from copy import deepcopy
def load_csv(filename):
    """Read a comma-separated file of numbers into a NumPy float array.

    Fields are split on ``,`` with ``csv.QUOTE_NONE`` (quote characters get
    no special treatment), and the parsed rows are converted to ``float``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``numpy.ndarray`` of floats built from the rows of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the handle even when parsing raises.
    # newline='' lets the csv module do its own line-ending handling.
    with open(filename, 'rt', newline='') as raw_data:
        reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
        rows = list(reader)
    return np.array(rows).astype('float')
def splitDataset(dataset, splitratio):
    """Randomly split the rows of ``dataset`` into a training and a test set.

    Rows are drawn one at a time, without replacement, using
    ``np.random.randint`` on the number of rows still available, so the
    split is reproducible after ``np.random.seed(...)``.

    Args:
        dataset: 2-D NumPy array with one observation per row. It is not
            modified.
        splitratio: Fraction of the rows to put in the training set; the
            training size is ``round(rows * splitratio)``.

    Returns:
        A ``(trainset, testset)`` tuple. ``trainset`` is a float array
        holding the drawn rows in the order they were drawn; ``testset`` is
        a copy of the rows that were not drawn, in their original order.

    Raises:
        ValueError: If the computed training size is negative or larger
            than the number of rows in ``dataset``.
    """
    total_rows = dataset.shape[0]
    trainsize = int(np.round(total_rows * splitratio))
    if not 0 <= trainsize <= total_rows:
        raise ValueError(
            'splitratio {!r} gives a training size of {} for {} rows'.format(
                splitratio, trainsize, total_rows))

    # Track row indices instead of deleting rows from a shrinking copy:
    # np.delete re-copies the whole remaining array on every draw. Popping
    # position `pick` from the list of remaining indices selects exactly the
    # row that indexing the shrunken array at `pick` would have selected.
    remaining = list(range(total_rows))
    chosen = []
    for _ in range(trainsize):
        pick = np.random.randint(0, len(remaining))
        chosen.append(remaining.pop(pick))

    # The training set is always a float array, whatever dtype came in.
    trainset = np.zeros((trainsize, dataset.shape[1]))
    trainset[:] = dataset[np.asarray(chosen, dtype=np.intp)]
    # Integer-array indexing returns a fresh copy, so the input is untouched.
    testset = dataset[np.asarray(remaining, dtype=np.intp)]
    return trainset, testset
# Load the data and split it 70/30 into training and test sets. The last
# column is used as the regression target, all preceding columns as features.
data = load_csv('data.csv')
trainset, testset = splitDataset(data, 0.7)
x_train = trainset[:, :-1]
y_train = trainset[:, -1]
x_test = testset[:, :-1]
y_test = testset[:, -1]

# create linear regression object
reg = linear_model.LinearRegression()

# train the model using the training sets
reg.fit(x_train, y_train)

# Predict once per split and reuse the results for the metrics and the plot
# instead of calling reg.predict repeatedly on the same inputs.
train_pred = reg.predict(x_train)
test_pred = reg.predict(x_test)

# regression coefficients
print('Coefficients: \n', reg.coef_)

# score of the fitted model on the test data: 1 means perfect prediction
print('Variance score: {}'.format(reg.score(x_test, y_test)))

# mean squared error
print('Mean Squared Error of test data: ', metrics.mean_squared_error(y_test, test_pred))
print('Mean Squared Error of train data: ', metrics.mean_squared_error(y_train, train_pred))

# plot for residual error

## setting plot style
plt.style.use('fivethirtyeight')

## plotting residual errors in training data
plt.scatter(train_pred, train_pred - y_train,
            color="green", s=10, label='Train data')

## plotting residual errors in test data
plt.scatter(test_pred, test_pred - y_test,
            color="blue", s=10, label='Test data')

## plotting line for zero residual error; it spans the predicted values
## actually plotted rather than a fixed 0..50 range, which only fits data
## whose predictions happen to fall in that interval
line_start = min(train_pred.min(), test_pred.min())
line_end = max(train_pred.max(), test_pred.max())
plt.hlines(y=0, xmin=line_start, xmax=line_end, linewidth=2)

## plotting legend
plt.legend(loc='upper right')

## plot title
plt.title("Residual errors")

## function to show plot
plt.show()