-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsp_regress.m
executable file
·146 lines (133 loc) · 8.05 KB
/
sp_regress.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
function [coefficient_estimates stats response_hat] = sp_regress(response, predictors, type, glmtype)
% function [coefficient_estimates stats response_hat] = sp_regress(responses, predictors, type, glmtype)
% Implements multiple linear regression using Matlab regress:
% It checks the input arguments for consistency and takes care of adding a column of ones to the predicators
% Also packs the output statistics in a more convenient way
%
% Also implements Generalized Linear Regression using glmfit
%
% INPUTS: (regression model y=b*X)
% response : [y] [n x 1] vector of observed responses
% predictors : [X] [n x p] matrix of p predictors at each of n observations
% [type] : [string] {'linear' (Default), 'glm','spline'}
% 'spline' uses the ARES package
% [glmtype] : [string] 'poisson'
%
% OUTPUTS:
% coefficient_estimates: [array]
% stats: modified output from regress/ glmfit
% response_hat: the estimated response
%
% Sagi Perel, 02/2009
if(~isvector(response))
error('sp_regress: responses must be a vector');
else
response = make_column_vector(response);
end
n1 = length(response);
n2 = size(predictors,1);
if(n1 ~= n2)
error(['sp_regress: number of rows in response (' num2str(n1) ') must match the number of rows in predictors (' num2str(n2) ')']);
end
if(~exist('type','var'))
type = 'linear';
end
if(~exist('glmtype','var'))
glmtype = 'poisson';
end
switch(type)
case 'linear'
% predicators = [ones(size(predictors,1),1) predictors];
% % [b,bint,r,rint,stats] = regress(y,X) % X- [n x p] predictors, y- [n x 1] responses
% % X is an n-by-p matrix of p predictors at each of n observations. y is an n-by-1 vector of observed responses
% [coefficient_estimates, bint , r, rint, r_stats] = regress(responses, predictors);
% % output from regress:
% % bint : p-by-2 matrix bint of 95% confidence intervals for the coefficient estimates
% % r : n-by-1 vector r of residuals
% % rint : n-by-2 matrix rint of intervals that can be used to diagnose outliers. If the interval rint(i,:) for
% % observation i does not contain zero, the corresponding residual is larger than expected in 95% of new observations, suggesting an outlier
% % stats: 1-by-4 vector : [R2 statistic, the F statistic, its p-value, estimate of the error variance]
% stats.bint = bint;
% stats.r = r;
% stats.rint = rint;
% stats.r2 = r_stats(1);
% stats.f = r_stats(2);
% stats.p = r_stats(3);
% stats.error_variance = r_stats(4);
whichstats = {'beta','r','rsquare','fstat','tstat'};
% By default, regstats adds a first column of 1s to X
stats = regstats(response, predictors,'linear',whichstats);
coefficient_estimates = stats.beta;
if(nargout > 2)
response_hat = [ones( size(predictors,1), 1) predictors] * coefficient_estimates;
end
case 'glm'
% By default, glmfit adds a first column of 1s to X
% [b,dev,stats] = glmfit(X,y,distr); % X- [n x p] predictors, y- [n x 1] responses
[coefficient_estimates, deviance, stats] = glmfit(predictors, response, glmtype);
% output from glmfit:
% deviance: the deviance of the fit at the solution vector. The deviance is a generalization of the residual sum of
% squares. It is possible to perform an analysis of deviance to compare several models, each a subset of the
% other, and to test whether the model with more terms is significantly better than the model with fewer terms.
% stats : a structure with the following fields:
% beta — Coefficient estimates b | dfe — Degrees of freedom for error | s — Theoretical or estimated dispersion parameter
% sfit — Estimated dispersion parameter | se — Vector of standard errors of the coefficient estimates b | coeffcorr — Correlation matrix for b
% covb — Estimated covariance matrix for B| t — t statistics for b | p — p-values for b
% resid — Vector of residuals | residp — Vector of Pearson residuals | residd — Vector of deviance residuals
% resida — Vector of Anscombe residuals
%
% An example for relative sizes of fields:
% beta: [385x1 double]
% dfe: 373481
% sfit: 0.9859
% s: 1
% estdisp: 0
% covb: [385x385 double]
% se: [385x1 double]
% coeffcorr: [385x385 double]
% t: [385x1 double]
% p: [385x1 double]
% resid: [373866x1 double]
% residp: [373866x1 double]
% residd: [373866x1 double]
% resida: [373866x1 double]
% deviance: 7.3563e+04
stats.deviance = deviance;
if(nargout > 2)
error('TBC');
end
case 'spline'
% remove NaN values
nan_responses_lidx = isnan(response);
nan_predictors_lidx = sum(isnan(predictors),2);
nan_lidx = nan_responses_lidx | nan_predictors_lidx;
if(any(nan_lidx))
log_disp(['sp_regress: eliminated ' num2str(sum(nan_lidx)) '/' num2str(length(response)) ' NaN values']);
end
predictors = sp_zscore(predictors(~nan_lidx,:));
response = sp_zscore(response(~nan_lidx));
stats = aresbuild(predictors, response);
coefficient_estimates = stats.coefs;
% model : The built ARES model a structure with the following elements:
% coefs : Coefficient vector of the regression model (for the intercept term and each basis function).
% knotdims : Cell array of indexes of used input variables for each knot in each basis function.
% knotsites : Cell array of knot sites for each knot and used input variable in each basis function.
% knotdirs : Cell array of directions (-1 or 1) of the hinge functions for each used input variable in each basis function.
% parents : Vector of indexes of direct parents for each basis function (0 if there is no direct parent or it is the intercept term).
% trainParams : A structure of training parameters for the algorithm (the same as in the function input).
% MSE : Mean Squared Error of the model in the training data.
% GCV : Generalized Cross-Validation (GCV) of the model in the taining data set. The value may also be Inf if model's
% effective number of parameters is larger than or equal to n.
% t1 : For piecewise-cubic models only. Matrix of knot sites for the additional knots on the left of the central knot.
% t2 : For piecewise-cubic models only. Matrix of knot sites for the additional knots on the right of the central knot.
% minX : Vector of minimums for input variables (used for t1 and t2 placements as well as for model plotting).
% maxX : Vector of maximums for input variables (used for t1 and t2 placements as well as for model plotting).
% endSpan : The used value of endSpan.
if(nargout > 2)
response_hat = sp_spline_predict(stats, predictors);
stats.rsquare = sp_compute_r2(response, response_hat);
stats.rmse = sp_compute_RMSE(response, response_hat);
end
otherwise
error('sp_regress: unknown regression type');
end