Lme tweaks #104

Closed
wants to merge 10 commits
9 changes: 3 additions & 6 deletions gneiss/regression/_mixedlm.py
@@ -188,11 +188,8 @@ def __init__(self, *args, **kwargs):

def fit(self, **kwargs):
""" Fit the model """
for s in self.submodels:
# assumes that the underlying submodels have implemented `fit`.
# TODO: Add regularized fit
m = s.fit(**kwargs)
self.results.append(m)
self.results = [s.fit(**kwargs) for s in self.submodels]


def summary(self, ndim=10):
""" Summarize the Ordinary Least Squares Regression Results.
@@ -225,7 +222,7 @@ def summary(self, ndim=10):
pvals.insert(0, ' ', ['pvalue']*pvals.shape[0])
scores = pd.concat((coefs, pvals))
scores = scores.sort_values(by=' ', ascending=False)
scores = scores.sort_index()
scores = scores.sort_index(kind='mergesort')

def _format(x):
# format scores to be printable
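The switch to `kind='mergesort'` matters because mergesort is stable: after the rows are ordered by the tag column with `sort_values`, a stable `sort_index` regroups them by balance without scrambling the row order within each index, whereas the default quicksort gives no such guarantee. A minimal sketch with a made-up frame (the tag values and column names here are illustrative only):

```python
import pandas as pd

# Toy version of the scores table built in summary(): two rows per balance,
# tagged in the ' ' column.
scores = pd.DataFrame({' ': ['coefficient', 'pvalue', 'coefficient', 'pvalue'],
                       'x1': [1.2, 0.03, -0.5, 0.40]},
                      index=['y0', 'y0', 'y1', 'y1'])

scores = scores.sort_values(by=' ', ascending=False)  # 'pvalue' rows first
# Stable sort: rows sharing an index keep the order established above.
scores = scores.sort_index(kind='mergesort')
```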
168 changes: 146 additions & 22 deletions gneiss/regression/_ols.py
@@ -7,13 +7,23 @@
# ----------------------------------------------------------------------------
from decimal import Decimal
from collections import OrderedDict

import numpy as np
import pandas as pd
from gneiss.regression._model import RegressionModel
from ._regression import (_intersect_of_table_metadata_tree,
_to_balances)
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import Summary
from statsmodels.sandbox.tools.cross_val import LeaveOneOut
from patsy import dmatrix


def _fit_ols(y, x, **kwargs):
""" Perform the basic ols regression."""
# mixed effects code is obtained here:
# http://stackoverflow.com/a/22439820/1167475
submodels = [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns]
return submodels
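The helper only instantiates one ordinary least squares model per balance column against a shared design matrix; the actual fitting happens later in `OLSModel.fit`. A minimal sketch of the same pattern using `statsmodels.api` directly, with made-up data:

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Shared design matrix (exog) and two response columns (balances).
x = pd.DataFrame({'Intercept': np.ones(5), 'age': [1., 2., 3., 4., 5.]})
y = pd.DataFrame({'y0': [0.1, 0.3, 0.2, 0.5, 0.4],
                  'y1': [1.0, 0.8, 0.9, 0.7, 0.6]})

submodels = [sm.OLS(endog=y[b], exog=x) for b in y.columns]
results = [m.fit() for m in submodels]  # one RegressionResults per balance
```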


# TODO: Register as qiime 2 method
@@ -155,18 +165,10 @@ def ols(formula, table, metadata, tree, **kwargs):
metadata,
tree)
ilr_table, basis = _to_balances(table, tree)
data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)

submodels = []

for b in ilr_table.columns:
# mixed effects code is obtained here:
# http://stackoverflow.com/a/22439820/1167475
stats_formula = '%s ~ %s' % (b, formula)

mdf = smf.ols(stats_formula, data=data, **kwargs)
submodels.append(mdf)

ilr_table, metadata = ilr_table.align(metadata, join='inner', axis=0)
# one-time creation of exogenous data matrix allows for faster run-time
x = dmatrix(formula, metadata, return_type='dataframe')
submodels = _fit_ols(ilr_table, x)
return OLSModel(submodels, basis=basis,
balances=ilr_table,
tree=tree)
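The rewritten `ols` first inner-aligns the balance table and metadata on sample IDs, then expands the formula into a single exogenous design matrix with `patsy.dmatrix`, so the matrix is built once instead of being re-parsed for every balance. A small sketch of that step with hypothetical metadata:

```python
import pandas as pd
from patsy import dmatrix

metadata = pd.DataFrame({'age': [1., 2., 3.],
                         'sex': ['M', 'F', 'M']},
                        index=['s1', 's2', 's3'])

# One design matrix for all balance regressions: an Intercept column plus
# the encoded covariates from the formula.
x = dmatrix('age + sex', metadata, return_type='dataframe')
print(x.columns.tolist())
```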
@@ -203,12 +205,20 @@ def __init__(self, *args, **kwargs):
"""
super().__init__(*args, **kwargs)

def fit(self, **kwargs):
""" Fit the model """
for s in self.submodels:
# assumes that the underlying submodels have implemented `fit`.
m = s.fit(**kwargs)
self.results.append(m)
def fit(self, regularized=False, **kwargs):
""" Fit the model.

Parameters
----------
regularized : bool
Specifies if a regularization procedure should be used
when performing the fit. (default = False)
**kwargs : dict
Keyword arguments used to tune the parameter estimation.

"""
# assumes that the underlying submodels have implemented `fit`.
self.results = [s.fit(**kwargs) for s in self.submodels]

def summary(self, ndim=10):
""" Summarize the Ordinary Least Squares Regression Results.
@@ -278,15 +288,13 @@ def _format(x):
smry.add_dict(info, ncols=1)
smry.add_title("Simplicial Least Squares Results")
smry.add_df(scores, align='r')

return smry

@property
def r2(self):
""" Coefficient of determination for overall fit"""
# Reason why we wanted to move this out was because not
# all types of statsmodels regressions have this property.

# See `statsmodels.regression.linear_model.RegressionResults`
# for more explanation on `ess` and `ssr`.
# sum of squares regression. Also referred to as
@@ -295,6 +303,122 @@ def r2(self):
# sum of squares error. Also referred to as sum of squares residuals
sse = sum([r.ssr for r in self.results])
# calculate the overall coefficient of determination (i.e. R2)

sst = sse + ssr
return 1 - sse / sst
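In statsmodels' naming, `ssr` is the residual sum of squares and `ess` the explained sum of squares of a single balance regression, so the property returns one pooled coefficient of determination over all balances rather than an average of per-balance R² values:

$$
R^2 = 1 - \frac{\mathrm{SSE}}{\mathrm{SST}}, \qquad
\mathrm{SSE} = \sum_b \mathrm{ssr}_b, \qquad
\mathrm{SST} = \mathrm{SSE} + \sum_b \mathrm{ess}_b
$$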

@property
def mse(self):
""" Mean Sum of squares Error"""
sse = sum([r.ssr for r in self.results])
dfe = self.results[0].df_resid
return sse / dfe

def loo(self, **kwargs):
""" Leave one out cross-validation.

Calculates summary statistics for each iteration of
leave one out cross-validation, specifically `mse` on the entire model
and `pred_err` to measure prediction error.

Parameters
----------
**kwargs : dict
Keyword arguments used to tune the parameter estimation.

Returns
-------
pd.DataFrame
mse : np.array, float
Mean sum of squares error for each iteration of
the cross validation.
pred_err : np.array, float
Prediction mean sum of squares error for each iteration of
the cross validation.

See Also
--------
fit
statsmodels.regression.linear_model.
"""

nobs = self.balances.shape[0] # number of observations (i.e. samples)
cv_iter = LeaveOneOut(nobs)
endog = self.balances
exog_names = self.results[0].model.exog_names
exog = pd.DataFrame(self.results[0].model.exog,
index=self.balances.index,
columns=exog_names)
results = pd.DataFrame(index=self.balances.index,
columns=['mse', 'pred_err'])

for i, (inidx, outidx) in enumerate(cv_iter):
sample_id = self.balances.index[i]
res_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs)
res_i = [r.fit(**kwargs) for r in res_i]

# mean sum of squares error
sse = sum([r.ssr for r in res_i])
# degrees of freedom for residuals
dfe = res_i[0].df_resid
results.loc[sample_id, 'mse'] = sse / dfe

# prediction error on loo point
predicted = np.hstack([r.predict(exog.loc[outidx]) for r in res_i])

pred_sse = np.sum((predicted - self.balances.loc[outidx])**2)
results.loc[sample_id, 'pred_err'] = pred_sse.sum()
return results
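A hypothetical usage sketch: `table`, `metadata`, and `tree` below are stand-ins for the `self.y`, `self.x`, and `self.t2` fixtures built in the tests further down. After fitting, `loo()` refits the model with each sample held out in turn and returns one row per sample.

```python
from gneiss.regression._ols import ols

model = ols(formula='x1 + x2 + x3 + x4', table=table,
            metadata=metadata, tree=tree)
model.fit()
cv = model.loo()
# Samples with a large `pred_err` relative to `mse` are the ones the
# model predicts poorly when they are held out.
print(cv.sort_values('pred_err', ascending=False).head())
```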

def lovo(self, **kwargs):
""" Leave one variable out cross-validation.

Calculates summary statistics for each iteration of leave one variable
out cross-validation, specifically `r2` and `mse` on the entire model.
This technique is particularly useful for feature selection.

Parameters
----------
**kwargs : dict
Keyword arguments used to tune the parameter estimation.

Returns
-------
pd.DataFrame
Rsquared : np.array, float
Coefficient of determination for each variable left out.
mse : np.array, float
Mean sum of squares error for each iteration of
the cross validation.
"""
endog = self.balances
exog_names = self.results[0].model.exog_names
exog = pd.DataFrame(self.results[0].model.exog,
index=self.balances.index,
columns=exog_names)
cv_iter = LeaveOneOut(len(exog_names))
results = pd.DataFrame(index=exog_names,
columns=['mse', 'Rsquared'])
for i, (inidx, outidx) in enumerate(cv_iter):
feature_id = exog_names[i]
res_i = _fit_ols(endog, exog.loc[:, inidx], **kwargs)
res_i = [r.fit(**kwargs) for r in res_i]
# See `statsmodels.regression.linear_model.RegressionResults`
# for more explanation on `ess` and `ssr`.
# sum of squares regression.
ssr = sum([r.ess for r in res_i])
# sum of squares error.
sse = sum([r.ssr for r in res_i])
# calculate the overall coefficient of determination (i.e. R2)
sst = sse + ssr
results.loc[feature_id, 'Rsquared'] = 1 - sse / sst
# degrees of freedom for residuals
dfe = res_i[0].df_resid
results.loc[feature_id, 'mse'] = sse / dfe
return results
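Continuing the sketch above, `lovo()` refits the model once per term with that term removed. A term whose row still shows a high `Rsquared` (the fit barely suffers without it) contributes little and is a candidate for dropping.

```python
cv = model.lovo()
# Rows are indexed by the left-out term (including the Intercept); sort so
# the most dispensable covariates come first.
print(cv.sort_values('Rsquared', ascending=False))
```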

def percent_explained(self):
""" Proportion explained by each principal balance."""
# Using sum of squares error calculation (df=1)
# instead of population variance (df=0).
axis_vars = np.var(self.balances, ddof=1, axis=0)
return axis_vars / axis_vars.sum()
66 changes: 66 additions & 0 deletions gneiss/regression/tests/test_ols.py
@@ -33,6 +33,20 @@ def setUp(self):
'real': [1, 2, 3, 4, 5]
}, index=['s1', 's2', 's3', 's4', 's5'])

np.random.seed(0)
n = 15
a = np.array([1, 4.2, 5.3, -2.2, 8])
x1 = np.linspace(.01, 0.1, n)
x2 = np.logspace(0, 0.01, n)
x3 = np.exp(np.linspace(0, 0.01, n))
x4 = x1 ** 2
self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4})
y = (a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 +
np.random.normal(size=n))
sy = np.vstack((y, y/10)).T
self.y = pd.DataFrame(ilr_inv(sy), columns=['a', 'b', 'c'])
self.t2 = TreeNode.read([r"((a,b)n,c);"])

def test_ols(self):
res = ols('real', self.table, self.metadata, self.tree)
res.fit()
@@ -226,6 +240,58 @@ def test_summary_head(self):
exp = fh.read()
self.assertEqual(res, exp)

def test_loo(self):
res = ols(formula="x1 + x2 + x3 + x4",
table=self.y, metadata=self.x, tree=self.t2)
res.fit()
exp_loo = pd.DataFrame([[0.66953263510975791, 10.994700550912553],
[0.69679777354984163, 2.3613911713947062],
[0.84934173316473072, 0.4057812892157881],
[0.6990546679957772, 2.2872776593899351],
[0.72855466737125463, 1.7615637744849277],
[0.55998953661859308, 3.617823652256889],
[0.81787392852582308, 0.72395497360494043],
[0.8653549732546999, 0.17706927499520822],
[0.86983181933002329, 0.1216027316667969],
[0.87779006612352628, 0.028600627330344405],
[0.86591226075609384, 0.16724511075065476],
[0.7787232221539, 1.2820054843437292],
[0.88032413856094505, 3.4113910096200831e-06],
[0.83195133809800792, 0.62276589277034022],
[0.85352707356786695, 1.4038585971691198]],
columns=['mse', 'pred_err'],
index=self.y.index)
res_loo = res.loo().astype(np.float)
pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True)

def test_lovo(self):
res = ols(formula="x1 + x2 + x3 + x4",
table=self.y, metadata=self.x, tree=self.t2)
res.fit()
exp_lovo = pd.DataFrame([[0.799364, 0.978214],
[0.799363, 0.097355],
[0.799368, 0.0973498],
[0.799364, 0.097354],
[0.799361, 0.0973575]],
columns=['mse', 'Rsquared'],
index=['Intercept', 'x1', 'x2', 'x3', 'x4'])
res_lovo = res.lovo().astype(np.float)
pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True)

def test_percent_explained(self):
res = ols(formula="x1 + x2 + x3 + x4",
table=self.y, metadata=self.x, tree=self.t2)
res.fit()
res_perc = res.percent_explained()
exp_perc = pd.Series({'y0': 0.009901,
'y1': 0.990099})
pdt.assert_series_equal(res_perc, exp_perc)

def test_mse(self):
res = ols(formula="x1 + x2 + x3 + x4",
table=self.y, metadata=self.x, tree=self.t2)
res.fit()
self.assertAlmostEqual(res.mse, 0.79228890379010453, places=4)

if __name__ == "__main__":
unittest.main()