Lme tweaks #104

Closed
wants to merge 10 commits
9 changes: 3 additions & 6 deletions gneiss/regression/_mixedlm.py
@@ -188,11 +188,8 @@ def __init__(self, *args, **kwargs):

def fit(self, **kwargs):
""" Fit the model """
for s in self.submodels:
# assumes that the underlying submodels have implemented `fit`.
# TODO: Add regularized fit
m = s.fit(**kwargs)
self.results.append(m)
self.results = [s.fit(**kwargs) for s in self.submodels]


def summary(self, ndim=10):
""" Summarize the Ordinary Least Squares Regression Results.
@@ -225,7 +222,7 @@ def summary(self, ndim=10):
pvals.insert(0, ' ', ['pvalue']*pvals.shape[0])
scores = pd.concat((coefs, pvals))
scores = scores.sort_values(by=' ', ascending=False)
scores = scores.sort_index()
scores = scores.sort_index(kind='mergesort')

def _format(x):
# format scores to be printable
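The switch to `kind='mergesort'` matters because mergesort is stable: after the rows are ordered by the tag column with `sort_values`, a stable `sort_index` regroups them by balance without scrambling the row order within each index, whereas the default quicksort gives no such guarantee. A minimal sketch with a made-up frame (the tag values and column names here are illustrative only):

```python
import pandas as pd

# Toy version of the scores table built in summary(): two rows per balance,
# tagged in the ' ' column.
scores = pd.DataFrame({' ': ['coefficient', 'pvalue', 'coefficient', 'pvalue'],
                       'x1': [1.2, 0.03, -0.5, 0.40]},
                      index=['y0', 'y0', 'y1', 'y1'])

scores = scores.sort_values(by=' ', ascending=False)  # 'pvalue' rows first
# Stable sort: rows sharing an index keep the order established above.
scores = scores.sort_index(kind='mergesort')
```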
168 changes: 146 additions & 22 deletions gneiss/regression/_ols.py
@@ -7,13 +7,23 @@
# ----------------------------------------------------------------------------
from decimal import Decimal
from collections import OrderedDict

import numpy as np
import pandas as pd
from gneiss.regression._model import RegressionModel
from ._regression import (_intersect_of_table_metadata_tree,
_to_balances)
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import Summary
from statsmodels.sandbox.tools.cross_val import LeaveOneOut
from patsy import dmatrix


def _fit_ols(y, x, **kwargs):
""" Perform the basic ols regression."""
# mixed effects code is obtained here:
# http://stackoverflow.com/a/22439820/1167475
submodels = [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns]
return submodels
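The helper only instantiates one ordinary least squares model per balance column against a shared design matrix; the actual fitting happens later in `OLSModel.fit`. A minimal sketch of the same pattern using `statsmodels.api` directly, with made-up data:

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Shared design matrix (exog) and two response columns (balances).
x = pd.DataFrame({'Intercept': np.ones(5), 'age': [1., 2., 3., 4., 5.]})
y = pd.DataFrame({'y0': [0.1, 0.3, 0.2, 0.5, 0.4],
                  'y1': [1.0, 0.8, 0.9, 0.7, 0.6]})

submodels = [sm.OLS(endog=y[b], exog=x) for b in y.columns]
results = [m.fit() for m in submodels]  # one RegressionResults per balance
```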


# TODO: Register as qiime 2 method
@@ -155,18 +165,10 @@ def ols(formula, table, metadata, tree, **kwargs):
metadata,
tree)
ilr_table, basis = _to_balances(table, tree)
data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)

submodels = []

for b in ilr_table.columns:
# mixed effects code is obtained here:
# http://stackoverflow.com/a/22439820/1167475
stats_formula = '%s ~ %s' % (b, formula)

mdf = smf.ols(stats_formula, data=data, **kwargs)
submodels.append(mdf)

ilr_table, metadata = ilr_table.align(metadata, join='inner', axis=0)
# one-time creation of exogenous data matrix allows for faster run-time
x = dmatrix(formula, metadata, return_type='dataframe')
submodels = _fit_ols(ilr_table, x)
return OLSModel(submodels, basis=basis,
balances=ilr_table,
tree=tree)
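The rewritten `ols` first inner-aligns the balance table and metadata on sample IDs, then expands the formula into a single exogenous design matrix with `patsy.dmatrix`, so the matrix is built once instead of being re-parsed for every balance. A small sketch of that step with hypothetical metadata:

```python
import pandas as pd
from patsy import dmatrix

metadata = pd.DataFrame({'age': [1., 2., 3.],
                         'sex': ['M', 'F', 'M']},
                        index=['s1', 's2', 's3'])

# One design matrix for all balance regressions: an Intercept column plus
# the encoded covariates from the formula.
x = dmatrix('age + sex', metadata, return_type='dataframe')
print(x.columns.tolist())
```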
@@ -203,12 +205,20 @@ def __init__(self, *args, **kwargs):
"""
super().__init__(*args, **kwargs)

def fit(self, **kwargs):
""" Fit the model """
for s in self.submodels:
# assumes that the underlying submodels have implemented `fit`.
m = s.fit(**kwargs)
self.results.append(m)
def fit(self, regularized=False, **kwargs):
""" Fit the model.

Parameters
----------
regularized : bool
Specifies if a regularization procedure should be used
when performing the fit. (default = False)
**kwargs : dict
Keyword arguments used to tune the parameter estimation.

"""
# assumes that the underlying submodels have implemented `fit`.
self.results = [s.fit(**kwargs) for s in self.submodels]

def summary(self, ndim=10):
""" Summarize the Ordinary Least Squares Regression Results.
@@ -278,15 +288,13 @@ def _format(x):
smry.add_dict(info, ncols=1)
smry.add_title("Simplicial Least Squares Results")
smry.add_df(scores, align='r')

return smry

@property
def r2(self):
""" Coefficient of determination for overall fit"""
# Reason why we wanted to move this out was because not
# all types of statsmodels regressions have this property.

# See `statsmodels.regression.linear_model.RegressionResults`
# for more explanation on `ess` and `ssr`.
# sum of squares regression. Also referred to as
@@ -295,6 +303,122 @@ def r2(self):
# sum of squares error. Also referred to as sum of squares residuals
sse = sum([r.ssr for r in self.results])
# calculate the overall coefficient of determination (i.e. R2)

sst = sse + ssr
return 1 - sse / sst
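In statsmodels' naming, `ssr` is the residual sum of squares and `ess` the explained sum of squares of a single balance regression, so the property returns one pooled coefficient of determination over all balances rather than an average of per-balance R² values:

$$
R^2 = 1 - \frac{\mathrm{SSE}}{\mathrm{SST}}, \qquad
\mathrm{SSE} = \sum_b \mathrm{ssr}_b, \qquad
\mathrm{SST} = \mathrm{SSE} + \sum_b \mathrm{ess}_b
$$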

@property
def mse(self):
""" Mean Sum of squares Error"""
sse = sum([r.ssr for r in self.results])
dfe = self.results[0].df_resid
return sse / dfe

def loo(self, **kwargs):
""" Leave one out cross-validation.

Calculates summary statistics for each iteration of
leave one out cross-validation, specifically `mse` on the entire model
and `pred_err` to measure prediction error.

Parameters
----------
**kwargs : dict
Keyword arguments used to tune the parameter estimation.

Returns
-------
pd.DataFrame
mse : np.array, float
Mean sum of squares error for each iteration of
the cross validation.
pred_err : np.array, float
Prediction mean sum of squares error for each iteration of
the cross validation.

See Also
--------
fit
statsmodels.regression.linear_model.
"""

nobs = self.balances.shape[0] # number of observations (i.e. samples)
cv_iter = LeaveOneOut(nobs)
endog = self.balances
exog_names = self.results[0].model.exog_names
exog = pd.DataFrame(self.results[0].model.exog,
index=self.balances.index,
columns=exog_names)
results = pd.DataFrame(index=self.balances.index,
columns=['mse', 'pred_err'])

for i, (inidx, outidx) in enumerate(cv_iter):
sample_id = self.balances.index[i]
res_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs)
res_i = [r.fit(**kwargs) for r in res_i]

# mean sum of squares error
sse = sum([r.ssr for r in res_i])
# degrees of freedom for residuals
dfe = res_i[0].df_resid
results.loc[sample_id, 'mse'] = sse / dfe

# prediction error on loo point
predicted = np.hstack([r.predict(exog.loc[outidx]) for r in res_i])

pred_sse = np.sum((predicted - self.balances.loc[outidx])**2)
results.loc[sample_id, 'pred_err'] = pred_sse.sum()
return results
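A hypothetical usage sketch: `table`, `metadata`, and `tree` below are stand-ins for the `self.y`, `self.x`, and `self.t2` fixtures built in the tests further down. After fitting, `loo()` refits the model with each sample held out in turn and returns one row per sample.

```python
from gneiss.regression._ols import ols

model = ols(formula='x1 + x2 + x3 + x4', table=table,
            metadata=metadata, tree=tree)
model.fit()
cv = model.loo()
# Samples with a large `pred_err` relative to `mse` are the ones the
# model predicts poorly when they are held out.
print(cv.sort_values('pred_err', ascending=False).head())
```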

def lovo(self, **kwargs):
""" Leave one variable out cross-validation.

Calculates summary statistics for each iteration of leave one variable
out cross-validation, specifically `r2` and `mse` on the entire model.
This technique is particularly useful for feature selection.

Parameters
----------
**kwargs : dict
Keyword arguments used to tune the parameter estimation.

Returns
-------
pd.DataFrame
Rsquared : np.array, float
Coefficient of determination for each variable left out.
mse : np.array, float
Mean sum of squares error for each iteration of
the cross validation.
"""
endog = self.balances
exog_names = self.results[0].model.exog_names
exog = pd.DataFrame(self.results[0].model.exog,
index=self.balances.index,
columns=exog_names)
cv_iter = LeaveOneOut(len(exog_names))
results = pd.DataFrame(index=exog_names,
columns=['mse', 'Rsquared'])
for i, (inidx, outidx) in enumerate(cv_iter):
feature_id = exog_names[i]
res_i = _fit_ols(endog, exog.loc[:, inidx], **kwargs)
res_i = [r.fit(**kwargs) for r in res_i]
# See `statsmodels.regression.linear_model.RegressionResults`
# for more explanation on `ess` and `ssr`.
# sum of squares regression.
ssr = sum([r.ess for r in res_i])
# sum of squares error.
sse = sum([r.ssr for r in res_i])
# calculate the overall coefficient of determination (i.e. R2)
sst = sse + ssr
results.loc[feature_id, 'Rsquared'] = 1 - sse / sst
# degrees of freedom for residuals
dfe = res_i[0].df_resid
results.loc[feature_id, 'mse'] = sse / dfe
return results
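Continuing the sketch above, `lovo()` refits the model once per term with that term removed. A term whose row still shows a high `Rsquared` (the fit barely suffers without it) contributes little and is a candidate for dropping.

```python
cv = model.lovo()
# Rows are indexed by the left-out term (including the Intercept); sort so
# the most dispensable covariates come first.
print(cv.sort_values('Rsquared', ascending=False))
```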

def percent_explained(self):
""" Proportion explained by each principal balance."""
# Using sum of squares error calculation (df=1)
# instead of population variance (df=0).
axis_vars = np.var(self.balances, ddof=1, axis=0)
return axis_vars / axis_vars.sum()
66 changes: 66 additions & 0 deletions gneiss/regression/tests/test_ols.py
@@ -33,6 +33,20 @@ def setUp(self):
'real': [1, 2, 3, 4, 5]
}, index=['s1', 's2', 's3', 's4', 's5'])

np.random.seed(0)
n = 15
a = np.array([1, 4.2, 5.3, -2.2, 8])
x1 = np.linspace(.01, 0.1, n)
x2 = np.logspace(0, 0.01, n)
x3 = np.exp(np.linspace(0, 0.01, n))
x4 = x1 ** 2
self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4})
y = (a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 +
np.random.normal(size=n))
sy = np.vstack((y, y/10)).T
self.y = pd.DataFrame(ilr_inv(sy), columns=['a', 'b', 'c'])
self.t2 = TreeNode.read([r"((a,b)n,c);"])

def test_ols(self):
res = ols('real', self.table, self.metadata, self.tree)
res.fit()
@@ -226,6 +240,58 @@ def test_summary_head(self):
exp = fh.read()
self.assertEqual(res, exp)

def test_loo(self):
res = ols(formula="x1 + x2 + x3 + x4",
table=self.y, metadata=self.x, tree=self.t2)
res.fit()
exp_loo = pd.DataFrame([[0.66953263510975791, 10.994700550912553],
[0.69679777354984163, 2.3613911713947062],
[0.84934173316473072, 0.4057812892157881],
[0.6990546679957772, 2.2872776593899351],
[0.72855466737125463, 1.7615637744849277],
[0.55998953661859308, 3.617823652256889],
[0.81787392852582308, 0.72395497360494043],
[0.8653549732546999, 0.17706927499520822],
[0.86983181933002329, 0.1216027316667969],
[0.87779006612352628, 0.028600627330344405],
[0.86591226075609384, 0.16724511075065476],
[0.7787232221539, 1.2820054843437292],
[0.88032413856094505, 3.4113910096200831e-06],
[0.83195133809800792, 0.62276589277034022],
[0.85352707356786695, 1.4038585971691198]],
columns=['mse', 'pred_err'],
index=self.y.index)
res_loo = res.loo().astype(np.float)
pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True)

def test_lovo(self):
res = ols(formula="x1 + x2 + x3 + x4",
table=self.y, metadata=self.x, tree=self.t2)
res.fit()
exp_lovo = pd.DataFrame([[0.799364, 0.978214],
[0.799363, 0.097355],
[0.799368, 0.0973498],
[0.799364, 0.097354],
[0.799361, 0.0973575]],
columns=['mse', 'Rsquared'],
index=['Intercept', 'x1', 'x2', 'x3', 'x4'])
res_lovo = res.lovo().astype(np.float)
pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True)

def test_percent_explained(self):
res = ols(formula="x1 + x2 + x3 + x4",
table=self.y, metadata=self.x, tree=self.t2)
res.fit()
res_perc = res.percent_explained()
exp_perc = pd.Series({'y0': 0.009901,
'y1': 0.990099})
pdt.assert_series_equal(res_perc, exp_perc)

def test_mse(self):
res = ols(formula="x1 + x2 + x3 + x4",
table=self.y, metadata=self.x, tree=self.t2)
res.fit()
self.assertAlmostEqual(res.mse, 0.79228890379010453, places=4)

if __name__ == "__main__":
unittest.main()