wasade · mortonjt · Jul 11, 2016 · Jul 16, 2016 · Jul 16, 2016 · Jul 16, 2016
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,10 @@
+# gneiss changelog
+
+## Version 0.0.2 (changes since 0.0.1 go here)
+
+### Features
+* Adding in a niche sorting algorithm `gneiss.sort.niche_sort` that can generate a band table given a gradient [#16](https://github.com/biocore/gneiss/pull/16)
+* Adding in utility functions for handling feature tables, metadata, and trees. [#12](https://github.com/biocore/gneiss/pull/12)
+* Adding GPL license.
+
+### Bug fixes
diff --git a/COPYING.txt b/COPYING.txt
diff --git a/gneiss/__init__.py b/gneiss/__init__.py
@@ -1,12 +1,12 @@
 # ----------------------------------------------------------------------------
 # Copyright (c) 2016--, gneiss development team.
 #
-# Distributed under the terms of the Modified BSD License.
+# Distributed under the terms of the GPLv3 License.
 #
 # The full license is in the file COPYING.txt, distributed with this software.
 # ----------------------------------------------------------------------------
 
 from __future__ import absolute_import, division, print_function
 
 
-__version__ = "0.0.1"
+__version__ = "0.0.2"
diff --git a/gneiss/balances.py b/gneiss/balances.py
@@ -10,25 +10,37 @@
 def _balance_basis(tree_node):
     """ Helper method for calculating balance basis
     """
-    counts, n_tips = _count_matrix(tree_node)
-    counts = OrderedDict([(x, counts[x])
-                          for x in counts.keys() if not x.is_tip()])
-    nds = counts.keys()
-    r = np.array([counts[n]['r'] for n in nds])
-    s = np.array([counts[n]['l'] for n in nds])
-    k = np.array([counts[n]['k'] for n in nds])
-    t = np.array([counts[n]['t'] for n in nds])
+    # TODO: use recarray
+    # col 0 -> right counts
+    # col 1 -> left counts
+    # col 2 -> k
+    # col 3 -> t
+    r_idx = 0
+    l_idx = 1
+    k_idx = 2
+    t_idx = 3
+
+    # `counts` holds one row per node, in level order (tips included), with
+    # the four columns listed above.
+    counts, n_tips, n_nodes = _count_matrix(tree_node)
+    r = counts[:, r_idx]
+    s = counts[:, l_idx]
+    k = counts[:, k_idx]
+    t = counts[:, t_idx]
+    # NOTE(review): `_count_matrix` leaves the rows that belong to tips as
+    # zeros, so `a` and `b` below evaluate 0/0 for those rows -- confirm that
+    # the resulting values/warnings are acceptable.
 
     a = np.sqrt(s / (r*(r+s)))
     b = -1*np.sqrt(r / (s*(r+s)))
 
     basis = np.zeros((n_tips-1, n_tips))
-    for i in range(len(nds)):
-        basis[i, :] = np.array([0]*k[i] + [a[i]]*r[i] + [b[i]]*s[i] + [0]*t[i])
-    # Make sure that the basis is in level order
-    basis = basis[:, ::-1]
-    nds = list(nds)
-    return basis, nds
+    # NOTE(review): `i` walks the first (n_nodes - n_tips) rows of `counts`,
+    # but those rows are level-order positions that also include tips; this
+    # presumably assumes internal nodes come first in level order -- confirm
+    # against trees where a tip precedes an internal node.
+    for i in np.arange(n_nodes - n_tips, dtype=int):
+        # `v` is a view of row `i`, so the slice assignments below write
+        # straight into `basis`.
+        v = basis[i]
+
+        # Column bounds are measured back from the last column: skip the
+        # `k[i]` trailing columns, then the right count, then the left count.
+        k_i = n_tips - k[i]
+        r_i = k_i - r[i]
+        s_i = r_i - s[i]
+
+        # columns [r_i, k_i) receive `a`; columns [s_i, r_i) receive `b`
+        v[r_i:k_i] = a[i]
+        v[s_i:r_i] = b[i]
+
+    # second return value: the internal (non-tip) nodes in level order
+    return basis, [n for n in tree_node.levelorder() if not n.is_tip()]
 
 
 def balance_basis(tree_node):
@@ -90,51 +102,63 @@ def balance_basis(tree_node):
 
 
 def _count_matrix(treenode):
-    n_tips = 0
-    nodes = list(treenode.levelorder(include_self=True))
-    # fill in the Ordered dictionary. Note that the
-    # elements of this Ordered dictionary are
-    # dictionaries.
-    counts = OrderedDict()
-    columns = ['k', 'r', 'l', 't', 'tips']
-    for n in nodes:
-        if n not in counts:
-            counts[n] = {}
-        for c in columns:
-            counts[n][c] = 0
-
-    # fill in r and l.  This is done in reverse level order.
-    for n in nodes[::-1]:
+    """ Tabulate the per-node tip counts used to build the balance basis.
+
+    Parameters
+    ----------
+    treenode : tree node
+        Root of a strictly bifurcating tree.  The node API used here is
+        `postorder`, `levelorder`, `is_tip`, `is_root`, `children` and
+        `parent`.
+
+    Returns
+    -------
+    counts : np.array of int, shape (node_count, 4)
+        One row per node, in level order.  Column 0 is the number of tips
+        under the right child, column 1 the number of tips under the left
+        child, column 2 (`k`) the accumulated tip count of the right-hand
+        sibling clades of the node and its ancestors, and column 3 (`t`)
+        the accumulated tip count of the left-hand sibling clades.  Rows
+        that belong to tips are left as zeros.
+    n_tips : int
+        Number of tips below `treenode`.
+    node_count : int
+        Total number of nodes visited (tips and internal nodes).
+
+    Raises
+    ------
+    ValueError
+        If an internal node does not have exactly two children.
+
+    Notes
+    -----
+    As a side effect, `_tip_count` is set on every node and `_lo_idx`
+    (the level-order position) is set on every internal node.
+    """
+    # First pass (postorder, so children are seen before their parent):
+    # count the nodes and cache the number of tips under each node.
+    node_count = 0
+    for n in treenode.postorder(include_self=True):
+        node_count += 1
         if n.is_tip():
-            counts[n]['tips'] = 1
-            n_tips += 1
-        elif len(n.children) == 2:
-            lchild = n.children[0]
-            rchild = n.children[1]
-            counts[n]['r'] = counts[rchild]['tips']
-            counts[n]['l'] = counts[lchild]['tips']
-            counts[n]['tips'] = counts[n]['r'] + counts[n]['l']
+            n._tip_count = 1
         else:
-            raise ValueError("Not a strictly bifurcating tree!")
-
-    # fill in k and t
-    for n in nodes:
-        if n.parent is None:
-            counts[n]['k'] = 0
-            counts[n]['t'] = 0
-            continue
-        elif n.is_tip():
+            # Unpacking fails with ValueError unless there are exactly two
+            # children; catch only that so unrelated errors still surface.
+            try:
+                left, right = n.children
+            except ValueError:
+                raise ValueError("Not a strictly bifurcating tree!")
+            n._tip_count = left._tip_count + right._tip_count
+
+    # TODO: use recarray
+    # col 0 -> right counts
+    # col 1 -> left counts
+    # col 2 -> k
+    # col 3 -> t
+    r_idx = 0
+    l_idx = 1
+    k_idx = 2
+    t_idx = 3
+    counts = np.zeros((node_count, 4), dtype=int)
+
+    # Second pass (level order, so a parent's row is filled before its
+    # children need it).
+    for i, n in enumerate(treenode.levelorder(include_self=True)):
+        if n.is_tip():
             continue
-        # left or right child
-        # left = 0, right = 1
-        child_idx = 'l' if n.parent.children[0] != n else 'r'
-        if child_idx == 'l':
-            counts[n]['t'] = counts[n.parent]['t'] + counts[n.parent]['l']
-            counts[n]['k'] = counts[n.parent]['k']
+
+        n._lo_idx = i
+        # `node_counts` is a view of row `i`; writes land directly in
+        # `counts`, so no write-back is needed.
+        node_counts = counts[i]
+
+        # Only internal nodes reach this point, so both children exist.
+        node_counts[r_idx] = n.children[1]._tip_count
+        node_counts[l_idx] = n.children[0]._tip_count
+
+        if n.is_root():
+            k = 0
+            t = 0
         else:
-            counts[n]['k'] = counts[n.parent]['k'] + counts[n.parent]['r']
-            counts[n]['t'] = counts[n.parent]['t']
-    return counts, n_tips
+            parent_counts = counts[n.parent._lo_idx]
+            if n is n.parent.children[0]:
+                # left child: the parent's right clade is added to `k`
+                k = parent_counts[k_idx] + parent_counts[r_idx]
+                t = parent_counts[t_idx]
+            else:
+                # right child: the parent's left clade is added to `t`
+                k = parent_counts[k_idx]
+                t = parent_counts[t_idx] + parent_counts[l_idx]
+
+        node_counts[k_idx] = k
+        node_counts[t_idx] = t
+
+    return counts, treenode._tip_count, node_count
 
 
 def _attach_balances(balances, tree):

diff --git a/gneiss/layouts.py b/gneiss/layouts.py
@@ -1,3 +1,11 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016--, gneiss development team.
+#
+# Distributed under the terms of the GPLv3 License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
 from ete3 import faces, AttrFace, CircleFace, BarChartFace
 
 

diff --git a/gneiss/sort.py b/gneiss/sort.py
@@ -0,0 +1,116 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016--, gneiss development team.
+#
+# Distributed under the terms of the GPLv3 License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+import numpy as np
+import pandas as pd
+from functools import partial
+from gneiss.util import match
+
+
+def mean_niche_estimator(abundances, gradient):
+    r""" Estimates the mean niche of an organism.
+
+    Calculates the mean niche of an organism along a gradient.
+    This is done by calculating the expected value of the gradient,
+    weighted by the abundance of the organism in each sample.
+
+    Specifically, this function calculates the following
+
+    .. math::
+        E[g | x] =
+         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}
+
+    Where :math:`N` is the number of samples, :math:`x_i` is the proportion of
+    species :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value
+    at sample :math:`i`.
+
+    Parameters
+    ----------
+    abundances : pd.Series, np.float
+        Vector of fraction abundances of an organism over
+        a list of samples.
+    gradient : pd.Series, np.float
+        Vector of numerical gradient values.
+
+    Returns
+    -------
+    np.float :
+        The mean gradient that the organism lives in.
+
+    Raises
+    ------
+    ValueError:
+        If the length of `abundances` is not the same length as `gradient`.
+    ValueError:
+        If `gradient` contains nans.
+    """
+    # NOTE: the docstring is a raw string so that the LaTeX backslashes
+    # (e.g. `\frac`, `\sum`) are not interpreted as escape sequences.
+    len_abundances = len(abundances)
+    len_gradient = len(gradient)
+    if len_abundances != len_gradient:
+        raise ValueError("Length of `abundances` (%d) doesn't match the length"
+                         " of the `gradient` (%d)" % (len_abundances,
+                                                      len_gradient))
+    if np.any(pd.isnull(gradient)):
+        raise ValueError("`gradient` cannot have any nans.")
+
+    # normalizes the proportions of the organism across all of the
+    # samples to add to 1.
+    v = abundances / abundances.sum()
+    # expected gradient value under the normalized weights
+    m = np.dot(gradient, v)
+    return m
+
+
+def niche_sort(table, gradient, niche_estimator=mean_niche_estimator):
+    """ Order a table by gradient position and by estimated feature niche.
+
+    Rows (samples) are arranged by increasing gradient value, and columns
+    (features, i.e. OTUs) by increasing estimated niche along that gradient.
+
+    Parameters
+    ----------
+    table : pd.DataFrame
+        Contingency table where samples are rows and
+        features (i.e. OTUs) are columns.
+    gradient : pd.Series
+        Vector of numerical gradient values.
+    niche_estimator : function, optional
+        Callable invoked once per feature with that feature's column of
+        per-sample proportions and the keyword argument `gradient`.  Its
+        return values must be orderable, because the features are sorted
+        by them.  By default, `mean_niche_estimator` will be used.
+
+    Returns
+    -------
+    pd.DataFrame :
+        The table reindexed so that samples follow the sorted gradient and
+        features follow their sorted niche estimates.
+
+    Raises
+    ------
+    ValueError :
+        Raised if `niche_estimator` is not callable.
+    """
+    if not callable(niche_estimator):
+        raise ValueError("`niche_estimator` is not a function.")
+
+    # `match` comes from gneiss.util; presumably it aligns the table and the
+    # gradient on their shared samples -- confirm against that module.
+    table, gradient = match(table, gradient)
+
+    def _row_proportions(row):
+        # scale one sample (row) so that its feature abundances sum to 1
+        return row / row.sum()
+
+    proportions = table.apply(_row_proportions, axis=1)
+
+    # one niche estimate per feature (column), with the gradient bound in
+    estimate = partial(niche_estimator, gradient=gradient)
+    niches = proportions.apply(estimate, axis=0)
+
+    sample_order = gradient.sort_values().index
+    feature_order = niches.sort_values().index
+    return table.reindex(index=sample_order, columns=feature_order)
diff --git a/gneiss/tests/data/large_tree2.nwk b/gneiss/tests/data/large_tree2.nwk
diff --git a/gneiss/tests/data/small_tree.nwk b/gneiss/tests/data/small_tree.nwk
@@ -0,0 +1 @@
+(O0:0.316212845735,(O1:0.0544249673249,((((O8:6.17283950617e-05,O9:6.17283950617e-05):0.000396077412446,(O6:0.00015943877551,O7:0.00015943877551):0.000298367031998):0.00183128228143,(O4:0.000555555555556,O5:0.000555555555556):0.00173353253338):0.0105194882737,(O2:0.00347222222222,O3:0.00347222222222):0.00933635414042):0.0416163909622):0.26178787841);
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		(O0:0.316212845735,(O1:0.0544249673249,((((O8:6.17283950617e-05,O9:6.17283950617e-05):0.000396077412446,(O6:0.00015943877551,O7:0.00015943877551):0.000298367031998):0.00183128228143,(O4:0.000555555555556,O5:0.000555555555556):0.00173353253338):0.0105194882737,(O2:0.00347222222222,O3:0.00347222222222):0.00933635414042):0.0416163909622):0.26178787841);