From 9a535b5856ee9423d1471338473d72404d2a8555 Mon Sep 17 00:00:00 2001 From: Robert Edwards Date: Fri, 25 May 2018 16:29:02 -0700 Subject: [PATCH] Adding a cophenetic matrix and a test method --- ete3/coretype/tree.py | 102 +++++++++++++++++++++++++++++++++++++++++ ete3/test/test_tree.py | 78 ++++++++++++++++++++++++++++++- 2 files changed, 179 insertions(+), 1 deletion(-) diff --git a/ete3/coretype/tree.py b/ete3/coretype/tree.py index 26b1c0016..643ee7a48 100644 --- a/ete3/coretype/tree.py +++ b/ete3/coretype/tree.py @@ -2474,6 +2474,108 @@ def phonehome(self): from .. import _ph _ph.call() + def cophenetic_matrix(self): + """ + .. versionadded: 3.1.1 + + Generate a cophenetic distance matrix of the treee to standard output + + The `cophenetic matrix ` is a matrix representation of the + distance between each node. + + if we have a tree like + + ----A + _____________|y + | | + | ----B + ________|z + | ----C + | | + |____________|x -----D + | | + |______|w + | + | + -----E + + Where w,x,y,z are internal nodes. + d(A,B) = d(y,A) + d(y,B) + and + d(A, E) = d(z,A) + d(z, E) = {d(z,y) + d(y,A)} + {d(z,x) + d(x,w) + d(w,E)} + + We use an idea inspired by the ete3 team: https://gist.github.com/jhcepas/279f9009f46bf675e3a890c19191158b : + + For each node find its path to the root. + + e.g. + + A -> A, y, z + E -> E, w, x,z + + and make these orderless sets. Then we XOR the two sets to only find the elements + that are in one or other sets but not both. In this case A, E, y, x, w. + + The distance between the two nodes is the sum of the distances from each of those nodes + to the parent + + One more optimization: since the distances are symmetric, and distance to itself is zero + we user itertools.combinations rather than itertools.permutations. This cuts our computes from theta(n^2) + 1/2n^2 - n (= O(n^2), which is still not great, but in reality speeds things up for large trees). + + + For this tree, we will return the two dimensional array: + A B C D E + A 0 d(A-y) + d(B-y) d(A-z) + d(C-z) d(A-z) + d(D-z) d(A-z) + d(E-z) + B d(B-y) + d(A-y) 0 d(B-z) + d(C-z) d(B-z) + d(D-z) d(B-z) + d(E-z) + C d(C-z) + d(A-z) d(C-z) + d(B-z) 0 d(C-x) + d(D-x) d(C-x) + d(E-x) + D d(D-z) + d(A-z) d(D-z) + d(B-z) d(D-x) + d(C-x) 0 d(D-w) + d(E-w) + E d(E-z) + d(A-z) d(E-z) + d(B-z) d(E-x) + d(C-x) d(E-w) + d(D-w) 0 + + We will also return the one dimensional array with the leaves in the order in which they appear in the matrix + (i.e. the column and/or row headers). + + :param filename: the optional file to write to. If not provided, output will be to standard output + :return: two-dimensional array and a one dimensional array + """ + + leaves = self.get_leaves() + paths = {x: set() for x in leaves} + + # get the paths going up the tree + # we get all the nodes up to the last one and store them in a set + + for n in leaves: + if n.is_root(): + continue + movingnode = n + while not movingnode.is_root(): + paths[n].add(movingnode) + movingnode = movingnode.up + + # now we want to get all pairs of nodes using itertools combinations. We need AB AC etc but don't need BA CA + + leaf_distances = {x.name: {} for x in leaves} + + for (leaf1, leaf2) in itertools.combinations(leaves, 2): + # figure out the unique nodes in the path + uniquenodes = paths[leaf1] ^ paths[leaf2] + distance = sum(x.dist for x in uniquenodes) + leaf_distances[leaf1.name][leaf2.name] = leaf_distances[leaf2.name][leaf1.name] = distance + + allleaves = sorted(leaf_distances.keys()) # the leaves in order that we will return + + output = [] # the two dimensional array that we will return + + for i, n in enumerate(allleaves): + output.append([]) + for m in allleaves: + if m == n: + output[i].append(0) # distance to ourself = 0 + else: + output[i].append(leaf_distances[n][m]) + return output, allleaves + def _translate_nodes(root, *nodes): name2node = dict([ [n, None] for n in nodes if type(n) is str]) diff --git a/ete3/test/test_tree.py b/ete3/test/test_tree.py index 6842427c3..e1b16faf1 100644 --- a/ete3/test/test_tree.py +++ b/ete3/test/test_tree.py @@ -1296,7 +1296,83 @@ def test_copy(self): self.assertEqual((t_pkl & "A").complex[0], [0,1]) self.assertEqual((t_deep & "A").testfn(), "YES") - + + def test_cophenetic_matrix(self): + t = Tree(nw_full) + dists, leaves = t.cophenetic_matrix() + actualdists = [ + [0, 2.3662779999999994, 2.350554999999999, 2.7002369999999996, 3.527812, 3.305472, 2.424086, 2.424086, + 2.432288, 2.483421, 2.3355079999999995, 2.3355079999999995, 2.389350999999999, 2.3812519999999995, + 2.404005999999999, 2.3945459999999996, 2.4035289999999994, 2.3689599999999995, 2.4048339999999997, + 2.6487609999999995], + [2.3662779999999994, 0, 0.079009, 1.122461, 2.38897, 2.16663, 0.47755000000000003, 0.47755000000000003, + 0.4857520000000001, 0.5368850000000001, 0.320202, 0.32020200000000004, 0.133729, 0.12563, + 0.14838400000000002, 0.230406, 0.168047, 0.113338, 0.16935199999999997, 0.633455], + [2.350554999999999, 0.079009, 0, 1.106738, 2.373247, 2.150907, 0.461827, 0.461827, 0.47002900000000003, + 0.521162, 0.304479, 0.30447900000000006, 0.11800599999999999, 0.10990699999999998, 0.132661, 0.214683, + 0.152324, 0.09761499999999998, 0.153629, 0.617732], + [2.7002369999999996, 1.122461, 1.106738, 0, 2.7229289999999997, 2.5005889999999997, 1.180269, 1.180269, + 1.188471, 1.239604, 1.091691, 1.091691, 1.145534, 1.137435, 1.160189, 1.1507290000000001, 1.159712, + 1.125143, 1.161017, 1.404944], + [3.527812, 2.38897, 2.373247, 2.7229289999999997, 0, 2.6926, 2.446778, 2.446778, 2.45498, 2.506113, + 2.3581999999999996, 2.3581999999999996, 2.412043, 2.403944, 2.426698, 2.4172379999999998, + 2.4262209999999995, 2.391652, 2.427526, 2.6714529999999996], + [3.305472, 2.16663, 2.150907, 2.5005889999999997, 2.6926, 0, 2.224438, 2.224438, 2.23264, 2.283773, 2.13586, + 2.13586, 2.189703, 2.181604, 2.204358, 2.194898, 2.2038809999999995, 2.169312, 2.205186, 2.449113], + [2.424086, 0.47755000000000003, 0.461827, 1.180269, 2.446778, 2.224438, 0, 0.0, 0.01366, + 0.30963300000000005, 0.44678, 0.44677999999999995, 0.5006229999999999, 0.49252399999999996, 0.515278, + 0.505818, 0.5148010000000001, 0.480232, 0.5161060000000001, 0.7600329999999998], + [2.424086, 0.47755000000000003, 0.461827, 1.180269, 2.446778, 2.224438, 0.0, 0, 0.01366, + 0.30963300000000005, 0.44678, 0.44677999999999995, 0.5006229999999999, 0.49252399999999996, 0.515278, + 0.505818, 0.5148010000000001, 0.480232, 0.5161060000000001, 0.7600329999999998], + [2.432288, 0.4857520000000001, 0.47002900000000003, 1.188471, 2.45498, 2.23264, 0.01366, 0.01366, 0, + 0.317835, 0.45498200000000005, 0.454982, 0.508825, 0.500726, 0.5234800000000001, 0.51402, 0.523003, + 0.48843400000000003, 0.524308, 0.7682349999999999], + [2.483421, 0.5368850000000001, 0.521162, 1.239604, 2.506113, 2.283773, 0.30963300000000005, + 0.30963300000000005, 0.317835, 0, 0.506115, 0.506115, 0.559958, 0.551859, 0.574613, 0.565153, 0.574136, + 0.539567, 0.5754410000000001, 0.8193679999999999], + [2.3355079999999995, 0.320202, 0.304479, 1.091691, 2.3581999999999996, 2.13586, 0.44678, 0.44678, + 0.45498200000000005, 0.506115, 0, 0.0, 0.343275, 0.33517600000000003, 0.35793, 0.34847, + 0.35745299999999997, 0.322884, 0.35875799999999997, 0.531709], + [2.3355079999999995, 0.32020200000000004, 0.30447900000000006, 1.091691, 2.3581999999999996, 2.13586, + 0.44677999999999995, 0.44677999999999995, 0.454982, 0.506115, 0.0, 0, 0.34327500000000005, + 0.33517600000000003, 0.35793, 0.34847, 0.357453, 0.32288400000000006, 0.358758, 0.531709], + [2.389350999999999, 0.133729, 0.11800599999999999, 1.145534, 2.412043, 2.189703, 0.5006229999999999, + 0.5006229999999999, 0.508825, 0.559958, 0.343275, 0.34327500000000005, 0, 0.013558999999999998, 0.021967, + 0.25347900000000007, 0.19112, 0.031257, 0.192425, 0.656528], + [2.3812519999999995, 0.12563, 0.10990699999999998, 1.137435, 2.403944, 2.181604, 0.49252399999999996, + 0.49252399999999996, 0.500726, 0.551859, 0.33517600000000003, 0.33517600000000003, 0.013558999999999998, 0, + 0.028214, 0.24538000000000004, 0.183021, 0.023157999999999998, 0.184326, 0.648429], + [2.404005999999999, 0.14838400000000002, 0.132661, 1.160189, 2.426698, 2.204358, 0.515278, 0.515278, + 0.5234800000000001, 0.574613, 0.35793, 0.35793, 0.021967, 0.028214, 0, 0.26813400000000004, + 0.20577499999999999, 0.045912, 0.20708, 0.6711830000000001], + [2.3945459999999996, 0.230406, 0.214683, 1.1507290000000001, 2.4172379999999998, 2.194898, 0.505818, + 0.505818, 0.51402, 0.565153, 0.34847, 0.34847, 0.25347900000000007, 0.24538000000000004, + 0.26813400000000004, 0, 0.267657, 0.233088, 0.268962, 0.661723], + [2.4035289999999994, 0.168047, 0.152324, 1.159712, 2.4262209999999995, 2.2038809999999995, + 0.5148010000000001, 0.5148010000000001, 0.523003, 0.574136, 0.35745299999999997, 0.357453, 0.19112, + 0.183021, 0.20577499999999999, 0.267657, 0, 0.170729, 0.057269, 0.670706], + [2.3689599999999995, 0.113338, 0.09761499999999998, 1.125143, 2.391652, 2.169312, 0.480232, 0.480232, + 0.48843400000000003, 0.539567, 0.322884, 0.32288400000000006, 0.031257, 0.023157999999999998, 0.045912, + 0.233088, 0.170729, 0, 0.17203399999999996, 0.636137], + [2.4048339999999997, 0.16935199999999997, 0.153629, 1.161017, 2.427526, 2.205186, 0.5161060000000001, + 0.5161060000000001, 0.524308, 0.5754410000000001, 0.35875799999999997, 0.358758, 0.192425, 0.184326, + 0.20708, 0.268962, 0.057269, 0.17203399999999996, 0, 0.672011], + [2.6487609999999995, 0.633455, 0.617732, 1.404944, 2.6714529999999996, 2.449113, 0.7600329999999998, + 0.7600329999999998, 0.7682349999999999, 0.8193679999999999, 0.531709, 0.531709, 0.656528, 0.648429, + 0.6711830000000001, 0.661723, 0.670706, 0.636137, 0.672011, 0] + ] + + actualleaves = ['Aga0007658', 'Bta0018700', 'Cfa0016700', 'Cin0011239', 'Ddi0002240', 'Dme0014628', + 'Dre0008390', 'Dre0008391', 'Dre0008392', 'Fru0004507', 'Gga0000981', 'Gga0000982', + 'Hsa0000001', 'Hsa0010711', 'Hsa0010730', 'Mdo0014718', 'Mms0024821', 'Ptr0000001', + 'Rno0030248', 'Xtr0044988'] + + for i in range(len(actualdists)): + for j in range(len(actualdists[i])): + self.assertAlmostEqual(actualdists[i][j], dists[i][j], places=4) + self.assertEqual(actualleaves, leaves) + # def test_traversing_speed(self): # return