Skip to content

Commit 2e7428b

Browse files
jeromekellehermergify-bot
authored and
mergify-bot
committed
Update code to use the virtual root and traversals
Closes #1787 Note we have left out updating the traversals in vargen for now because it is undergoing a large refactoring anyway.
1 parent e622480 commit 2e7428b

File tree

3 files changed

+86
-129
lines changed

3 files changed

+86
-129
lines changed

c/tskit/convert.c

+1-2
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,7 @@ tsk_newick_converter_init(tsk_newick_converter_t *self, const tsk_tree_t *tree,
154154
self->options = options;
155155
self->tree = tree;
156156
self->traversal_stack
157-
= tsk_malloc(tsk_treeseq_get_num_nodes(self->tree->tree_sequence)
158-
* sizeof(*self->traversal_stack));
157+
= tsk_malloc(tsk_tree_get_size_bound(tree) * sizeof(*self->traversal_stack));
159158
if (self->traversal_stack == NULL) {
160159
ret = TSK_ERR_NO_MEMORY;
161160
goto out;

c/tskit/trees.c

+57-90
Original file line numberDiff line numberDiff line change
@@ -3509,34 +3509,29 @@ tsk_tree_get_num_samples_by_traversal(
35093509
const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_samples)
35103510
{
35113511
int ret = 0;
3512-
tsk_id_t *stack = NULL;
3513-
tsk_id_t v;
3512+
tsk_size_t num_nodes, j;
35143513
tsk_size_t count = 0;
3515-
int stack_top = 0;
3514+
const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags;
3515+
tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes));
3516+
tsk_id_t v;
35163517

3517-
stack = tsk_malloc(self->num_nodes * sizeof(*stack));
3518-
if (stack == NULL) {
3518+
if (nodes == NULL) {
35193519
ret = TSK_ERR_NO_MEMORY;
35203520
goto out;
35213521
}
3522-
3523-
stack[0] = u;
3524-
while (stack_top >= 0) {
3525-
v = stack[stack_top];
3526-
stack_top--;
3527-
if (tsk_treeseq_is_sample(self->tree_sequence, v)) {
3522+
ret = tsk_tree_preorder(self, u, nodes, &num_nodes);
3523+
if (ret != 0) {
3524+
goto out;
3525+
}
3526+
for (j = 0; j < num_nodes; j++) {
3527+
v = nodes[j];
3528+
if (flags[v] & TSK_NODE_IS_SAMPLE) {
35283529
count++;
35293530
}
3530-
v = self->left_child[v];
3531-
while (v != TSK_NULL) {
3532-
stack_top++;
3533-
stack[stack_top] = v;
3534-
v = self->right_sib[v];
3535-
}
35363531
}
35373532
*num_samples = count;
35383533
out:
3539-
tsk_safe_free(stack);
3534+
tsk_safe_free(nodes);
35403535
return ret;
35413536
}
35423537

@@ -3649,14 +3644,14 @@ tsk_tree_get_sites(
36493644
static int
36503645
tsk_tree_get_depth_unsafe(const tsk_tree_t *self, tsk_id_t u)
36513646
{
3652-
36533647
tsk_id_t v;
3648+
const tsk_id_t *restrict parent = self->parent;
36543649
int depth = 0;
36553650

36563651
if (u == self->virtual_root) {
36573652
return -1;
36583653
}
3659-
for (v = self->parent[u]; v != TSK_NULL; v = self->parent[v]) {
3654+
for (v = parent[u]; v != TSK_NULL; v = parent[v]) {
36603655
depth++;
36613656
}
36623657
return depth;
@@ -4443,6 +4438,9 @@ get_smallest_set_bit(uint64_t v)
44434438
* use a general cost matrix, in which case we'll use the Sankoff algorithm. For
44444439
* now this is unused.
44454440
*
4441+
* We should also vectorise the function so that several sites can be processed
4442+
* at once.
4443+
*
44464444
* The algorithm used here is Hartigan parsimony, "Minimum Mutation Fits to a
44474445
* Given Tree", Biometrics 1973.
44484446
*/
@@ -4458,30 +4456,34 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes,
44584456
int8_t state;
44594457
};
44604458
const tsk_size_t num_samples = self->tree_sequence->num_samples;
4461-
const tsk_size_t num_nodes = self->num_nodes;
44624459
const tsk_id_t *restrict left_child = self->left_child;
44634460
const tsk_id_t *restrict right_sib = self->right_sib;
4464-
const tsk_id_t *restrict parent = self->parent;
4461+
const tsk_size_t N = tsk_treeseq_get_num_nodes(self->tree_sequence);
44654462
const tsk_flags_t *restrict node_flags = self->tree_sequence->tables->nodes.flags;
4466-
uint64_t optimal_root_set;
4467-
uint64_t *restrict optimal_set = tsk_calloc(num_nodes, sizeof(*optimal_set));
4468-
tsk_id_t *restrict postorder_stack
4469-
= tsk_malloc(num_nodes * sizeof(*postorder_stack));
4463+
tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes));
4464+
/* Note: to use less memory here and to improve cache performance we should
4465+
* probably change to allocating exactly the number of nodes returned by
4466+
* a preorder traversal, and then lay the memory out in this order. So, we'd
4467+
* need a map from node ID to its index in the preorder traversal, but this
4468+
* is trivial to compute. Probably doesn't matter so much at the moment
4469+
* when we're doing a single site, but it would make a big difference if
4470+
* we were vectorising over lots of sites. */
4471+
uint64_t *restrict optimal_set = tsk_calloc(N + 1, sizeof(*optimal_set));
44704472
struct stack_elem *restrict preorder_stack
4471-
= tsk_malloc(num_nodes * sizeof(*preorder_stack));
4472-
tsk_id_t postorder_parent, root, u, v;
4473+
= tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*preorder_stack));
4474+
tsk_id_t root, u, v;
44734475
/* The largest possible number of transitions is one over every sample */
44744476
tsk_state_transition_t *transitions = tsk_malloc(num_samples * sizeof(*transitions));
44754477
int8_t allele, ancestral_state;
44764478
int stack_top;
44774479
struct stack_elem s;
4478-
tsk_size_t j, num_transitions, max_allele_count;
4480+
tsk_size_t j, num_transitions, max_allele_count, num_nodes;
44794481
tsk_size_t allele_count[HARTIGAN_MAX_ALLELES];
44804482
tsk_size_t non_missing = 0;
44814483
int8_t num_alleles = 0;
44824484

4483-
if (optimal_set == NULL || preorder_stack == NULL || postorder_stack == NULL
4484-
|| transitions == NULL) {
4485+
if (optimal_set == NULL || preorder_stack == NULL || transitions == NULL
4486+
|| nodes == NULL) {
44854487
ret = TSK_ERR_NO_MEMORY;
44864488
goto out;
44874489
}
@@ -4518,68 +4520,33 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes,
45184520
}
45194521
}
45204522

4521-
for (root = self->left_root; root != TSK_NULL; root = self->right_sib[root]) {
4522-
/* Do a post order traversal */
4523-
postorder_stack[0] = root;
4524-
stack_top = 0;
4525-
postorder_parent = TSK_NULL;
4526-
while (stack_top >= 0) {
4527-
u = postorder_stack[stack_top];
4528-
if (left_child[u] != TSK_NULL && u != postorder_parent) {
4529-
for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) {
4530-
stack_top++;
4531-
postorder_stack[stack_top] = v;
4532-
}
4533-
} else {
4534-
stack_top--;
4535-
postorder_parent = parent[u];
4536-
4537-
/* Visit u */
4538-
tsk_memset(
4539-
allele_count, 0, ((size_t) num_alleles) * sizeof(*allele_count));
4540-
for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) {
4541-
for (allele = 0; allele < num_alleles; allele++) {
4542-
allele_count[allele] += bit_is_set(optimal_set[v], allele);
4543-
}
4544-
}
4545-
if (!(node_flags[u] & TSK_NODE_IS_SAMPLE)) {
4546-
max_allele_count = 0;
4547-
for (allele = 0; allele < num_alleles; allele++) {
4548-
max_allele_count
4549-
= TSK_MAX(max_allele_count, allele_count[allele]);
4550-
}
4551-
for (allele = 0; allele < num_alleles; allele++) {
4552-
if (allele_count[allele] == max_allele_count) {
4553-
optimal_set[u] = set_bit(optimal_set[u], allele);
4554-
}
4555-
}
4556-
}
4557-
}
4558-
}
4523+
ret = tsk_tree_postorder(self, self->virtual_root, nodes, &num_nodes);
4524+
if (ret != 0) {
4525+
goto out;
45594526
}
4560-
4561-
if (!(options & TSK_MM_FIXED_ANCESTRAL_STATE)) {
4562-
optimal_root_set = 0;
4563-
/* TODO it's annoying that this is essentially the same as the
4564-
* visit function above. It would be nice if we had an extra
4565-
* node that was the parent of all roots, then the algorithm
4566-
* would work as-is */
4527+
for (j = 0; j < num_nodes; j++) {
4528+
u = nodes[j];
45674529
tsk_memset(allele_count, 0, ((size_t) num_alleles) * sizeof(*allele_count));
4568-
for (root = self->left_root; root != TSK_NULL; root = right_sib[root]) {
4530+
for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) {
45694531
for (allele = 0; allele < num_alleles; allele++) {
4570-
allele_count[allele] += bit_is_set(optimal_set[root], allele);
4532+
allele_count[allele] += bit_is_set(optimal_set[v], allele);
45714533
}
45724534
}
4573-
max_allele_count = 0;
4574-
for (allele = 0; allele < num_alleles; allele++) {
4575-
max_allele_count = TSK_MAX(max_allele_count, allele_count[allele]);
4576-
}
4577-
for (allele = 0; allele < num_alleles; allele++) {
4578-
if (allele_count[allele] == max_allele_count) {
4579-
optimal_root_set = set_bit(optimal_root_set, allele);
4535+
/* the virtual root has no flags defined */
4536+
if (u == (tsk_id_t) N || !(node_flags[u] & TSK_NODE_IS_SAMPLE)) {
4537+
max_allele_count = 0;
4538+
for (allele = 0; allele < num_alleles; allele++) {
4539+
max_allele_count = TSK_MAX(max_allele_count, allele_count[allele]);
4540+
}
4541+
for (allele = 0; allele < num_alleles; allele++) {
4542+
if (allele_count[allele] == max_allele_count) {
4543+
optimal_set[u] = set_bit(optimal_set[u], allele);
4544+
}
45804545
}
45814546
}
4582-
ancestral_state = get_smallest_set_bit(optimal_root_set);
4547+
}
4548+
if (!(options & TSK_MM_FIXED_ANCESTRAL_STATE)) {
4549+
ancestral_state = get_smallest_set_bit(optimal_set[N]);
45834550
}
45844551

45854552
num_transitions = 0;
@@ -4622,8 +4589,8 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes,
46224589
if (preorder_stack != NULL) {
46234590
free(preorder_stack);
46244591
}
4625-
if (postorder_stack != NULL) {
4626-
free(postorder_stack);
4592+
if (nodes != NULL) {
4593+
free(nodes);
46274594
}
46284595
return ret;
46294596
}
@@ -4888,7 +4855,7 @@ fill_kc_vectors(const tsk_tree_t *t, kc_vectors *kc_vecs)
48884855
int ret = 0;
48894856
const tsk_treeseq_t *ts = t->tree_sequence;
48904857

4891-
stack = tsk_malloc(t->num_nodes * sizeof(*stack));
4858+
stack = tsk_malloc(tsk_tree_get_size_bound(t) * sizeof(*stack));
48924859
if (stack == NULL) {
48934860
ret = TSK_ERR_NO_MEMORY;
48944861
goto out;
@@ -5094,7 +5061,7 @@ update_kc_subtree_state(
50945061
tsk_id_t *stack = NULL;
50955062
int ret = 0;
50965063

5097-
stack = tsk_malloc(t->num_nodes * sizeof(*stack));
5064+
stack = tsk_malloc(tsk_tree_get_size_bound(t) * sizeof(*stack));
50985065
if (stack == NULL) {
50995066
ret = TSK_ERR_NO_MEMORY;
51005067
goto out;

python/tests/test_parsimony.py

+28-37
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# MIT License
22
#
3-
# Copyright (c) 2019-2020 Tskit Developers
3+
# Copyright (c) 2019-2021 Tskit Developers
44
#
55
# Permission is hereby granted, free of charge, to any person obtaining a copy
66
# of this software and associated documentation files (the "Software"), to deal
@@ -22,10 +22,10 @@
2222
"""
2323
Tests for the tree parsimony methods.
2424
"""
25+
import dataclasses
2526
import io
2627
import itertools
2728

28-
import attr
2929
import Bio.Phylo.TreeConstruction
3030
import msprime
3131
import numpy as np
@@ -219,48 +219,39 @@ def hartigan_map_mutations(tree, genotypes, alleles, ancestral_state=None):
219219
optimal_set[u] = 1
220220

221221
allele_count = np.zeros(num_alleles, dtype=int)
222-
for root in tree.roots:
223-
for u in tree.nodes(root, order="postorder"):
224-
allele_count[:] = 0
225-
for v in tree.children(u):
226-
for j in range(num_alleles):
227-
allele_count[j] += optimal_set[v, j]
228-
if not tree.is_sample(u):
229-
max_allele_count = np.max(allele_count)
230-
optimal_set[u, allele_count == max_allele_count] = 1
231-
232-
if ancestral_state is None:
222+
for u in tree.nodes(tree.virtual_root, order="postorder"):
233223
allele_count[:] = 0
234-
for v in tree.roots:
224+
for v in tree.children(u):
235225
for j in range(num_alleles):
236226
allele_count[j] += optimal_set[v, j]
237-
max_allele_count = np.max(allele_count)
238-
optimal_root_set = np.zeros(num_alleles, dtype=int)
239-
optimal_root_set[allele_count == max_allele_count] = 1
240-
ancestral_state = np.argmax(optimal_root_set)
227+
if not tree.is_sample(u):
228+
max_allele_count = np.max(allele_count)
229+
optimal_set[u, allele_count == max_allele_count] = 1
241230

242-
@attr.s
231+
if ancestral_state is None:
232+
ancestral_state = np.argmax(optimal_set[tree.virtual_root])
233+
234+
@dataclasses.dataclass
243235
class StackElement:
244-
node = attr.ib()
245-
state = attr.ib()
246-
mutation_parent = attr.ib()
236+
node: int
237+
state: int
238+
mutation_parent: int
247239

248240
mutations = []
249-
for root in tree.roots:
250-
stack = [StackElement(root, ancestral_state, -1)]
251-
while len(stack) > 0:
252-
s = stack.pop()
253-
if optimal_set[s.node, s.state] == 0:
254-
s.state = np.argmax(optimal_set[s.node])
255-
mutation = tskit.Mutation(
256-
node=s.node,
257-
derived_state=alleles[s.state],
258-
parent=s.mutation_parent,
259-
)
260-
s.mutation_parent = len(mutations)
261-
mutations.append(mutation)
262-
for v in tree.children(s.node):
263-
stack.append(StackElement(v, s.state, s.mutation_parent))
241+
stack = [StackElement(root, ancestral_state, -1) for root in reversed(tree.roots)]
242+
while len(stack) > 0:
243+
s = stack.pop()
244+
if optimal_set[s.node, s.state] == 0:
245+
s.state = np.argmax(optimal_set[s.node])
246+
mutation = tskit.Mutation(
247+
node=s.node,
248+
derived_state=alleles[s.state],
249+
parent=s.mutation_parent,
250+
)
251+
s.mutation_parent = len(mutations)
252+
mutations.append(mutation)
253+
for v in tree.children(s.node):
254+
stack.append(StackElement(v, s.state, s.mutation_parent))
264255
return alleles[ancestral_state], mutations
265256

266257

0 commit comments

Comments
 (0)