adding time slicing to ts.samples()

mufernando · mufernando · commit cf8829e12a53 · 2021-09-17T10:52:50.000-05:00
diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst
@@ -22,6 +22,9 @@
 - Add `__setitem__` to all tables allowing single rows to be updated. For example
   `tables.nodes[0] = tables.nodes[0].replace(flags=tskit.NODE_IS_SAMPLE)`
   (:user:`jeromekelleher`, :user:`benjeffery`, :issue:`1545`, :pr:`1600`).
+- Add a new parameter ``time`` to ``TreeSequence.samples()`` allowing samples
+  to be selected at a specific time point or within a time interval
+  (:user:`mufernando`, :user:`petrelharp`, :issue:`1692`, :pr:`1700`).
 
 --------------------
 [0.3.7] - 2021-07-08
diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py
@@ -346,6 +346,23 @@ def get_mrca(pi, x, y):
     return mrca
 
 
+def get_samples(ts, time=None, population=None):
+    # Naive reference for ts.samples(): scalar time -> isclose, pair -> [min, max).
+    samples = []
+    for node in ts.nodes():
+        keep = bool(node.is_sample())
+        # np.ndim rather than isinstance: numpy scalars/arrays must filter too
+        if time is not None and np.ndim(time) == 0:
+            keep &= bool(np.isclose(node.time, time))
+        elif time is not None:
+            keep &= bool(time[0] <= node.time < time[1])
+        if population is not None:
+            keep &= node.population == population
+        if keep:
+            samples.append(node.id)
+    return np.array(samples)
+
+
 class TestMRCACalculator:
     """
     Class to test the Schieber-Vishkin algorithm.
@@ -509,11 +526,14 @@ class TestNumpySamples:
     various methods.
     """
 
-    def get_tree_sequence(self, num_demes=4):
-        n = 40
+    def get_tree_sequence(self, num_demes=4, times=None, n=40):
+        if times is None:
+            times = [0]
         return msprime.simulate(
             samples=[
-                msprime.Sample(time=0, population=j % num_demes) for j in range(n)
+                msprime.Sample(time=t, population=j % num_demes)
+                for j in range(n)
+                for t in times
             ],
             population_configurations=[
                 msprime.PopulationConfiguration() for _ in range(num_demes)
@@ -541,6 +561,149 @@ def test_samples(self):
             ]
         assert total == ts.num_samples
 
+    def test_samples_time(self):
+        # Exact-time selection must agree with the reference get_samples helper.
+        sample_times = [0, 0.1, 1 / 3, 1 / 4, 5 / 7]
+        ts = self.get_tree_sequence(num_demes=2, n=20, times=sample_times)
+        for t in sample_times:
+            assert np.array_equal(get_samples(ts, time=t), ts.samples(time=t))
+            for pop in (None, 0):
+                expected = get_samples(ts, time=t, population=pop)
+                observed = ts.samples(time=t, population=pop)
+                assert np.array_equal(expected, observed)
+
+    def test_samples_time_interval(self):
+        # Interval selection is half-open, [min, max); compare with the helper.
+        # NOTE(review): every time is drawn inside the interval, so none is excluded.
+        rng = np.random.default_rng(seed=931)
+        intervals = (
+            [0, 0.1],
+            (0, 1 / 3),
+            np.array([1 / 4, 2 / 3]),
+            (0.345, 5 / 7),
+            (-1, 1),
+        )
+        for interval in intervals:
+            times = rng.uniform(low=interval[0], high=interval[1], size=20)
+            ts = self.get_tree_sequence(num_demes=2, n=1, times=times)
+            assert np.array_equal(
+                get_samples(ts, time=interval), ts.samples(time=interval)
+            )
+            for pop in (None, 0):
+                expected = get_samples(ts, time=interval, population=pop)
+                observed = ts.samples(time=interval, population=pop)
+                assert np.array_equal(expected, observed)
+
+    def test_samples_example(self):
+        tables = tskit.TableCollection(sequence_length=10)
+        time = [np.array(0), 0, np.array([1]), 1, 1, 3, 3.00001, 3.0 - 0.0001, 1 / 3]
+        pops = [1, 3, 1, 2, 1, 1, 1, 3, 1]
+        for _ in range(max(pops) + 1):
+            tables.populations.add_row()
+        for t, p in zip(time, pops):
+            tables.nodes.add_row(
+                flags=tskit.NODE_IS_SAMPLE,
+                time=t,
+                population=p,
+            )
+        # also add non-sample nodes (flags=0); these must never be returned
+        for t, p in zip(time, pops):
+            tables.nodes.add_row(
+                flags=0,
+                time=t,
+                population=p,
+            )
+        ts = tables.tree_sequence()
+        assert np.array_equal(
+            ts.samples(),
+            np.arange(len(time)),
+        )
+        assert np.array_equal(
+            ts.samples(time=[0, np.inf]),
+            np.arange(len(time)),
+        )
+        assert np.array_equal(
+            ts.samples(time=0),
+            [0, 1],
+        )
+        # np.isclose defaults (rtol=1e-5, atol=1e-8): 0.3333333 matches 1/3
+        assert np.array_equal(
+            ts.samples(time=0.3333333),
+            [8],
+        )
+        assert np.array_equal(
+            ts.samples(time=3),
+            [5, 6],
+        )
+        assert np.array_equal(
+            ts.samples(time=1),
+            [2, 3, 4],
+        )
+        assert np.array_equal(
+            ts.samples(time=1, population=2),
+            [3],
+        )
+        assert np.array_equal(
+            ts.samples(population=0),
+            [],
+        )
+        assert np.array_equal(
+            ts.samples(population=1),
+            [0, 2, 4, 5, 6, 8],
+        )
+        assert np.array_equal(
+            ts.samples(population=2),
+            [3],
+        )
+        assert np.array_equal(
+            ts.samples(time=[0, 3]),
+            [0, 1, 2, 3, 4, 7, 8],
+        )
+        # note: the interval is given as a tuple here instead of a list
+        assert np.array_equal(
+            ts.samples(time=(1, 3)),
+            [2, 3, 4, 7],
+        )
+        assert np.array_equal(
+            ts.samples(time=[0, 3], population=1),
+            [0, 2, 4, 8],
+        )
+        assert np.array_equal(
+            ts.samples(time=[0.333333, 3]),
+            [2, 3, 4, 7, 8],
+        )
+        assert np.array_equal(
+            ts.samples(time=[100, np.inf]),
+            [],
+        )
+        assert np.array_equal(
+            ts.samples(time=-1),
+            [],
+        )
+        assert np.array_equal(
+            ts.samples(time=[-100, 100]),
+            np.arange(len(time)),
+        )
+        assert np.array_equal(
+            ts.samples(time=[-100, -1]),
+            [],
+        )
+
+    def test_samples_time_errors(self):
+        # Every malformed ``time`` argument below must be rejected with a
+        # ValueError; the cases are tried in order, one pytest.raises each.
+        ts = self.get_tree_sequence(4)
+        bad_times = [
+            "s",  # incorrect type
+            [],  # empty
+            np.array([1, 2, 3]),  # more than two values
+            (1, 2, 3),  # more than two values
+            (2.4, 1),  # min and max switched
+        ]
+        for bad_time in bad_times:
+            with pytest.raises(ValueError):
+                ts.samples(time=bad_time)
+
     def test_genotype_matrix_indexing(self):
         num_demes = 4
         ts = self.get_tree_sequence(num_demes)
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
@@ -4800,15 +4800,21 @@ def get_samples(self, population_id=None):
         # Deprecated alias for samples()
         return self.samples(population_id)
 
-    def samples(self, population=None, population_id=None):
-        """
-        Returns an array of the sample node IDs in this tree sequence. If the
-        ``population`` parameter is specified, only return sample IDs from that
-        population.
-
-        :param int population: The population of interest. If None,
-            return all samples.
+    def samples(self, population=None, population_id=None, time=None):
+        """
+        Returns an array of the sample node IDs in this tree sequence. If
+        `population` is specified, only return sample IDs from that population.
+        It is also possible to restrict samples by time using the parameter
+        `time`. If `time` is a numeric value, only return sample IDs whose node
+        time is approximately equal to the specified time. If `time` is a pair
+        of values of the form `(min_time, max_time)`, only return sample IDs
+        whose node time `t` is in this interval such that `min_time <= t < max_time`.
+
+        :param int population: The population of interest. If None, do not
+            filter samples by population.
         :param int population_id: Deprecated alias for ``population``.
+        :param float,tuple time: The time or time interval of interest. If
+            None, do not filter samples by time.
         :return: A numpy array of the node IDs for the samples of interest,
             listed in numerical order.
         :rtype: numpy.ndarray (dtype=np.int32)
@@ -4820,10 +4826,27 @@ def samples(self, population=None, population_id=None):
         if population_id is not None:
             population = population_id
         samples = self._ll_tree_sequence.get_samples()
+        keep = np.full(shape=samples.shape, fill_value=True)
         if population is not None:
             sample_population = self.tables.nodes.population[samples]
-            samples = samples[sample_population == population]
-        return samples
+            keep = np.logical_and(keep, sample_population == population)
+        if time is not None:
+            # ndmin is set so that scalars are converted into 1d arrays
+            time = np.array(time, ndmin=1, dtype=float)
+            sample_times = self.tables.nodes.time[samples]
+            if time.shape == (1,):
+                keep = np.logical_and(keep, np.isclose(sample_times, time))
+            elif time.shape == (2,):
+                if time[1] <= time[0]:
+                    raise ValueError("time_interval max is less than or equal to min.")
+                keep = np.logical_and(keep, sample_times >= time[0])
+                keep = np.logical_and(keep, sample_times < time[1])
+            else:
+                raise ValueError(
+                    "time must be either a single value or a pair of values "
+                    "(min_time, max_time)."
+                )
+        return samples[keep]
 
     def write_fasta(self, output, sequence_ids=None, wrap_width=60):
         ""