Skip to content

Commit 8f1bc7e

Browse files
jeromekelleher authored and benjeffery committed
Add mmap_temp_dir option to generate_ancestors
Closes tskit-dev#806
1 parent 2b6e898 commit 8f1bc7e

File tree

11 files changed

+300
-50
lines changed

11 files changed

+300
-50
lines changed

.circleci/config.yml

+13-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
version: 2
1+
version: 2.1
2+
3+
orbs:
4+
codecov: codecov/codecov@3.2.4
5+
26
jobs:
37
build:
48
docker:
@@ -44,8 +48,11 @@ jobs:
4448
name: Run Python tests and upload coverage
4549
command: |
4650
python3 -m pytest --cov=tsinfer --cov-report=xml --cov-branch -xvs tests
47-
python3 -m codecov -X gcov -F python
4851
rm .coverage
52+
53+
- codecov/upload:
54+
flags: python
55+
token: CODECOV_TOKEN
4956

5057
- run:
5158
name: Compile C with gcc
@@ -77,7 +84,10 @@ jobs:
7784
gcov -pb ./libtsinfer.a.p/object_heap.c.gcno ../lib/object_heap.c
7885
gcov -pb ./libtsinfer.a.p/err.c.gcno ../lib/err.c
7986
cd ..
80-
codecov -X gcov -F C
87+
88+
- codecov/upload:
89+
flags: C
90+
token: CODECOV_TOKEN
8191

8292
- run:
8393
name: Valgrind for C tests.

CHANGELOG.md

+12-8
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,17 @@ In development
1111

1212
**Performance improvements**
1313

14-
- Reduce memory usage when running `match_samples` against large cohorts
15-
containing sequences with substantial amounts of error.
14+
- Reduce memory usage when running `match_samples` against large cohorts
15+
containing sequences with substantial amounts of error.
1616
({pr}`761`, {user}`jeromekelleher`)
1717

1818
- `truncate_ancestors` no longer requires loading all the ancestors into RAM.
1919
({pr}`811`, {user}`benjeffery`)
2020

21+
- Reduce memory requirements of the `generate_ancestors` function by providing
22+
the `genotype_encoding` ({pr}`809`) and `mmap_temp_dir` ({pr}`808`) options
23+
({user}`jeromekelleher`).
24+
2125
## [0.3.0] - 2022-10-25
2226

2327
**Features**
@@ -29,11 +33,11 @@ In development
2933

3034
- The CLI interface now allows recombination rate (or rate maps) and mismatch ratios
3135
to be specified ({pr}`731`, {issue}`435`, {user}`hyanwong`)
32-
36+
3337
- The calls to match-ancestors and match-samples via the CLI are now logged
3438
in the provenance entries of the output tree sequence ({pr}`732` and {pr}`741`,
3539
{issue}`730`, {user}`hyanwong`)
36-
40+
3741
- The CLI interface allows `--no-post-process` to be specified (for details of post-
3842
processing, see "Breaking changes" below) ({pr}`727`, {issue}`721`, {user}`hyanwong`)
3943

@@ -45,7 +49,7 @@ In development
4549
- `sample_data.subset()` now accepts a sequence_length ({pr}`681`, {user}`hyanwong`)
4650

4751
- `verify` no longer raises an error when comparing a genotype to missingness.
48-
({pr}`716`, {issue}`625`, {user}`benjeffery`)
52+
({pr}`716`, {issue}`625`, {user}`benjeffery`)
4953

5054
**Breaking changes**:
5155

@@ -54,8 +58,8 @@ In development
5458
tsinfer to aid the matching process) then splits the ultimate ancestor into separate
5559
pieces. If splitting is not required, the `post_process` step can also be called as a
5660
separate function with the parameter `split_ultimate=False` ({pr}`687`, {pr}`750`,
57-
{issue}`673`, {user}`hyanwong`)
58-
61+
{issue}`673`, {user}`hyanwong`)
62+
5963
- Post-processing by default erases tree topology that exists before the first site
6064
and one unit after the last site, to avoid extrapolating into regions with no data.
6165
This can be disabled by calling `post_process` step as a separate function with the
@@ -94,7 +98,7 @@ In development
9498
- Oldest nodes in a standard inferred tree sequence are no longer set to frequencies ~2
9599
and ~3 (i.e. 2 or 3 times as old as all the other nodes), but are spaced above the
96100
others by the mean time between unique ancestor ages ({pr}`485`, {user}`hyanwong`)
97-
101+
98102
- The `tsinfer.SampleData.from_tree_sequence()` function now defaults to setting
99103
`use_sites_time` and `use_individuals_time` to `False` rather than `True`
100104
({pr}`599`, {user}`hyanwong`)

_tsinfermodule.c

+5-4
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,14 @@ AncestorBuilder_init(AncestorBuilder *self, PyObject *args, PyObject *kwds)
9292
{
9393
int ret = -1;
9494
int err;
95-
static char *kwlist[] = {"num_samples", "max_sites", "genotype_encoding", NULL};
95+
static char *kwlist[] = {"num_samples", "max_sites", "genotype_encoding", "mmap_fd", NULL};
9696
int num_samples, max_sites, genotype_encoding;
9797
int flags = 0;
98+
int mmap_fd = -1;
9899

99100
self->builder = NULL;
100-
if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|i", kwlist,
101-
&num_samples, &max_sites, &genotype_encoding)) {
101+
if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|ii", kwlist,
102+
&num_samples, &max_sites, &genotype_encoding, &mmap_fd)) {
102103
goto out;
103104
}
104105
self->builder = PyMem_Malloc(sizeof(ancestor_builder_t));
@@ -108,7 +109,7 @@ AncestorBuilder_init(AncestorBuilder *self, PyObject *args, PyObject *kwds)
108109
}
109110
flags = genotype_encoding;
110111
Py_BEGIN_ALLOW_THREADS
111-
err = ancestor_builder_alloc(self->builder, num_samples, max_sites, flags);
112+
err = ancestor_builder_alloc(self->builder, num_samples, max_sites, mmap_fd, flags);
112113
Py_END_ALLOW_THREADS
113114
if (err != 0) {
114115
handle_library_error(err);

lib/ancestor_builder.c

+94-9
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,15 @@
1616
** You should have received a copy of the GNU General Public License
1717
** along with tsinfer. If not, see <http://www.gnu.org/licenses/>.
1818
*/
19+
/* It's not worth trying to get mmap'd genotypes working on Windows,
20+
* so requesting it there is just a silent no-op.
21+
*/
22+
#if defined(_WIN32)
23+
#else
24+
/* Needed for ftruncate */
25+
#define _XOPEN_SOURCE 700
26+
#define MMAP_GENOTYPES 1
27+
#endif
1928

2029
#include "tsinfer.h"
2130
#include "err.h"
@@ -25,6 +34,17 @@
2534
#include <string.h>
2635
#include <stdbool.h>
2736

37+
#include <errno.h>
38+
#include <fcntl.h>
39+
#include <stdlib.h>
40+
#include <string.h>
41+
42+
#ifdef MMAP_GENOTYPES
43+
#include <sys/mman.h>
44+
#include <unistd.h>
45+
#include <sys/types.h>
46+
#endif
47+
2848
#include "avl.h"
2949

3050
/* Note: using an unoptimised version of bit packing here because we're
@@ -135,6 +155,7 @@ ancestor_builder_print_state(ancestor_builder_t *self, FILE *out)
135155

136156
fprintf(out, "Ancestor builder\n");
137157
fprintf(out, "flags = %d\n", (int) self->flags);
158+
fprintf(out, "mmap_fd = %d\n", self->mmap_fd);
138159
fprintf(out, "num_samples = %d\n", (int) self->num_samples);
139160
fprintf(out, "num_sites = %d\n", (int) self->num_sites);
140161
fprintf(out, "num_ancestors = %d\n", (int) self->num_ancestors);
@@ -181,23 +202,62 @@ ancestor_builder_print_state(ancestor_builder_t *self, FILE *out)
181202
return 0;
182203
}
183204

184-
int
185-
ancestor_builder_alloc(
186-
ancestor_builder_t *self, size_t num_samples, size_t max_sites, int flags)
205+
#ifdef MMAP_GENOTYPES
206+
207+
static int
208+
ancestor_builder_make_genotype_mmap(ancestor_builder_t *self)
187209
{
210+
188211
int ret = 0;
189-
unsigned long max_size = 1024 * 1024;
190212

191-
memset(self, 0, sizeof(ancestor_builder_t));
192-
if (num_samples <= 1) {
193-
ret = TSI_ERR_BAD_NUM_SAMPLES;
213+
self->mmap_size = self->max_sites * self->encoded_genotypes_size;
214+
if (ftruncate(self->mmap_fd, (off_t) self->mmap_size) != 0) {
215+
ret = TSI_ERR_IO;
194216
goto out;
195217
}
218+
self->mmap_buffer = mmap(
219+
NULL, self->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, self->mmap_fd, 0);
220+
if (self->mmap_buffer == MAP_FAILED) {
221+
self->mmap_buffer = NULL;
222+
ret = TSI_ERR_IO;
223+
goto out;
224+
}
225+
self->mmap_offset = 0;
226+
out:
227+
return ret;
228+
}
196229

230+
static int
231+
ancestor_builder_free_genotype_mmap(ancestor_builder_t *self)
232+
{
233+
if (self->mmap_buffer != NULL) {
234+
/* There's nothing we can do about it here, so don't check errors. */
235+
munmap(self->mmap_buffer, self->mmap_size);
236+
}
237+
/* Try to truncate to zero so we don't flush out all the data */
238+
ftruncate(self->mmap_fd, 0);
239+
return 0;
240+
}
241+
#endif
242+
243+
int
244+
ancestor_builder_alloc(ancestor_builder_t *self, size_t num_samples, size_t max_sites,
245+
int mmap_fd, int flags)
246+
{
247+
int ret = 0;
248+
unsigned long max_size = 1024 * 1024;
249+
250+
memset(self, 0, sizeof(ancestor_builder_t));
197251
self->num_samples = num_samples;
198252
self->max_sites = max_sites;
253+
self->mmap_fd = mmap_fd;
199254
self->num_sites = 0;
200255
self->flags = flags;
256+
257+
if (num_samples <= 1) {
258+
ret = TSI_ERR_BAD_NUM_SAMPLES;
259+
goto out;
260+
}
201261
if (self->flags & TSI_GENOTYPE_ENCODING_ONE_BIT) {
202262
self->encoded_genotypes_size = (num_samples / 8) + ((num_samples % 8) != 0);
203263
self->decoded_genotypes_size = self->encoded_genotypes_size * 8;
@@ -228,6 +288,14 @@ ancestor_builder_alloc(
228288
if (ret != 0) {
229289
goto out;
230290
}
291+
#if MMAP_GENOTYPES
292+
if (self->mmap_fd != -1) {
293+
ret = ancestor_builder_make_genotype_mmap(self);
294+
if (ret != 0) {
295+
goto out;
296+
}
297+
}
298+
#endif
231299
avl_init_tree(&self->time_map, cmp_time_map, NULL);
232300
out:
233301
return ret;
@@ -236,13 +304,19 @@ ancestor_builder_alloc(
236304
size_t
237305
ancestor_builder_get_memsize(const ancestor_builder_t *self)
238306
{
239-
/* Ignore the other allocs as insignificant */
307+
/* Ignore the other allocs as insignificant, and don't report the
308+
* size of the mmap'd region */
240309
return self->main_allocator.total_size + self->indexing_allocator.total_size;
241310
}
242311

243312
int
244313
ancestor_builder_free(ancestor_builder_t *self)
245314
{
315+
#if MMAP_GENOTYPES
316+
if (self->mmap_fd != -1) {
317+
ancestor_builder_free_genotype_mmap(self);
318+
}
319+
#endif
246320
tsi_safe_free(self->sites);
247321
tsi_safe_free(self->descriptors);
248322
tsk_safe_free(self->genotype_encode_buffer);
@@ -558,7 +632,18 @@ ancestor_builder_encode_genotypes(
558632
static uint8_t *
559633
ancestor_builder_allocate_genotypes(ancestor_builder_t *self)
560634
{
561-
return tsk_blkalloc_get(&self->main_allocator, self->encoded_genotypes_size);
635+
uint8_t *ret = NULL;
636+
void *p;
637+
638+
if (self->mmap_buffer == NULL) {
639+
ret = tsk_blkalloc_get(&self->main_allocator, self->encoded_genotypes_size);
640+
} else {
641+
p = (char *) self->mmap_buffer + self->mmap_offset;
642+
self->mmap_offset += self->encoded_genotypes_size;
643+
assert(self->mmap_offset <= self->mmap_size);
644+
ret = (uint8_t *) p;
645+
}
646+
return ret;
562647
}
563648

564649
int WARN_UNUSED

lib/err.c

+4
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919

2020
#include "err.h"
21+
#include <tskit.h>
2122

2223
const char *
2324
tsi_strerror(int err)
@@ -105,6 +106,9 @@ tsi_strerror(int err)
105106
case TSI_ERR_ONE_BIT_NON_BINARY:
106107
ret = "One-bit genotype encoding only supports binary 0/1 data";
107108
break;
109+
case TSI_ERR_IO:
110+
ret = tsk_strerror(TSK_ERR_IO);
111+
break;
108112
}
109113
return ret;
110114
}

lib/err.h

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#define TSI_ERR_MATCH_IMPOSSIBLE_EXTREME_MUTATION_PROBA -22
2727
#define TSI_ERR_MATCH_IMPOSSIBLE_ZERO_RECOMB_PRECISION -23
2828
#define TSI_ERR_ONE_BIT_NON_BINARY -24
29+
#define TSI_ERR_IO -25
2930
// clang-format on
3031

3132
#ifdef __GNUC__

0 commit comments

Comments
 (0)