Skip to content

Commit 737fe42

Browse files
Remove support for missing data from alignments
Closes #1897
1 parent 032e59c commit 737fe42

File tree

4 files changed

+274 -158 lines changed

4 files changed

+274 -158 lines changed

python/tests/test_genotypes.py

+39 -19
Original file line number | Diff line number | Diff line change
@@ -1169,7 +1169,7 @@ def test_nexus_reference_sequence(self):
11691169
END;
11701170
BEGIN DATA;
11711171
DIMENSIONS NCHAR=10;
1172-
FORMAT DATATYPE=DNA MISSING=?;
1172+
FORMAT DATATYPE=DNA;
11731173
MATRIX
11741174
n0 01G345678T
11751175
n1 01A345678C
@@ -1232,6 +1232,13 @@ def test_alignments_default(self):
12321232
assert A[2] == "NNANNNNNNC"
12331233
assert A[3] == "NNNNNNNNNN"
12341234

1235+
def test_alignments_fails(self):
1236+
# https://github.com/tskit-dev/tskit/issues/1896
1237+
ref = "N" * 10
1238+
with pytest.raises(ValueError, match="1896"):
1239+
next(self.ts().alignments(reference_sequence=ref))
1240+
1241+
@pytest.mark.skip("Missing data in alignments: #1896")
12351242
def test_alignments_impute_missing(self):
12361243
ref = "N" * 10
12371244
A = list(
@@ -1256,16 +1263,18 @@ def test_alignments_missing_char_ref(self):
12561263
assert A[0] == "NNGNNNNNNT"
12571264
assert A[1] == "NNANNNNNNC"
12581265
assert A[2] == "NNANNNNNNC"
1259-
assert A[3] == "NNzNNNNNNz"
1266+
assert A[3] == "zzzzzzzzzz"
12601267

1268+
@pytest.mark.skip("Missing data in alignments: #1896")
12611269
def test_alignments_reference_sequence(self):
12621270
ref = "0123456789"
12631271
A = list(self.ts().alignments(reference_sequence=ref))
12641272
assert A[0] == "01G345678T"
12651273
assert A[1] == "01A345678C"
12661274
assert A[2] == "01A345678C"
1267-
assert A[3] == "01N345678N"
1275+
assert A[3] == "NNNNNNNNNN"
12681276

1277+
@pytest.mark.skip("Missing data in alignments: #1896")
12691278
def test_alignments_reference_sequence_missing_data_char(self):
12701279
ref = "0123456789"
12711280
A = list(
@@ -1274,8 +1283,9 @@ def test_alignments_reference_sequence_missing_data_char(self):
12741283
assert A[0] == "01G345678T"
12751284
assert A[1] == "01A345678C"
12761285
assert A[2] == "01A345678C"
1277-
assert A[3] == "01Q345678Q"
1286+
assert A[3] == "QQQQQQQQQQ"
12781287

1288+
@pytest.mark.skip("Missing data in alignments: #1896")
12791289
def test_fasta_reference_sequence(self):
12801290
ref = "0123456789"
12811291
expected = textwrap.dedent(
@@ -1287,11 +1297,12 @@ def test_fasta_reference_sequence(self):
12871297
>n2
12881298
01A345678C
12891299
>n5
1290-
01N345678N
1300+
NNNNNNNNNN
12911301
"""
12921302
)
12931303
assert expected == self.ts().as_fasta(reference_sequence=ref)
12941304

1305+
@pytest.mark.skip("Missing data in alignments: #1896")
12951306
def test_fasta_reference_sequence_missing_data_char(self):
12961307
ref = "0123456789"
12971308
expected = textwrap.dedent(
@@ -1303,13 +1314,14 @@ def test_fasta_reference_sequence_missing_data_char(self):
13031314
>n2
13041315
01A345678C
13051316
>n5
1306-
01Q345678Q
1317+
QQQQQQQQQQ
13071318
"""
13081319
)
13091320
assert expected == self.ts().as_fasta(
13101321
reference_sequence=ref, missing_data_character="Q"
13111322
)
13121323

1324+
@pytest.mark.skip("Missing data in alignments: #1896")
13131325
def test_fasta_impute_missing(self):
13141326
ref = "N" * 10
13151327
expected = textwrap.dedent(
@@ -1331,6 +1343,7 @@ def test_fasta_impute_missing(self):
13311343
# Note: the nexus tree output isn't compatible with our representation of
13321344
# missing data as trees with isolated roots (newick parsers won't accept
13331345
# this as valid input), so we set include_trees=False for these examples.
1346+
@pytest.mark.skip("Missing data in alignments: #1896")
13341347
def test_nexus_reference_sequence(self):
13351348
ref = "0123456789"
13361349
expected = textwrap.dedent(
@@ -1347,7 +1360,7 @@ def test_nexus_reference_sequence(self):
13471360
n0 01G345678T
13481361
n1 01A345678C
13491362
n2 01A345678C
1350-
n5 01?345678?
1363+
n5 ??????????
13511364
;
13521365
END;
13531366
"""
@@ -1356,6 +1369,7 @@ def test_nexus_reference_sequence(self):
13561369
reference_sequence=ref, include_trees=False
13571370
)
13581371

1372+
@pytest.mark.skip("Missing data in alignments: #1896")
13591373
def test_nexus_reference_sequence_missing_data_char(self):
13601374
ref = "0123456789"
13611375
expected = textwrap.dedent(
@@ -1372,7 +1386,7 @@ def test_nexus_reference_sequence_missing_data_char(self):
13721386
n0 01G345678T
13731387
n1 01A345678C
13741388
n2 01A345678C
1375-
n5 01Q345678Q
1389+
n5 QQQQQQQQQQ
13761390
;
13771391
END;
13781392
"""
@@ -1383,6 +1397,7 @@ def test_nexus_reference_sequence_missing_data_char(self):
13831397
include_trees=False,
13841398
)
13851399

1400+
@pytest.mark.skip("Missing data in alignments: #1896")
13861401
def test_nexus_impute_missing(self):
13871402
ref = "0123456789"
13881403
expected = textwrap.dedent(
@@ -1441,6 +1456,7 @@ def test_non_ascii_references(self, ref):
14411456
with pytest.raises(UnicodeEncodeError):
14421457
list(ts.alignments(reference_sequence=ref))
14431458

1459+
@pytest.mark.skip("Missing data in alignments: #1896")
14441460
@pytest.mark.parametrize("missing_data_char", ["À", "┃", "α"])
14451461
def test_non_ascii_missing_data_char(self, missing_data_char):
14461462
ts = self.simplest_ts()
@@ -1470,14 +1486,18 @@ def test_defaults(self, ts):
14701486
@pytest.mark.parametrize("ts", get_example_discrete_genome_tree_sequences())
14711487
def test_reference_sequence(self, ts):
14721488
ref = tskit.random_nucleotides(ts.sequence_length, seed=1234)
1473-
A = list(ts.alignments(reference_sequence=ref))
1474-
assert len(A) == ts.num_samples
1475-
H = list(ts.haplotypes())
1476-
pos = ts.tables.sites.position.astype(int)
1477-
for a, h in map(np.array, zip(A, H)):
1478-
last = 0
1479-
for j, x in enumerate(pos):
1480-
assert a[last:x] == ref[last:x]
1481-
assert a[x] == h[j]
1482-
last = x + 1
1483-
assert a[last:] == ref[last:]
1489+
if any(tree.num_roots > 1 for tree in ts.trees()):
1490+
with pytest.raises(ValueError, match="1896"):
1491+
list(ts.alignments(reference_sequence=ref))
1492+
else:
1493+
A = list(ts.alignments(reference_sequence=ref))
1494+
assert len(A) == ts.num_samples
1495+
H = list(ts.haplotypes())
1496+
pos = ts.tables.sites.position.astype(int)
1497+
for a, h in map(np.array, zip(A, H)):
1498+
last = 0
1499+
for j, x in enumerate(pos):
1500+
assert a[last:x] == ref[last:x]
1501+
assert a[x] == h[j]
1502+
last = x + 1
1503+
assert a[last:] == ref[last:]

0 commit comments

Comments
 (0)