Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 2114ede

Browse files
author
Sergey Vasilyev
committed
Restore the proper meta-params of PK column type relevant to each side when slicing
Otherwise, it uses the same PK values, e.g. `ArithUUID` from the side A, and then pushes them to side B, where improper rendering can lead to improper slicing.
1 parent 725cadb commit 2114ede

File tree

5 files changed

+157
-12
lines changed

5 files changed

+157
-12
lines changed

data_diff/abcs/database_types.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,8 @@ def python_type(self) -> type:
182182
"Return the equivalent Python type of the key"
183183

184184
def make_value(self, value):
185+
if isinstance(value, self.python_type):
186+
return value
185187
return self.python_type(value)
186188

187189

@@ -237,9 +239,6 @@ def test_value(value: str) -> bool:
237239
except ValueError:
238240
return False
239241

240-
def make_value(self, value):
241-
return self.python_type(value)
242-
243242

244243
@attrs.define(frozen=True)
245244
class String_VaryingAlphanum(String_Alphanum):
@@ -251,6 +250,8 @@ class String_FixedAlphanum(String_Alphanum):
251250
length: int
252251

253252
def make_value(self, value):
253+
if isinstance(value, self.python_type):
254+
return value
254255
if len(value) != self.length:
255256
raise ValueError(f"Expected alphanumeric value of length {self.length}, but got '{value}'.")
256257
return self.python_type(value, max_len=self.length)

data_diff/diff_tables.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,8 @@ def _bisect_and_diff_tables(self, table1: TableSegment, table2: TableSegment, in
297297
# Start with the first completed value, so we don't waste time waiting
298298
min_key1, max_key1 = self._parse_key_range_result(key_types1, next(key_ranges))
299299

300-
btable1, btable2 = [t.new_key_bounds(min_key=min_key1, max_key=max_key1) for t in (table1, table2)]
300+
btable1 = table1.new_key_bounds(min_key=min_key1, max_key=max_key1, key_types=key_types1)
301+
btable2 = table2.new_key_bounds(min_key=min_key1, max_key=max_key1, key_types=key_types2)
301302

302303
logger.info(
303304
f"Diffing segments at key-range: {btable1.min_key}..{btable2.max_key}. "
@@ -321,16 +322,18 @@ def _bisect_and_diff_tables(self, table1: TableSegment, table2: TableSegment, in
321322
# └──┴──────┴──┘
322323
# Overall, the max number of new regions in this 2nd pass is 3^|k| - 1
323324

324-
min_key2, max_key2 = self._parse_key_range_result(key_types1, next(key_ranges))
325+
# Note: python types can be the same, but the rendering parameters (e.g. casing) can differ.
326+
min_key2, max_key2 = self._parse_key_range_result(key_types2, next(key_ranges))
325327

326328
points = [list(sorted(p)) for p in safezip(min_key1, min_key2, max_key1, max_key2)]
327329
box_mesh = create_mesh_from_points(*points)
328330

329331
new_regions = [(p1, p2) for p1, p2 in box_mesh if p1 < p2 and not (p1 >= min_key1 and p2 <= max_key1)]
330332

331333
for p1, p2 in new_regions:
332-
extra_tables = [t.new_key_bounds(min_key=p1, max_key=p2) for t in (table1, table2)]
333-
ti.submit(self._bisect_and_diff_segments, ti, *extra_tables, info_tree, priority=999)
334+
extra_table1 = table1.new_key_bounds(min_key=p1, max_key=p2, key_types=key_types1)
335+
extra_table2 = table2.new_key_bounds(min_key=p1, max_key=p2, key_types=key_types2)
336+
ti.submit(self._bisect_and_diff_segments, ti, extra_table1, extra_table2, info_tree, priority=999)
334337

335338
return ti
336339

data_diff/table_segment.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import time
2-
from typing import Container, Dict, List, Optional, Tuple
2+
from typing import Container, Dict, List, Optional, Sequence, Tuple
33
import logging
44
from itertools import product
55

@@ -9,7 +9,7 @@
99
from data_diff.utils import safezip, Vector
1010
from data_diff.utils import ArithString, split_space
1111
from data_diff.databases.base import Database
12-
from data_diff.abcs.database_types import DbPath, DbKey, DbTime
12+
from data_diff.abcs.database_types import DbPath, DbKey, DbTime, IKey
1313
from data_diff.schema import RawColumnInfo, Schema, create_schema
1414
from data_diff.queries.extras import Checksum
1515
from data_diff.queries.api import Count, SKIP, table, this, Expr, min_, max_, Code
@@ -205,7 +205,7 @@ def new(self, **kwargs) -> Self:
205205
"""Creates a copy of the instance using 'replace()'"""
206206
return attrs.evolve(self, **kwargs)
207207

208-
def new_key_bounds(self, min_key: Vector, max_key: Vector) -> Self:
208+
def new_key_bounds(self, min_key: Vector, max_key: Vector, *, key_types: Optional[Sequence[IKey]] = None) -> Self:
209209
if self.min_key is not None:
210210
assert self.min_key <= min_key, (self.min_key, min_key)
211211
assert self.min_key < max_key
@@ -214,6 +214,13 @@ def new_key_bounds(self, min_key: Vector, max_key: Vector) -> Self:
214214
assert min_key < self.max_key
215215
assert max_key <= self.max_key
216216

217+
# If asked, enforce the PKs to proper types, mainly to meta-params of the relevant side,
218+
# so that we do not leak e.g. casing of UUIDs from side A to side B and vice versa.
219+
# If not asked, keep the meta-params of the keys as is (assume them already casted).
220+
if key_types is not None:
221+
min_key = Vector(type.make_value(val) for type, val in safezip(key_types, min_key))
222+
max_key = Vector(type.make_value(val) for type, val in safezip(key_types, max_key))
223+
217224
return attrs.evolve(self, min_key=min_key, max_key=max_key)
218225

219226
@property

data_diff/utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,10 @@ def range(self, other: "ArithString", count: int) -> List[Self]:
135135
return [self.new(int=i) for i in checkpoints]
136136

137137

138-
def _any_to_uuid(v: Union[str, int, UUID]) -> UUID:
139-
if isinstance(v, UUID):
138+
def _any_to_uuid(v: Union[str, int, UUID, "ArithUUID"]) -> UUID:
139+
if isinstance(v, ArithUUID):
140+
return v.uuid
141+
elif isinstance(v, UUID):
140142
return v
141143
elif isinstance(v, str):
142144
return UUID(v)

tests/test_mesh.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import uuid
2+
3+
from data_diff.abcs.database_types import String_UUID
4+
from data_diff.databases import MySQL
5+
from data_diff.table_segment import create_mesh_from_points
6+
from data_diff.utils import ArithUUID, safezip
7+
from tests.common import DiffTestCase, table_segment
8+
9+
10+
# We do not need real tables, just any reference to them for proper object creation.
11+
class TestDiffMesh(DiffTestCase):
12+
db_cls = MySQL
13+
14+
def test_meta_parameters_passed_from_coltypes_to_values(self):
15+
key_types1 = [String_UUID(lowercase=True, uppercase=False)]
16+
key_types2 = [String_UUID(lowercase=False, uppercase=True)]
17+
18+
# side B is wider than side A to ensure there are "outer" regions.
19+
min_uuid1 = uuid.UUID("11111111-1111-1111-1111-111111111111")
20+
max_uuid1 = uuid.UUID("EEEEEEEE-EEEE-EEEE-EEEE-EEEEEEEEEEEE")
21+
min_uuid2 = uuid.UUID("00000000-0000-0000-0000-000000000000")
22+
max_uuid2 = uuid.UUID("FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF")
23+
min_key1 = (key_types1[0].make_value(min_uuid1),)
24+
max_key1 = (key_types1[0].make_value(max_uuid1),)
25+
min_key2 = (key_types2[0].make_value(min_uuid2),)
26+
max_key2 = (key_types2[0].make_value(max_uuid2),)
27+
28+
# Verify that we pass the meta-parameters from col types to values:
29+
assert isinstance(min_key1[0], ArithUUID)
30+
assert isinstance(max_key1[0], ArithUUID)
31+
assert isinstance(min_key2[0], ArithUUID)
32+
assert isinstance(max_key2[0], ArithUUID)
33+
assert min_key1[0].uuid == min_uuid1
34+
assert min_key1[0].lowercase == True
35+
assert min_key1[0].uppercase == False
36+
assert max_key1[0].uuid == max_uuid1
37+
assert max_key1[0].lowercase == True
38+
assert max_key1[0].uppercase == False
39+
assert min_key2[0].uuid == min_uuid2
40+
assert min_key2[0].lowercase == False
41+
assert min_key2[0].uppercase == True
42+
assert max_key2[0].uuid == max_uuid2
43+
assert max_key2[0].lowercase == False
44+
assert max_key2[0].uppercase == True
45+
46+
def test_meta_parameters_left_as_is_if_not_casted(self):
47+
table1 = table_segment(self.connection, self.table_src_path, "id", "timestamp", case_sensitive=False)
48+
key_types1 = [String_UUID(lowercase=True, uppercase=False)]
49+
50+
min_uuid1 = uuid.UUID("11111111-1111-1111-1111-111111111111")
51+
max_uuid1 = uuid.UUID("EEEEEEEE-EEEE-EEEE-EEEE-EEEEEEEEEEEE")
52+
min_key1 = (key_types1[0].make_value(min_uuid1),)
53+
max_key1 = (key_types1[0].make_value(max_uuid1),)
54+
55+
btable1 = table1.new_key_bounds(min_key=min_key1, max_key=max_key1)
56+
assert btable1.min_key[0] is min_key1[0] # by identity, not by equality
57+
assert btable1.max_key[0] is max_key1[0] # by identity, not by equality
58+
59+
def test_mesh_keys_meta_parameters_preserved(self):
60+
table1 = table_segment(self.connection, self.table_src_path, "id", "timestamp", case_sensitive=False)
61+
table2 = table_segment(self.connection, self.table_src_path, "id", "timestamp", case_sensitive=False)
62+
key_types1 = [String_UUID(lowercase=True, uppercase=False)]
63+
key_types2 = [String_UUID(lowercase=False, uppercase=True)]
64+
65+
# side B is wider than side A to ensure there are "outer" regions.
66+
min_uuid1 = uuid.UUID("11111111-1111-1111-1111-111111111111")
67+
max_uuid1 = uuid.UUID("EEEEEEEE-EEEE-EEEE-EEEE-EEEEEEEEEEEE")
68+
min_uuid2 = uuid.UUID("00000000-0000-0000-0000-000000000000")
69+
max_uuid2 = uuid.UUID("FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF")
70+
min_key1 = (key_types1[0].make_value(min_uuid1),)
71+
max_key1 = (key_types1[0].make_value(max_uuid1),)
72+
min_key2 = (key_types2[0].make_value(min_uuid2),)
73+
max_key2 = (key_types2[0].make_value(max_uuid2),)
74+
75+
# This is what TableDiffer._bisect_and_diff_tables() does, precisely (yes, using key1!):
76+
btable1 = table1.new_key_bounds(min_key=min_key1, max_key=max_key1, key_types=key_types1)
77+
btable2 = table2.new_key_bounds(min_key=min_key1, max_key=max_key1, key_types=key_types2)
78+
79+
# Verify that both sides have proper (the side-specific) pk meta-parameters:
80+
assert btable1.min_key[0].uuid == min_uuid1
81+
assert btable1.min_key[0].lowercase == True
82+
assert btable1.min_key[0].uppercase == False
83+
assert btable1.max_key[0].uuid == max_uuid1
84+
assert btable1.max_key[0].lowercase == True
85+
assert btable1.max_key[0].uppercase == False
86+
assert btable2.min_key[0].uuid == min_uuid1
87+
assert btable2.min_key[0].lowercase == False
88+
assert btable2.min_key[0].uppercase == True
89+
assert btable2.max_key[0].uuid == max_uuid1
90+
assert btable2.max_key[0].lowercase == False
91+
assert btable2.max_key[0].uppercase == True
92+
93+
# This is what TableDiffer._bisect_and_diff_tables() does, precisely:
94+
points = [list(sorted(p)) for p in safezip(min_key1, min_key2, max_key1, max_key2)]
95+
box_mesh = create_mesh_from_points(*points)
96+
new_regions = [(p1, p2) for p1, p2 in box_mesh if p1 < p2 and not (p1 >= min_key1 and p2 <= max_key1)]
97+
extra_tables = [
98+
(
99+
table1.new_key_bounds(min_key=p1, max_key=p2, key_types=key_types1),
100+
table2.new_key_bounds(min_key=p1, max_key=p2, key_types=key_types2),
101+
)
102+
for p1, p2 in new_regions
103+
]
104+
105+
# Verify that extra ("outer") segments have the proper pk meta-parameters:
106+
assert len(extra_tables) == 2
107+
108+
assert extra_tables[0][0].min_key[0].uuid == min_uuid2
109+
assert extra_tables[0][0].min_key[0].lowercase == True
110+
assert extra_tables[0][0].min_key[0].uppercase == False
111+
assert extra_tables[0][0].max_key[0].uuid == min_uuid1
112+
assert extra_tables[0][0].max_key[0].lowercase == True
113+
assert extra_tables[0][0].max_key[0].uppercase == False
114+
assert extra_tables[0][1].min_key[0].uuid == min_uuid2
115+
assert extra_tables[0][1].min_key[0].lowercase == False
116+
assert extra_tables[0][1].min_key[0].uppercase == True
117+
assert extra_tables[0][1].max_key[0].uuid == min_uuid1
118+
assert extra_tables[0][1].max_key[0].lowercase == False
119+
assert extra_tables[0][1].max_key[0].uppercase == True
120+
121+
assert extra_tables[1][0].min_key[0].uuid == max_uuid1
122+
assert extra_tables[1][0].min_key[0].lowercase == True
123+
assert extra_tables[1][0].min_key[0].uppercase == False
124+
assert extra_tables[1][0].max_key[0].uuid == max_uuid2
125+
assert extra_tables[1][0].max_key[0].lowercase == True
126+
assert extra_tables[1][0].max_key[0].uppercase == False
127+
assert extra_tables[1][1].min_key[0].uuid == max_uuid1
128+
assert extra_tables[1][1].min_key[0].lowercase == False
129+
assert extra_tables[1][1].min_key[0].uppercase == True
130+
assert extra_tables[1][1].max_key[0].uuid == max_uuid2
131+
assert extra_tables[1][1].max_key[0].lowercase == False
132+
assert extra_tables[1][1].max_key[0].uppercase == True

0 commit comments

Comments
 (0)