Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 73c7901

Browse files
authored
Merge pull request #838 from datafold/compiosite-pks
Group rows by all columns of composite PKs
2 parents 537d73b + 530a1f6 commit 73c7901

File tree

1 file changed

+10
-5
lines changed

1 file changed

+10
-5
lines changed

data_diff/hashdiff_tables.py

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323

2424
# Just for local readability: TODO: later switch to real type declarations of these.
2525
_Op = Literal["+", "-"]
26-
_PK = Any
26+
_PK = Sequence[Any]
2727
_Row = Tuple[Any]
2828

2929

@@ -34,24 +34,27 @@ def diff_sets(
3434
json_cols: dict = None,
3535
columns1: Sequence[str],
3636
columns2: Sequence[str],
37+
key_columns1: Sequence[str],
38+
key_columns2: Sequence[str],
3739
ignored_columns1: Collection[str],
3840
ignored_columns2: Collection[str],
3941
) -> Iterator:
4042
# Differ only by columns of interest (PKs+relevant-ignored). But yield with ignored ones!
4143
sa: Set[_Row] = {tuple(val for col, val in safezip(columns1, row) if col not in ignored_columns1) for row in a}
4244
sb: Set[_Row] = {tuple(val for col, val in safezip(columns2, row) if col not in ignored_columns2) for row in b}
4345

44-
# The first item is always the key (see TableDiffer.relevant_columns)
45-
# TODO update when we add compound keys to hashdiff
46+
# The first items are always the PK (see TableSegment.relevant_columns)
4647
diffs_by_pks: Dict[_PK, List[Tuple[_Op, _Row]]] = defaultdict(list)
4748
for row in a:
49+
pk: _PK = tuple(val for col, val in zip(key_columns1, row))
4850
cutrow: _Row = tuple(val for col, val in zip(columns1, row) if col not in ignored_columns1)
4951
if cutrow not in sb:
50-
diffs_by_pks[row[0]].append(("-", row))
52+
diffs_by_pks[pk].append(("-", row))
5153
for row in b:
54+
pk: _PK = tuple(val for col, val in zip(key_columns2, row))
5255
cutrow: _Row = tuple(val for col, val in zip(columns2, row) if col not in ignored_columns2)
5356
if cutrow not in sa:
54-
diffs_by_pks[row[0]].append(("+", row))
57+
diffs_by_pks[pk].append(("+", row))
5558

5659
warned_diff_cols = set()
5760
for diffs in (diffs_by_pks[pk] for pk in sorted(diffs_by_pks)):
@@ -232,6 +235,8 @@ def _bisect_and_diff_segments(
232235
json_cols=json_cols,
233236
columns1=table1.relevant_columns,
234237
columns2=table2.relevant_columns,
238+
key_columns1=table1.key_columns,
239+
key_columns2=table2.key_columns,
235240
ignored_columns1=self.ignored_columns1,
236241
ignored_columns2=self.ignored_columns2,
237242
)

0 commit comments

Comments
 (0)