Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 2327d44

Browse files
authored
Merge pull request #647 from stefankeidel/feat/stats-for-dbt
rudimentary support for --stats in --dbt --json mode
2 parents 38980f4 + f473546 commit 2327d44

File tree

4 files changed

+117
-17
lines changed

4 files changed

+117
-17
lines changed

data_diff/__main__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,7 @@ def main(conf, run, **kw):
317317
json_output=kw["json_output"],
318318
state=state,
319319
where_flag=kw["where"],
320+
stats_flag=kw["stats"],
320321
columns_flag=kw["columns"],
321322
)
322323
else:

data_diff/dbt.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ class TDiffVars(pydantic.BaseModel):
6262
include_columns: List[str]
6363
exclude_columns: List[str]
6464
dbt_model: Optional[str] = None
65+
stats_flag: bool = False
6566

6667

6768
def dbt_diff(
@@ -73,6 +74,7 @@ def dbt_diff(
7374
state: Optional[str] = None,
7475
log_status_handler: Optional[LogStatusHandler] = None,
7576
where_flag: Optional[str] = None,
77+
stats_flag: bool = False,
7678
columns_flag: Optional[Tuple[str]] = None,
7779
) -> None:
7880
print_version_info()
@@ -113,7 +115,7 @@ def dbt_diff(
113115
if log_status_handler:
114116
log_status_handler.set_prefix(f"Diffing {model.alias} \n")
115117

116-
diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag, columns_flag)
118+
diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag, stats_flag, columns_flag)
117119

118120
# we won't always have a prod path when using state
119121
# when the model DNE in prod manifest, skip the model diff
@@ -165,6 +167,7 @@ def _get_diff_vars(
165167
config: TDatadiffConfig,
166168
model,
167169
where_flag: Optional[str] = None,
170+
stats_flag: bool = False,
168171
columns_flag: Optional[Tuple[str]] = None,
169172
) -> TDiffVars:
170173
cli_columns = list(columns_flag) if columns_flag else []
@@ -200,6 +203,7 @@ def _get_diff_vars(
200203
where_filter=where_flag or datadiff_model_config.where_filter,
201204
include_columns=cli_columns or datadiff_model_config.include_columns,
202205
exclude_columns=[] if cli_columns else datadiff_model_config.exclude_columns,
206+
stats_flag=stats_flag,
203207
)
204208

205209

@@ -338,6 +342,7 @@ def _local_diff(diff_vars: TDiffVars, json_output: bool = False) -> None:
338342
"removed": columns_removed,
339343
"changed": columns_type_changed,
340344
},
345+
stats_only=diff_vars.stats_flag,
341346
)
342347
),
343348
flush=True,

data_diff/format.py

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def jsonify(
4040
dataset2_columns: Columns,
4141
columns_diff: Dict[str, List[str]],
4242
with_summary: bool = False,
43+
stats_only: bool = False,
4344
) -> "JsonDiff":
4445
"""
4546
Converts the diff result into a JSON-serializable format.
@@ -53,21 +54,13 @@ def jsonify(
5354
t1_exclusive_rows = []
5455
t2_exclusive_rows = []
5556
diff_rows = []
57+
rows = None
5658
schema = [field for field, _ in diff_info.diff_schema]
5759

5860
t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema)
5961

60-
diff_rows_jsonified = []
61-
for row in diff_rows:
62-
diff_rows_jsonified.append(_jsonify_diff(row, key_columns))
63-
64-
t1_exclusive_rows_jsonified = []
65-
for row in t1_exclusive_rows:
66-
t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
67-
68-
t2_exclusive_rows_jsonified = []
69-
for row in t2_exclusive_rows:
70-
t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
62+
if not stats_only:
63+
rows = _make_rows_diff(t1_exclusive_rows, t2_exclusive_rows, diff_rows, key_columns)
7164

7265
summary = None
7366
if with_summary:
@@ -87,10 +80,7 @@ def jsonify(
8780
model=dbt_model,
8881
dataset1=list(table1.table_path),
8982
dataset2=list(table2.table_path),
90-
rows=RowsDiff(
91-
exclusive=ExclusiveDiff(dataset1=t1_exclusive_rows_jsonified, dataset2=t2_exclusive_rows_jsonified),
92-
diff=diff_rows_jsonified,
93-
),
83+
rows=rows,
9484
summary=summary,
9585
columns=columns,
9686
).json()
@@ -228,7 +218,7 @@ class JsonDiff:
228218
model: str
229219
dataset1: List[str]
230220
dataset2: List[str]
231-
rows: RowsDiff
221+
rows: Optional[RowsDiff]
232222
summary: Optional[JsonDiffSummary]
233223
columns: Optional[JsonColumnsSummary]
234224

@@ -259,6 +249,33 @@ def _group_rows(
259249
return t1_exclusive_rows, t2_exclusive_rows, diff_rows
260250

261251

252+
def _make_rows_diff(
253+
t1_exclusive_rows: List[Dict[str, Any]],
254+
t2_exclusive_rows: List[Dict[str, Any]],
255+
diff_rows: List[Dict[str, Any]],
256+
key_columns: List[str]
257+
) -> RowsDiff:
258+
diff_rows_jsonified = []
259+
for row in diff_rows:
260+
diff_rows_jsonified.append(_jsonify_diff(row, key_columns))
261+
262+
t1_exclusive_rows_jsonified = []
263+
for row in t1_exclusive_rows:
264+
t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
265+
266+
t2_exclusive_rows_jsonified = []
267+
for row in t2_exclusive_rows:
268+
t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
269+
270+
return RowsDiff(
271+
exclusive=ExclusiveDiff(
272+
dataset1=t1_exclusive_rows_jsonified,
273+
dataset2=t2_exclusive_rows_jsonified
274+
),
275+
diff=diff_rows_jsonified,
276+
)
277+
278+
262279
def _jsonify_diff(row: Dict[str, Any], key_columns: List[str]) -> Dict[str, JsonDiffRowValue]:
263280
columns = collections.defaultdict(dict)
264281
for field, value in row.items():

tests/test_format.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,83 @@ def test_jsonify_diff(self):
9595
},
9696
)
9797

98+
def test_jsonify_no_stats(self):
99+
diff = DiffResultWrapper(
100+
info_tree=InfoTree(
101+
info=SegmentInfo(
102+
tables=[
103+
TableSegment(table_path=("db", "schema", "table1"), key_columns=("id",), database=Database()),
104+
TableSegment(table_path=("db", "schema", "table2"), key_columns=("id",), database=Database()),
105+
],
106+
diff_schema=(
107+
("is_exclusive_a", bool),
108+
("is_exclusive_b", bool),
109+
("is_diff_id", int),
110+
("is_diff_value", int),
111+
("id_a", str),
112+
("id_b", str),
113+
("value_a", str),
114+
("value_b", str),
115+
),
116+
diff=[
117+
(False, False, 0, 1, "1", "1", "3", "201"),
118+
(True, False, 1, 1, "2", None, "4", None),
119+
(False, True, 1, 1, None, "3", None, "202"),
120+
],
121+
)
122+
),
123+
diff=[],
124+
stats={},
125+
)
126+
json_diff = jsonify(
127+
diff,
128+
dbt_model="my_model",
129+
dataset1_columns=[
130+
("id", "NUMBER", Integer()),
131+
("value", "NUMBER", Integer()),
132+
],
133+
dataset2_columns=[
134+
("id", "NUMBER", Integer()),
135+
("value", "NUMBER", Integer()),
136+
],
137+
columns_diff={
138+
"added": [],
139+
"removed": [],
140+
"typeChanged": [],
141+
},
142+
stats_only=True
143+
)
144+
145+
self.assertEqual(
146+
json_diff,
147+
{
148+
"version": "1.1.0",
149+
"status": "success",
150+
"result": "different",
151+
"model": "my_model",
152+
"dataset1": ["db", "schema", "table1"],
153+
"dataset2": ["db", "schema", "table2"],
154+
"rows": None,
155+
"columns": {
156+
"dataset1": [
157+
{"name": "id", "type": "NUMBER", "kind": "integer"},
158+
{"name": "value", "type": "NUMBER", "kind": "integer"},
159+
],
160+
"dataset2": [
161+
{"name": "id", "type": "NUMBER", "kind": "integer"},
162+
{"name": "value", "type": "NUMBER", "kind": "integer"},
163+
],
164+
"primaryKey": ["id"],
165+
"exclusive": {
166+
"dataset1": [],
167+
"dataset2": [],
168+
},
169+
"typeChanged": [],
170+
},
171+
"summary": None,
172+
},
173+
)
174+
98175
def test_jsonify_diff_no_difeference(self):
99176
diff = DiffResultWrapper(
100177
info_tree=InfoTree(

0 commit comments

Comments
 (0)