Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Rudimentary support for --stats in --dbt --json mode #647

Merged
11 commits merged on Aug 14, 2023
1 change: 1 addition & 0 deletions data_diff/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ def main(conf, run, **kw):
json_output=kw["json_output"],
state=state,
where_flag=kw["where"],
stats_flag=kw["stats"],
columns_flag=kw["columns"],
)
else:
Expand Down
7 changes: 6 additions & 1 deletion data_diff/dbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class TDiffVars(pydantic.BaseModel):
include_columns: List[str]
exclude_columns: List[str]
dbt_model: Optional[str] = None
stats_flag: bool = False


def dbt_diff(
Expand All @@ -73,6 +74,7 @@ def dbt_diff(
state: Optional[str] = None,
log_status_handler: Optional[LogStatusHandler] = None,
where_flag: Optional[str] = None,
stats_flag: bool = False,
columns_flag: Optional[Tuple[str]] = None,
) -> None:
print_version_info()
Expand Down Expand Up @@ -113,7 +115,7 @@ def dbt_diff(
if log_status_handler:
log_status_handler.set_prefix(f"Diffing {model.alias} \n")

diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag, columns_flag)
diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag, stats_flag, columns_flag)

# we won't always have a prod path when using state
# when the model DNE in prod manifest, skip the model diff
Expand Down Expand Up @@ -165,6 +167,7 @@ def _get_diff_vars(
config: TDatadiffConfig,
model,
where_flag: Optional[str] = None,
stats_flag: bool = False,
columns_flag: Optional[Tuple[str]] = None,
) -> TDiffVars:
cli_columns = list(columns_flag) if columns_flag else []
Expand Down Expand Up @@ -200,6 +203,7 @@ def _get_diff_vars(
where_filter=where_flag or datadiff_model_config.where_filter,
include_columns=cli_columns or datadiff_model_config.include_columns,
exclude_columns=[] if cli_columns else datadiff_model_config.exclude_columns,
stats_flag=stats_flag,
)


Expand Down Expand Up @@ -338,6 +342,7 @@ def _local_diff(diff_vars: TDiffVars, json_output: bool = False) -> None:
"removed": columns_removed,
"changed": columns_type_changed,
},
stats_only=diff_vars.stats_flag,
)
),
flush=True,
Expand Down
36 changes: 22 additions & 14 deletions data_diff/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def jsonify(
dataset2_columns: Columns,
columns_diff: Dict[str, List[str]],
with_summary: bool = False,
stats_only: bool = False,
) -> "JsonDiff":
"""
Converts the diff result into a JSON-serializable format.
Expand All @@ -53,21 +54,31 @@ def jsonify(
t1_exclusive_rows = []
t2_exclusive_rows = []
diff_rows = []
rows = None
schema = [field for field, _ in diff_info.diff_schema]

t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema)

diff_rows_jsonified = []
for row in diff_rows:
diff_rows_jsonified.append(_jsonify_diff(row, key_columns))
if not stats_only:
diff_rows_jsonified = []
for row in diff_rows:
diff_rows_jsonified.append(_jsonify_diff(row, key_columns))

t1_exclusive_rows_jsonified = []
for row in t1_exclusive_rows:
t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
t1_exclusive_rows_jsonified = []
for row in t1_exclusive_rows:
t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))

t2_exclusive_rows_jsonified = []
for row in t2_exclusive_rows:
t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
t2_exclusive_rows_jsonified = []
for row in t2_exclusive_rows:
t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))

rows = RowsDiff(
exclusive=ExclusiveDiff(
dataset1=t1_exclusive_rows_jsonified,
dataset2=t2_exclusive_rows_jsonified
),
diff=diff_rows_jsonified,
)

summary = None
if with_summary:
Expand All @@ -87,10 +98,7 @@ def jsonify(
model=dbt_model,
dataset1=list(table1.table_path),
dataset2=list(table2.table_path),
rows=RowsDiff(
exclusive=ExclusiveDiff(dataset1=t1_exclusive_rows_jsonified, dataset2=t2_exclusive_rows_jsonified),
diff=diff_rows_jsonified,
),
rows=rows,
summary=summary,
columns=columns,
).json()
Expand Down Expand Up @@ -228,7 +236,7 @@ class JsonDiff:
model: str
dataset1: List[str]
dataset2: List[str]
rows: RowsDiff
rows: Optional[RowsDiff]
summary: Optional[JsonDiffSummary]
columns: Optional[JsonColumnsSummary]

Expand Down