diff --git a/data_diff/__main__.py b/data_diff/__main__.py index 0736aed5..9f41cbd9 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -317,6 +317,7 @@ def main(conf, run, **kw): json_output=kw["json_output"], state=state, where_flag=kw["where"], + stats_flag=kw["stats"], columns_flag=kw["columns"], ) else: diff --git a/data_diff/dbt.py b/data_diff/dbt.py index 9b3b0404..e5c61bda 100644 --- a/data_diff/dbt.py +++ b/data_diff/dbt.py @@ -62,6 +62,7 @@ class TDiffVars(pydantic.BaseModel): include_columns: List[str] exclude_columns: List[str] dbt_model: Optional[str] = None + stats_flag: bool = False def dbt_diff( @@ -73,6 +74,7 @@ def dbt_diff( state: Optional[str] = None, log_status_handler: Optional[LogStatusHandler] = None, where_flag: Optional[str] = None, + stats_flag: bool = False, columns_flag: Optional[Tuple[str]] = None, ) -> None: print_version_info() @@ -113,7 +115,7 @@ def dbt_diff( if log_status_handler: log_status_handler.set_prefix(f"Diffing {model.alias} \n") - diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag, columns_flag) + diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag, stats_flag, columns_flag) # we won't always have a prod path when using state # when the model DNE in prod manifest, skip the model diff @@ -165,6 +167,7 @@ def _get_diff_vars( config: TDatadiffConfig, model, where_flag: Optional[str] = None, + stats_flag: bool = False, columns_flag: Optional[Tuple[str]] = None, ) -> TDiffVars: cli_columns = list(columns_flag) if columns_flag else [] @@ -200,6 +203,7 @@ def _get_diff_vars( where_filter=where_flag or datadiff_model_config.where_filter, include_columns=cli_columns or datadiff_model_config.include_columns, exclude_columns=[] if cli_columns else datadiff_model_config.exclude_columns, + stats_flag=stats_flag, ) @@ -338,6 +342,7 @@ def _local_diff(diff_vars: TDiffVars, json_output: bool = False) -> None: "removed": columns_removed, "changed": columns_type_changed, }, + stats_only=diff_vars.stats_flag, ) ), flush=True, diff --git a/data_diff/format.py b/data_diff/format.py index d9d971ac..bfeb0b1e 100644 --- a/data_diff/format.py +++ b/data_diff/format.py @@ -40,6 +40,7 @@ def jsonify( dataset2_columns: Columns, columns_diff: Dict[str, List[str]], with_summary: bool = False, + stats_only: bool = False, ) -> "JsonDiff": """ Converts the diff result into a JSON-serializable format. @@ -53,21 +54,13 @@ def jsonify( t1_exclusive_rows = [] t2_exclusive_rows = [] diff_rows = [] + rows = None schema = [field for field, _ in diff_info.diff_schema] t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema) - diff_rows_jsonified = [] - for row in diff_rows: - diff_rows_jsonified.append(_jsonify_diff(row, key_columns)) - - t1_exclusive_rows_jsonified = [] - for row in t1_exclusive_rows: - t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) - - t2_exclusive_rows_jsonified = [] - for row in t2_exclusive_rows: - t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) + if not stats_only: + rows = _make_rows_diff(t1_exclusive_rows, t2_exclusive_rows, diff_rows, key_columns) summary = None if with_summary: @@ -87,10 +80,7 @@ def jsonify( model=dbt_model, dataset1=list(table1.table_path), dataset2=list(table2.table_path), - rows=RowsDiff( - exclusive=ExclusiveDiff(dataset1=t1_exclusive_rows_jsonified, dataset2=t2_exclusive_rows_jsonified), - diff=diff_rows_jsonified, - ), + rows=rows, summary=summary, columns=columns, ).json() @@ -228,7 +218,7 @@ class JsonDiff: model: str dataset1: List[str] dataset2: List[str] - rows: RowsDiff + rows: Optional[RowsDiff] summary: Optional[JsonDiffSummary] columns: Optional[JsonColumnsSummary] @@ -259,6 +249,33 @@ def _group_rows( return t1_exclusive_rows, t2_exclusive_rows, diff_rows +def _make_rows_diff( + t1_exclusive_rows: List[Dict[str, Any]], + t2_exclusive_rows: List[Dict[str, Any]], + diff_rows: List[Dict[str, Any]], + key_columns: List[str] +) -> RowsDiff: + diff_rows_jsonified = [] + for row in diff_rows: + diff_rows_jsonified.append(_jsonify_diff(row, key_columns)) + + t1_exclusive_rows_jsonified = [] + for row in t1_exclusive_rows: + t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) + + t2_exclusive_rows_jsonified = [] + for row in t2_exclusive_rows: + t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) + + return RowsDiff( + exclusive=ExclusiveDiff( + dataset1=t1_exclusive_rows_jsonified, + dataset2=t2_exclusive_rows_jsonified + ), + diff=diff_rows_jsonified, + ) + + def _jsonify_diff(row: Dict[str, Any], key_columns: List[str]) -> Dict[str, JsonDiffRowValue]: columns = collections.defaultdict(dict) for field, value in row.items(): diff --git a/tests/test_format.py b/tests/test_format.py index 5de8feec..0aa8ee8e 100644 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -95,6 +95,83 @@ def test_jsonify_diff(self): }, ) + def test_jsonify_no_stats(self): + diff = DiffResultWrapper( + info_tree=InfoTree( + info=SegmentInfo( + tables=[ + TableSegment(table_path=("db", "schema", "table1"), key_columns=("id",), database=Database()), + TableSegment(table_path=("db", "schema", "table2"), key_columns=("id",), database=Database()), + ], + diff_schema=( + ("is_exclusive_a", bool), + ("is_exclusive_b", bool), + ("is_diff_id", int), + ("is_diff_value", int), + ("id_a", str), + ("id_b", str), + ("value_a", str), + ("value_b", str), + ), + diff=[ + (False, False, 0, 1, "1", "1", "3", "201"), + (True, False, 1, 1, "2", None, "4", None), + (False, True, 1, 1, None, "3", None, "202"), + ], + ) + ), + diff=[], + stats={}, + ) + json_diff = jsonify( + diff, + dbt_model="my_model", + dataset1_columns=[ + ("id", "NUMBER", Integer()), + ("value", "NUMBER", Integer()), + ], + dataset2_columns=[ + ("id", "NUMBER", Integer()), + ("value", "NUMBER", Integer()), + ], + columns_diff={ + "added": [], + "removed": [], + "typeChanged": [], + }, + stats_only=True + ) + + self.assertEqual( + json_diff, + { + "version": "1.1.0", + "status": "success", + "result": "different", + "model": "my_model", + "dataset1": ["db", "schema", "table1"], + "dataset2": ["db", "schema", "table2"], + "rows": None, + "columns": { + "dataset1": [ + {"name": "id", "type": "NUMBER", "kind": "integer"}, + {"name": "value", "type": "NUMBER", "kind": "integer"}, + ], + "dataset2": [ + {"name": "id", "type": "NUMBER", "kind": "integer"}, + {"name": "value", "type": "NUMBER", "kind": "integer"}, + ], + "primaryKey": ["id"], + "exclusive": { + "dataset1": [], + "dataset2": [], + }, + "typeChanged": [], + }, + "summary": None, + }, + ) + def test_jsonify_diff_no_difeference(self): diff = DiffResultWrapper( info_tree=InfoTree(