From a75c2c980e4167b4ca38032b4670552c13aaaa4e Mon Sep 17 00:00:00 2001 From: Stefan Keidel <stefan.keidel@lichtblick.de> Date: Thu, 13 Jul 2023 13:13:04 +0200 Subject: [PATCH 1/7] feat: add --stats flag to --dbt --- data_diff/__main__.py | 1 + data_diff/dbt.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/data_diff/__main__.py b/data_diff/__main__.py index dfd12577..3feae22d 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -317,6 +317,7 @@ def main(conf, run, **kw): json_output=kw["json_output"], state=state, where_flag=kw["where"], + stats_flag=kw["stats"], ) else: return _data_diff( diff --git a/data_diff/dbt.py b/data_diff/dbt.py index 212c0fc8..6431c48e 100644 --- a/data_diff/dbt.py +++ b/data_diff/dbt.py @@ -60,6 +60,7 @@ class TDiffVars(pydantic.BaseModel): include_columns: List[str] exclude_columns: List[str] dbt_model: Optional[str] = None + stats_flag: bool = False def dbt_diff( @@ -71,6 +72,7 @@ def dbt_diff( state: Optional[str] = None, log_status_handler: Optional[LogStatusHandler] = None, where_flag: Optional[str] = None, + stats_flag: bool = False, ) -> None: print_version_info() diff_threads = [] @@ -110,7 +112,7 @@ def dbt_diff( if log_status_handler: log_status_handler.set_prefix(f"Diffing {model.alias} \n") - diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag) + diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag, stats_flag) # we won't always have a prod path when using state # when the model DNE in prod manifest, skip the model diff @@ -160,6 +162,7 @@ def _get_diff_vars( config: TDatadiffConfig, model, where_flag: Optional[str] = None, + stats_flag: bool = False, ) -> TDiffVars: dev_database = model.database dev_schema = model.schema_ @@ -193,6 +196,7 @@ def _get_diff_vars( where_filter=where_flag or datadiff_model_config.where_filter, include_columns=datadiff_model_config.include_columns, exclude_columns=datadiff_model_config.exclude_columns, + stats_flag=stats_flag, ) From 11200b572df9b3f4c384145c1d2d953613ad0bf2 Mon Sep 17 00:00:00 2001 From: Stefan Keidel <stefan.keidel@lichtblick.de> Date: Fri, 14 Jul 2023 13:30:11 +0200 Subject: [PATCH 2/7] feat: add --stats support for --json in --dbt mode --- data_diff/__main__.py | 1 + data_diff/dbt.py | 1 + data_diff/format.py | 36 ++++++++++++++++++++++-------------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/data_diff/__main__.py b/data_diff/__main__.py index 3feae22d..4047483d 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -1,4 +1,5 @@ from copy import deepcopy +from pudb import set_trace; set_trace(paused=False) from datetime import datetime import os import sys diff --git a/data_diff/dbt.py b/data_diff/dbt.py index 6431c48e..149080c0 100644 --- a/data_diff/dbt.py +++ b/data_diff/dbt.py @@ -323,6 +323,7 @@ def _local_diff(diff_vars: TDiffVars, json_output: bool = False) -> None: "removed": columns_removed, "changed": columns_type_changed, }, + stats_only=diff_vars.stats_flag, ) ), flush=True, diff --git a/data_diff/format.py b/data_diff/format.py index 5a16f274..aa4e8d0f 100644 --- a/data_diff/format.py +++ b/data_diff/format.py @@ -20,6 +20,7 @@ def jsonify( dbt_model: str, with_summary: bool = False, with_columns: Optional[Dict[str, List[str]]] = None, + stats_only: bool = False, ) -> "JsonDiff": """ Converts the diff result into a JSON-serializable format. @@ -33,21 +34,31 @@ def jsonify( t1_exclusive_rows = [] t2_exclusive_rows = [] diff_rows = [] + rows = None schema = [field for field, _ in diff_info.diff_schema] t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema) - diff_rows_jsonified = [] - for row in diff_rows: - diff_rows_jsonified.append(_jsonify_diff(row, key_columns)) + if not stats_only: + diff_rows_jsonified = [] + for row in diff_rows: + diff_rows_jsonified.append(_jsonify_diff(row, key_columns)) - t1_exclusive_rows_jsonified = [] - for row in t1_exclusive_rows: - t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) + t1_exclusive_rows_jsonified = [] + for row in t1_exclusive_rows: + t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) - t2_exclusive_rows_jsonified = [] - for row in t2_exclusive_rows: - t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) + t2_exclusive_rows_jsonified = [] + for row in t2_exclusive_rows: + t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) + + rows = RowsDiff( + exclusive=ExclusiveDiff( + dataset1=t1_exclusive_rows_jsonified, + dataset2=t2_exclusive_rows_jsonified + ), + diff=diff_rows_jsonified, + ) summary = None if with_summary: @@ -70,10 +81,7 @@ def jsonify( model=dbt_model, dataset1=list(table1.table_path), dataset2=list(table2.table_path), - rows=RowsDiff( - exclusive=ExclusiveDiff(dataset1=t1_exclusive_rows_jsonified, dataset2=t2_exclusive_rows_jsonified), - diff=diff_rows_jsonified, - ), + rows=rows, summary=summary, columns=columns, ).json() @@ -175,7 +183,7 @@ class JsonDiff: model: str dataset1: List[str] dataset2: List[str] - rows: RowsDiff + rows: Optional[RowsDiff] summary: Optional[JsonDiffSummary] columns: Optional[JsonColumnsSummary] From 72b1ea117580377154f47cb61b0f84b92860dab9 Mon Sep 17 00:00:00 2001 From: Stefan Keidel <stefan.keidel@lichtblick.de> Date: Fri, 14 Jul 2023 13:34:36 +0200 Subject: [PATCH 3/7] fix: remove debugger import --- data_diff/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/data_diff/__main__.py b/data_diff/__main__.py index 4047483d..3feae22d 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -1,5 +1,4 @@ from copy import deepcopy -from pudb import set_trace; set_trace(paused=False) from datetime import datetime import os import sys From 63fff1c092ba65d3fd0d09c674da4f9877767a12 Mon Sep 17 00:00:00 2001 From: Dan <daniel@datafold.com> Date: Fri, 11 Aug 2023 14:17:55 -0600 Subject: [PATCH 4/7] add test case for --json --stats --- tests/test_format.py | 77 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tests/test_format.py b/tests/test_format.py index 5de8feec..0aa8ee8e 100644 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -95,6 +95,83 @@ def test_jsonify_diff(self): }, ) + def test_jsonify_no_stats(self): + diff = DiffResultWrapper( + info_tree=InfoTree( + info=SegmentInfo( + tables=[ + TableSegment(table_path=("db", "schema", "table1"), key_columns=("id",), database=Database()), + TableSegment(table_path=("db", "schema", "table2"), key_columns=("id",), database=Database()), + ], + diff_schema=( + ("is_exclusive_a", bool), + ("is_exclusive_b", bool), + ("is_diff_id", int), + ("is_diff_value", int), + ("id_a", str), + ("id_b", str), + ("value_a", str), + ("value_b", str), + ), + diff=[ + (False, False, 0, 1, "1", "1", "3", "201"), + (True, False, 1, 1, "2", None, "4", None), + (False, True, 1, 1, None, "3", None, "202"), + ], + ) + ), + diff=[], + stats={}, + ) + json_diff = jsonify( + diff, + dbt_model="my_model", + dataset1_columns=[ + ("id", "NUMBER", Integer()), + ("value", "NUMBER", Integer()), + ], + dataset2_columns=[ + ("id", "NUMBER", Integer()), + ("value", "NUMBER", Integer()), + ], + columns_diff={ + "added": [], + "removed": [], + "typeChanged": [], + }, + stats_only=True + ) + + self.assertEqual( + json_diff, + { + "version": "1.1.0", + "status": "success", + "result": "different", + "model": "my_model", + "dataset1": ["db", "schema", "table1"], + "dataset2": ["db", "schema", "table2"], + "rows": None, + "columns": { + "dataset1": [ + {"name": "id", "type": "NUMBER", "kind": "integer"}, + {"name": "value", "type": "NUMBER", "kind": "integer"}, + ], + "dataset2": [ + {"name": "id", "type": "NUMBER", "kind": "integer"}, + {"name": "value", "type": "NUMBER", "kind": "integer"}, + ], + "primaryKey": ["id"], + "exclusive": { + "dataset1": [], + "dataset2": [], + }, + "typeChanged": [], + }, + "summary": None, + }, + ) + def test_jsonify_diff_no_difeference(self): diff = DiffResultWrapper( info_tree=InfoTree( From ccdf54b0bf553b4c5223e053deb62fc2eb172838 Mon Sep 17 00:00:00 2001 From: Stefan Keidel <stefan.keidel@lichtblick.de> Date: Mon, 14 Aug 2023 07:37:57 +0200 Subject: [PATCH 5/7] Lift rows diff into separate function --- data_diff/format.py | 46 ++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/data_diff/format.py b/data_diff/format.py index 9541cc6c..52ee1339 100644 --- a/data_diff/format.py +++ b/data_diff/format.py @@ -60,25 +60,7 @@ def jsonify( t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema) if not stats_only: - diff_rows_jsonified = [] - for row in diff_rows: - diff_rows_jsonified.append(_jsonify_diff(row, key_columns)) - - t1_exclusive_rows_jsonified = [] - for row in t1_exclusive_rows: - t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) - - t2_exclusive_rows_jsonified = [] - for row in t2_exclusive_rows: - t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) - - rows = RowsDiff( - exclusive=ExclusiveDiff( - dataset1=t1_exclusive_rows_jsonified, - dataset2=t2_exclusive_rows_jsonified - ), - diff=diff_rows_jsonified, - ) + rows = _make_rows_diff(t1_exclusive_rows, t2_exclusive_rows, diff_rows) summary = None if with_summary: @@ -267,6 +249,32 @@ def _group_rows( return t1_exclusive_rows, t2_exclusive_rows, diff_rows +def _make_rows_diff( + t1_exclusive_rows: List[Dict[str, Any]], + t2_exclusive_rows: List[Dict[str, Any]], + diff_rows: List[Dict[str, Any]] +) -> RowsDiff: + diff_rows_jsonified = [] + for row in diff_rows: + diff_rows_jsonified.append(_jsonify_diff(row, key_columns)) + + t1_exclusive_rows_jsonified = [] + for row in t1_exclusive_rows: + t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) + + t2_exclusive_rows_jsonified = [] + for row in t2_exclusive_rows: + t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns)) + + return RowsDiff( + exclusive=ExclusiveDiff( + dataset1=t1_exclusive_rows_jsonified, + dataset2=t2_exclusive_rows_jsonified + ), + diff=diff_rows_jsonified, + ) + + def _jsonify_diff(row: Dict[str, Any], key_columns: List[str]) -> Dict[str, JsonDiffRowValue]: columns = collections.defaultdict(dict) for field, value in row.items(): From 440c43b1bc4ee22d118c85015fb390fbb6c6faf6 Mon Sep 17 00:00:00 2001 From: Stefan Keidel <stefan.keidel@lichtblick.de> Date: Mon, 14 Aug 2023 12:18:33 +0200 Subject: [PATCH 6/7] fix: pass key columns Apologies, I only tested the case where --stats is passed. --- data_diff/format.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data_diff/format.py b/data_diff/format.py index 52ee1339..c57f21a9 100644 --- a/data_diff/format.py +++ b/data_diff/format.py @@ -60,7 +60,7 @@ def jsonify( t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema) if not stats_only: - rows = _make_rows_diff(t1_exclusive_rows, t2_exclusive_rows, diff_rows) + rows = _make_rows_diff(t1_exclusive_rows, t2_exclusive_rows, diff_rows, key_columns) summary = None if with_summary: @@ -252,7 +252,8 @@ def _group_rows( def _make_rows_diff( t1_exclusive_rows: List[Dict[str, Any]], t2_exclusive_rows: List[Dict[str, Any]], - diff_rows: List[Dict[str, Any]] + diff_rows: List[Dict[str, Any]], + key_columns: List[str], ) -> RowsDiff: diff_rows_jsonified = [] for row in diff_rows: From f473546086cf7e812552d4eefd52829b0ecad89f Mon Sep 17 00:00:00 2001 From: Stefan Keidel <stefan.keidel@lichtblick.de> Date: Mon, 14 Aug 2023 12:20:05 +0200 Subject: [PATCH 7/7] remove trailing comma --- data_diff/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_diff/format.py b/data_diff/format.py index c57f21a9..bfeb0b1e 100644 --- a/data_diff/format.py +++ b/data_diff/format.py @@ -253,7 +253,7 @@ def _make_rows_diff( t1_exclusive_rows: List[Dict[str, Any]], t2_exclusive_rows: List[Dict[str, Any]], diff_rows: List[Dict[str, Any]], - key_columns: List[str], + key_columns: List[str] ) -> RowsDiff: diff_rows_jsonified = [] for row in diff_rows: