From a75c2c980e4167b4ca38032b4670552c13aaaa4e Mon Sep 17 00:00:00 2001
From: Stefan Keidel <stefan.keidel@lichtblick.de>
Date: Thu, 13 Jul 2023 13:13:04 +0200
Subject: [PATCH 1/7] feat: add --stats flag to --dbt

---
 data_diff/__main__.py | 1 +
 data_diff/dbt.py      | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/data_diff/__main__.py b/data_diff/__main__.py
index dfd12577..3feae22d 100644
--- a/data_diff/__main__.py
+++ b/data_diff/__main__.py
@@ -317,6 +317,7 @@ def main(conf, run, **kw):
                 json_output=kw["json_output"],
                 state=state,
                 where_flag=kw["where"],
+                stats_flag=kw["stats"],
             )
         else:
             return _data_diff(
diff --git a/data_diff/dbt.py b/data_diff/dbt.py
index 212c0fc8..6431c48e 100644
--- a/data_diff/dbt.py
+++ b/data_diff/dbt.py
@@ -60,6 +60,7 @@ class TDiffVars(pydantic.BaseModel):
     include_columns: List[str]
     exclude_columns: List[str]
     dbt_model: Optional[str] = None
+    stats_flag: bool = False
 
 
 def dbt_diff(
@@ -71,6 +72,7 @@ def dbt_diff(
     state: Optional[str] = None,
     log_status_handler: Optional[LogStatusHandler] = None,
     where_flag: Optional[str] = None,
+    stats_flag: bool = False,
 ) -> None:
     print_version_info()
     diff_threads = []
@@ -110,7 +112,7 @@ def dbt_diff(
             if log_status_handler:
                 log_status_handler.set_prefix(f"Diffing {model.alias} \n")
 
-            diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag)
+            diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag, stats_flag)
 
             # we won't always have a prod path when using state
             # when the model DNE in prod manifest, skip the model diff
@@ -160,6 +162,7 @@ def _get_diff_vars(
     config: TDatadiffConfig,
     model,
     where_flag: Optional[str] = None,
+    stats_flag: bool = False,
 ) -> TDiffVars:
     dev_database = model.database
     dev_schema = model.schema_
@@ -193,6 +196,7 @@ def _get_diff_vars(
         where_filter=where_flag or datadiff_model_config.where_filter,
         include_columns=datadiff_model_config.include_columns,
         exclude_columns=datadiff_model_config.exclude_columns,
+        stats_flag=stats_flag,
     )
 
 

From 11200b572df9b3f4c384145c1d2d953613ad0bf2 Mon Sep 17 00:00:00 2001
From: Stefan Keidel <stefan.keidel@lichtblick.de>
Date: Fri, 14 Jul 2023 13:30:11 +0200
Subject: [PATCH 2/7] feat: add --stats support for --json in --dbt mode

---
 data_diff/__main__.py |  1 +
 data_diff/dbt.py      |  1 +
 data_diff/format.py   | 36 ++++++++++++++++++++++--------------
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/data_diff/__main__.py b/data_diff/__main__.py
index 3feae22d..4047483d 100644
--- a/data_diff/__main__.py
+++ b/data_diff/__main__.py
@@ -1,4 +1,5 @@
 from copy import deepcopy
+from pudb import set_trace; set_trace(paused=False)
 from datetime import datetime
 import os
 import sys
diff --git a/data_diff/dbt.py b/data_diff/dbt.py
index 6431c48e..149080c0 100644
--- a/data_diff/dbt.py
+++ b/data_diff/dbt.py
@@ -323,6 +323,7 @@ def _local_diff(diff_vars: TDiffVars, json_output: bool = False) -> None:
                         "removed": columns_removed,
                         "changed": columns_type_changed,
                     },
+                    stats_only=diff_vars.stats_flag,
                 )
             ),
             flush=True,
diff --git a/data_diff/format.py b/data_diff/format.py
index 5a16f274..aa4e8d0f 100644
--- a/data_diff/format.py
+++ b/data_diff/format.py
@@ -20,6 +20,7 @@ def jsonify(
     dbt_model: str,
     with_summary: bool = False,
     with_columns: Optional[Dict[str, List[str]]] = None,
+    stats_only: bool = False,
 ) -> "JsonDiff":
     """
     Converts the diff result into a JSON-serializable format.
@@ -33,21 +34,31 @@ def jsonify(
     t1_exclusive_rows = []
     t2_exclusive_rows = []
     diff_rows = []
+    rows = None
     schema = [field for field, _ in diff_info.diff_schema]
 
     t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema)
 
-    diff_rows_jsonified = []
-    for row in diff_rows:
-        diff_rows_jsonified.append(_jsonify_diff(row, key_columns))
+    if not stats_only:
+        diff_rows_jsonified = []
+        for row in diff_rows:
+            diff_rows_jsonified.append(_jsonify_diff(row, key_columns))
 
-    t1_exclusive_rows_jsonified = []
-    for row in t1_exclusive_rows:
-        t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
+        t1_exclusive_rows_jsonified = []
+        for row in t1_exclusive_rows:
+            t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
 
-    t2_exclusive_rows_jsonified = []
-    for row in t2_exclusive_rows:
-        t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
+        t2_exclusive_rows_jsonified = []
+        for row in t2_exclusive_rows:
+            t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
+
+        rows = RowsDiff(
+            exclusive=ExclusiveDiff(
+                dataset1=t1_exclusive_rows_jsonified,
+                dataset2=t2_exclusive_rows_jsonified
+            ),
+            diff=diff_rows_jsonified,
+        )
 
     summary = None
     if with_summary:
@@ -70,10 +81,7 @@ def jsonify(
         model=dbt_model,
         dataset1=list(table1.table_path),
         dataset2=list(table2.table_path),
-        rows=RowsDiff(
-            exclusive=ExclusiveDiff(dataset1=t1_exclusive_rows_jsonified, dataset2=t2_exclusive_rows_jsonified),
-            diff=diff_rows_jsonified,
-        ),
+        rows=rows,
         summary=summary,
         columns=columns,
     ).json()
@@ -175,7 +183,7 @@ class JsonDiff:
     model: str
     dataset1: List[str]
     dataset2: List[str]
-    rows: RowsDiff
+    rows: Optional[RowsDiff]
     summary: Optional[JsonDiffSummary]
     columns: Optional[JsonColumnsSummary]
 

From 72b1ea117580377154f47cb61b0f84b92860dab9 Mon Sep 17 00:00:00 2001
From: Stefan Keidel <stefan.keidel@lichtblick.de>
Date: Fri, 14 Jul 2023 13:34:36 +0200
Subject: [PATCH 3/7] fix: remove debugger import

---
 data_diff/__main__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/data_diff/__main__.py b/data_diff/__main__.py
index 4047483d..3feae22d 100644
--- a/data_diff/__main__.py
+++ b/data_diff/__main__.py
@@ -1,5 +1,4 @@
 from copy import deepcopy
-from pudb import set_trace; set_trace(paused=False)
 from datetime import datetime
 import os
 import sys

From 63fff1c092ba65d3fd0d09c674da4f9877767a12 Mon Sep 17 00:00:00 2001
From: Dan <daniel@datafold.com>
Date: Fri, 11 Aug 2023 14:17:55 -0600
Subject: [PATCH 4/7] add test case for --json --stats

---
 tests/test_format.py | 77 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/tests/test_format.py b/tests/test_format.py
index 5de8feec..0aa8ee8e 100644
--- a/tests/test_format.py
+++ b/tests/test_format.py
@@ -95,6 +95,83 @@ def test_jsonify_diff(self):
             },
         )
 
+    def test_jsonify_no_stats(self):
+        diff = DiffResultWrapper(
+            info_tree=InfoTree(
+                info=SegmentInfo(
+                    tables=[
+                        TableSegment(table_path=("db", "schema", "table1"), key_columns=("id",), database=Database()),
+                        TableSegment(table_path=("db", "schema", "table2"), key_columns=("id",), database=Database()),
+                    ],
+                    diff_schema=(
+                        ("is_exclusive_a", bool),
+                        ("is_exclusive_b", bool),
+                        ("is_diff_id", int),
+                        ("is_diff_value", int),
+                        ("id_a", str),
+                        ("id_b", str),
+                        ("value_a", str),
+                        ("value_b", str),
+                    ),
+                    diff=[
+                        (False, False, 0, 1, "1", "1", "3", "201"),
+                        (True, False, 1, 1, "2", None, "4", None),
+                        (False, True, 1, 1, None, "3", None, "202"),
+                    ],
+                )
+            ),
+            diff=[],
+            stats={},
+        )
+        json_diff = jsonify(
+            diff,
+            dbt_model="my_model",
+            dataset1_columns=[
+                ("id", "NUMBER", Integer()),
+                ("value", "NUMBER", Integer()),
+            ],
+            dataset2_columns=[
+                ("id", "NUMBER", Integer()),
+                ("value", "NUMBER", Integer()),
+            ],
+            columns_diff={
+                "added": [],
+                "removed": [],
+                "typeChanged": [],
+            },
+            stats_only=True
+        )
+
+        self.assertEqual(
+            json_diff,
+            {
+                "version": "1.1.0",
+                "status": "success",
+                "result": "different",
+                "model": "my_model",
+                "dataset1": ["db", "schema", "table1"],
+                "dataset2": ["db", "schema", "table2"],
+                "rows": None,
+                "columns": {
+                    "dataset1": [
+                        {"name": "id", "type": "NUMBER", "kind": "integer"},
+                        {"name": "value", "type": "NUMBER", "kind": "integer"},
+                    ],
+                    "dataset2": [
+                        {"name": "id", "type": "NUMBER", "kind": "integer"},
+                        {"name": "value", "type": "NUMBER", "kind": "integer"},
+                    ],
+                    "primaryKey": ["id"],
+                    "exclusive": {
+                        "dataset1": [],
+                        "dataset2": [],
+                    },
+                    "typeChanged": [],
+                },
+                "summary": None,
+            },
+        )
+
     def test_jsonify_diff_no_difeference(self):
         diff = DiffResultWrapper(
             info_tree=InfoTree(

From ccdf54b0bf553b4c5223e053deb62fc2eb172838 Mon Sep 17 00:00:00 2001
From: Stefan Keidel <stefan.keidel@lichtblick.de>
Date: Mon, 14 Aug 2023 07:37:57 +0200
Subject: [PATCH 5/7] Lift rows diff into separate function

---
 data_diff/format.py | 46 ++++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/data_diff/format.py b/data_diff/format.py
index 9541cc6c..52ee1339 100644
--- a/data_diff/format.py
+++ b/data_diff/format.py
@@ -60,25 +60,7 @@ def jsonify(
     t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema)
 
     if not stats_only:
-        diff_rows_jsonified = []
-        for row in diff_rows:
-            diff_rows_jsonified.append(_jsonify_diff(row, key_columns))
-
-        t1_exclusive_rows_jsonified = []
-        for row in t1_exclusive_rows:
-            t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
-
-        t2_exclusive_rows_jsonified = []
-        for row in t2_exclusive_rows:
-            t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
-
-        rows = RowsDiff(
-            exclusive=ExclusiveDiff(
-                dataset1=t1_exclusive_rows_jsonified,
-                dataset2=t2_exclusive_rows_jsonified
-            ),
-            diff=diff_rows_jsonified,
-        )
+        rows = _make_rows_diff(t1_exclusive_rows, t2_exclusive_rows, diff_rows)
 
     summary = None
     if with_summary:
@@ -267,6 +249,32 @@ def _group_rows(
     return t1_exclusive_rows, t2_exclusive_rows, diff_rows
 
 
+def _make_rows_diff(
+    t1_exclusive_rows: List[Dict[str, Any]],
+    t2_exclusive_rows: List[Dict[str, Any]],
+    diff_rows: List[Dict[str, Any]]
+) -> RowsDiff:
+    diff_rows_jsonified = []
+    for row in diff_rows:
+        diff_rows_jsonified.append(_jsonify_diff(row, key_columns))
+
+    t1_exclusive_rows_jsonified = []
+    for row in t1_exclusive_rows:
+        t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
+
+    t2_exclusive_rows_jsonified = []
+    for row in t2_exclusive_rows:
+        t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
+
+    return RowsDiff(
+        exclusive=ExclusiveDiff(
+            dataset1=t1_exclusive_rows_jsonified,
+            dataset2=t2_exclusive_rows_jsonified
+        ),
+        diff=diff_rows_jsonified,
+    )
+
+
 def _jsonify_diff(row: Dict[str, Any], key_columns: List[str]) -> Dict[str, JsonDiffRowValue]:
     columns = collections.defaultdict(dict)
     for field, value in row.items():

From 440c43b1bc4ee22d118c85015fb390fbb6c6faf6 Mon Sep 17 00:00:00 2001
From: Stefan Keidel <stefan.keidel@lichtblick.de>
Date: Mon, 14 Aug 2023 12:18:33 +0200
Subject: [PATCH 6/7] fix: pass key columns

Apologies, I only tested the case where --stats is passed.
---
 data_diff/format.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/data_diff/format.py b/data_diff/format.py
index 52ee1339..c57f21a9 100644
--- a/data_diff/format.py
+++ b/data_diff/format.py
@@ -60,7 +60,7 @@ def jsonify(
     t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema)
 
     if not stats_only:
-        rows = _make_rows_diff(t1_exclusive_rows, t2_exclusive_rows, diff_rows)
+        rows = _make_rows_diff(t1_exclusive_rows, t2_exclusive_rows, diff_rows, key_columns)
 
     summary = None
     if with_summary:
@@ -252,7 +252,8 @@ def _group_rows(
 def _make_rows_diff(
     t1_exclusive_rows: List[Dict[str, Any]],
     t2_exclusive_rows: List[Dict[str, Any]],
-    diff_rows: List[Dict[str, Any]]
+    diff_rows: List[Dict[str, Any]],
+    key_columns: List[str],
 ) -> RowsDiff:
     diff_rows_jsonified = []
     for row in diff_rows:

From f473546086cf7e812552d4eefd52829b0ecad89f Mon Sep 17 00:00:00 2001
From: Stefan Keidel <stefan.keidel@lichtblick.de>
Date: Mon, 14 Aug 2023 12:20:05 +0200
Subject: [PATCH 7/7] remove trailing comma

---
 data_diff/format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_diff/format.py b/data_diff/format.py
index c57f21a9..bfeb0b1e 100644
--- a/data_diff/format.py
+++ b/data_diff/format.py
@@ -253,7 +253,7 @@ def _make_rows_diff(
     t1_exclusive_rows: List[Dict[str, Any]],
     t2_exclusive_rows: List[Dict[str, Any]],
     diff_rows: List[Dict[str, Any]],
-    key_columns: List[str],
+    key_columns: List[str]
 ) -> RowsDiff:
     diff_rows_jsonified = []
     for row in diff_rows: