Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

rudimentary support for --stats in --dbt --json mode #647

Merged
merged 11 commits into from
Aug 14, 2023
1 change: 1 addition & 0 deletions data_diff/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ def main(conf, run, **kw):
json_output=kw["json_output"],
state=state,
where_flag=kw["where"],
stats_flag=kw["stats"],
columns_flag=kw["columns"],
)
else:
Expand Down
7 changes: 6 additions & 1 deletion data_diff/dbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class TDiffVars(pydantic.BaseModel):
include_columns: List[str]
exclude_columns: List[str]
dbt_model: Optional[str] = None
stats_flag: bool = False


def dbt_diff(
Expand All @@ -73,6 +74,7 @@ def dbt_diff(
state: Optional[str] = None,
log_status_handler: Optional[LogStatusHandler] = None,
where_flag: Optional[str] = None,
stats_flag: bool = False,
columns_flag: Optional[Tuple[str]] = None,
) -> None:
print_version_info()
Expand Down Expand Up @@ -113,7 +115,7 @@ def dbt_diff(
if log_status_handler:
log_status_handler.set_prefix(f"Diffing {model.alias} \n")

diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag, columns_flag)
diff_vars = _get_diff_vars(dbt_parser, config, model, where_flag, stats_flag, columns_flag)

# we won't always have a prod path when using state
# when the model DNE in prod manifest, skip the model diff
Expand Down Expand Up @@ -165,6 +167,7 @@ def _get_diff_vars(
config: TDatadiffConfig,
model,
where_flag: Optional[str] = None,
stats_flag: bool = False,
columns_flag: Optional[Tuple[str]] = None,
) -> TDiffVars:
cli_columns = list(columns_flag) if columns_flag else []
Expand Down Expand Up @@ -200,6 +203,7 @@ def _get_diff_vars(
where_filter=where_flag or datadiff_model_config.where_filter,
include_columns=cli_columns or datadiff_model_config.include_columns,
exclude_columns=[] if cli_columns else datadiff_model_config.exclude_columns,
stats_flag=stats_flag,
)


Expand Down Expand Up @@ -338,6 +342,7 @@ def _local_diff(diff_vars: TDiffVars, json_output: bool = False) -> None:
"removed": columns_removed,
"changed": columns_type_changed,
},
stats_only=diff_vars.stats_flag,
)
),
flush=True,
Expand Down
49 changes: 33 additions & 16 deletions data_diff/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def jsonify(
dataset2_columns: Columns,
columns_diff: Dict[str, List[str]],
with_summary: bool = False,
stats_only: bool = False,
) -> "JsonDiff":
"""
Converts the diff result into a JSON-serializable format.
Expand All @@ -53,21 +54,13 @@ def jsonify(
t1_exclusive_rows = []
t2_exclusive_rows = []
diff_rows = []
rows = None
schema = [field for field, _ in diff_info.diff_schema]

t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema)

diff_rows_jsonified = []
for row in diff_rows:
diff_rows_jsonified.append(_jsonify_diff(row, key_columns))

t1_exclusive_rows_jsonified = []
for row in t1_exclusive_rows:
t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))

t2_exclusive_rows_jsonified = []
for row in t2_exclusive_rows:
t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))
if not stats_only:
rows = _make_rows_diff(t1_exclusive_rows, t2_exclusive_rows, diff_rows, key_columns)

summary = None
if with_summary:
Expand All @@ -87,10 +80,7 @@ def jsonify(
model=dbt_model,
dataset1=list(table1.table_path),
dataset2=list(table2.table_path),
rows=RowsDiff(
exclusive=ExclusiveDiff(dataset1=t1_exclusive_rows_jsonified, dataset2=t2_exclusive_rows_jsonified),
diff=diff_rows_jsonified,
),
rows=rows,
summary=summary,
columns=columns,
).json()
Expand Down Expand Up @@ -228,7 +218,7 @@ class JsonDiff:
model: str
dataset1: List[str]
dataset2: List[str]
rows: RowsDiff
rows: Optional[RowsDiff]
summary: Optional[JsonDiffSummary]
columns: Optional[JsonColumnsSummary]

Expand Down Expand Up @@ -259,6 +249,33 @@ def _group_rows(
return t1_exclusive_rows, t2_exclusive_rows, diff_rows


def _make_rows_diff(
t1_exclusive_rows: List[Dict[str, Any]],
t2_exclusive_rows: List[Dict[str, Any]],
diff_rows: List[Dict[str, Any]],
key_columns: List[str]
) -> RowsDiff:
diff_rows_jsonified = []
for row in diff_rows:
diff_rows_jsonified.append(_jsonify_diff(row, key_columns))

t1_exclusive_rows_jsonified = []
for row in t1_exclusive_rows:
t1_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))

t2_exclusive_rows_jsonified = []
for row in t2_exclusive_rows:
t2_exclusive_rows_jsonified.append(_jsonify_exclusive(row, key_columns))

return RowsDiff(
exclusive=ExclusiveDiff(
dataset1=t1_exclusive_rows_jsonified,
dataset2=t2_exclusive_rows_jsonified
),
diff=diff_rows_jsonified,
)


def _jsonify_diff(row: Dict[str, Any], key_columns: List[str]) -> Dict[str, JsonDiffRowValue]:
columns = collections.defaultdict(dict)
for field, value in row.items():
Expand Down
77 changes: 77 additions & 0 deletions tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,83 @@ def test_jsonify_diff(self):
},
)

def test_jsonify_no_stats(self):
diff = DiffResultWrapper(
info_tree=InfoTree(
info=SegmentInfo(
tables=[
TableSegment(table_path=("db", "schema", "table1"), key_columns=("id",), database=Database()),
TableSegment(table_path=("db", "schema", "table2"), key_columns=("id",), database=Database()),
],
diff_schema=(
("is_exclusive_a", bool),
("is_exclusive_b", bool),
("is_diff_id", int),
("is_diff_value", int),
("id_a", str),
("id_b", str),
("value_a", str),
("value_b", str),
),
diff=[
(False, False, 0, 1, "1", "1", "3", "201"),
(True, False, 1, 1, "2", None, "4", None),
(False, True, 1, 1, None, "3", None, "202"),
],
)
),
diff=[],
stats={},
)
json_diff = jsonify(
diff,
dbt_model="my_model",
dataset1_columns=[
("id", "NUMBER", Integer()),
("value", "NUMBER", Integer()),
],
dataset2_columns=[
("id", "NUMBER", Integer()),
("value", "NUMBER", Integer()),
],
columns_diff={
"added": [],
"removed": [],
"typeChanged": [],
},
stats_only=True
)

self.assertEqual(
json_diff,
{
"version": "1.1.0",
"status": "success",
"result": "different",
"model": "my_model",
"dataset1": ["db", "schema", "table1"],
"dataset2": ["db", "schema", "table2"],
"rows": None,
"columns": {
"dataset1": [
{"name": "id", "type": "NUMBER", "kind": "integer"},
{"name": "value", "type": "NUMBER", "kind": "integer"},
],
"dataset2": [
{"name": "id", "type": "NUMBER", "kind": "integer"},
{"name": "value", "type": "NUMBER", "kind": "integer"},
],
"primaryKey": ["id"],
"exclusive": {
"dataset1": [],
"dataset2": [],
},
"typeChanged": [],
},
"summary": None,
},
)

def test_jsonify_diff_no_difeference(self):
diff = DiffResultWrapper(
info_tree=InfoTree(
Expand Down