Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Refactor Artifacts Parser to be Native so it's less brittle with each dbt version change #688

Merged
merged 40 commits into from
Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
9a85329
helpful notes for sung
Aug 25, 2023
e25163d
v1 of native run results parser
Aug 28, 2023
13fead8
remove debug comments
Aug 28, 2023
546f991
remove from import
Aug 28, 2023
13ef7bb
Update data_diff/dbt_parser.py
sungchun12 Aug 28, 2023
878be3c
Update data_diff/dbt_parser.py
sungchun12 Aug 28, 2023
71a0333
remove an import
Aug 28, 2023
8e2d9f2
remove another print
Aug 28, 2023
b262646
add schema validation for specific fields
Aug 29, 2023
5375832
stricter validation
Aug 30, 2023
aae3d72
replaced manifest parser with native one
Aug 30, 2023
224028b
Apply suggestions from code review for spacing
sungchun12 Aug 30, 2023
8a30c6e
Apply suggestions from code review for double quotes
sungchun12 Aug 30, 2023
1aeb072
create space
Aug 30, 2023
f60afc6
Apply suggestions from code review for more formatting
sungchun12 Aug 30, 2023
ad8f16b
add more necessary fields
Aug 30, 2023
69c0354
something to think through
Aug 30, 2023
dbafe21
better type hints
Aug 31, 2023
5307a42
remove comment
Aug 31, 2023
4aa40dd
separation of duties
Aug 31, 2023
e0d98fe
remove mock call
Aug 31, 2023
da727a9
draft unit tests
Sep 1, 2023
5fe47e9
first draft of unit tests
Sep 1, 2023
f3c4cb1
passing tests
Sep 1, 2023
01b59d9
more pythonic
Sep 1, 2023
545ef0e
remove nested git repo
Sep 1, 2023
f0a7b01
require name
Sep 1, 2023
5c72749
add strictness
Sep 1, 2023
0bfe69f
black formatting
Sep 5, 2023
a092a93
reduce scope of changes
Sep 6, 2023
ab3b89b
fix imports
Sep 6, 2023
8246f18
update patches
Sep 6, 2023
d62ab3c
fix mocking
Sep 6, 2023
b5370fa
fix test failure
Sep 6, 2023
1d4a34b
fix mock tests
Sep 6, 2023
3b4844b
Merge branch 'master' into refactor/native-manifest-parser
dlawin Sep 12, 2023
bef7a73
remove submodule
Sep 12, 2023
2e682e2
update toml
Sep 12, 2023
385d96e
remove submodule again
Sep 12, 2023
2e938db
add pydantic back in
Sep 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions data_diff/dbt_config_validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from enum import Enum
from typing import List, Dict, Optional, Any
from pydantic import BaseModel, Field


class ManifestJsonConfig(BaseModel):
    """Pydantic schema for the manifest fields declared in this model.

    Top-level fields:
        metadata: A ``Metadata`` block (dbt version, project id, user id).
        nodes: Mapping from string keys to ``Nodes`` entries.
    """

    class Metadata(BaseModel):
        """Manifest-level metadata."""

        # Accepts "<digits>.<digits>.<digits>" optionally followed directly by
        # an alphanumeric suffix (e.g. "1.6.0" or "1.6.0rc1").
        dbt_version: str = Field(default=..., regex=r"^\d+\.\d+\.\d+([a-zA-Z0-9]+)?$")
        project_id: str
        user_id: str

    class Nodes(BaseModel):
        """A single entry of the ``nodes`` mapping."""

        # NOTE(review): pydantic v1 reserves an inner class named ``Config``
        # for model settings; here the name is used for a nested data model.
        # Confirm this behaves as intended before renaming or extending it.
        class Config(BaseModel):
            """The ``config`` sub-object of a node."""

            database: Optional[str]
            # Populated from the "schema" key. The key has no default, but its
            # value is typed as optional.
            schema_: Optional[str] = Field(default=..., alias="schema")
            tags: List[str]

        class Column(BaseModel):
            """One value of a node's ``columns`` mapping."""

            meta: Dict[str, Any]
            tags: List[str]

        class TestMetadata(BaseModel):
            """The ``test_metadata`` sub-object of a node."""

            name: str
            kwargs: Dict[str, Any]

        class DependsOn(BaseModel):
            """The ``depends_on`` sub-object; both lists default to empty."""

            macros: List[str] = []
            nodes: List[str] = []

        unique_id: str
        resource_type: str
        name: str
        alias: str
        database: str
        # Populated from the "schema" key.
        schema_: str = Field(default=..., alias="schema")
        columns: Optional[Dict[str, Column]]
        meta: Dict[str, Any]
        config: Config
        tags: List[str]
        test_metadata: Optional[TestMetadata]
        depends_on: DependsOn

    metadata: Metadata
    nodes: Dict[str, Nodes]


class RunResultsJsonConfig(BaseModel):
    """Pydantic schema for the run-results fields declared in this model.

    Top-level fields:
        metadata: A ``Metadata`` block carrying the dbt version string.
        results: List of ``Results`` entries, each with a status and a
            unique id.
    """

    class Metadata(BaseModel):
        """Run-level metadata."""

        # Accepts "<digits>.<digits>.<digits>" optionally followed directly by
        # an alphanumeric suffix (e.g. "1.6.0" or "1.6.0rc1").
        dbt_version: str = Field(..., regex=r"^\d+\.\d+\.\d+([a-zA-Z0-9]+)?$")

    class Results(BaseModel):
        """One entry of the ``results`` list."""

        class Status(Enum):
            """Status strings accepted for a result entry."""

            success = "success"
            error = "error"
            skipped = "skipped"
            # Trailing underscore because ``pass`` is a Python keyword.
            pass_ = "pass"
            fail = "fail"
            warn = "warn"
            # The value contains a space, unlike the member name.
            runtime_error = "runtime error"

        status: Status
        # Required field: the bare Ellipsis marks it as mandatory. A quoted
        # "..." would instead be a literal default, so an entry without a
        # unique_id would validate with the placeholder id "...".
        unique_id: str = Field(...)

    metadata: Metadata
    results: List[Results]
26 changes: 12 additions & 14 deletions data_diff/dbt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
from pathlib import Path
from typing import List, Dict, Tuple, Set, Optional
import yaml
from pydantic import BaseModel

from packaging.version import parse as parse_version
import pydantic
from dbt_artifacts_parser.parser import parse_run_results, parse_manifest
from dbt.config.renderer import ProfileRenderer
from .dbt_config_validators import ManifestJsonConfig, RunResultsJsonConfig

from data_diff.errors import (
DataDiffDbtBigQueryUnsupportedMethodError,
Expand Down Expand Up @@ -81,13 +81,13 @@ def legacy_profiles_dir() -> Path:
return Path.home() / ".dbt"


class TDatadiffModelConfig(pydantic.BaseModel):
class TDatadiffModelConfig(BaseModel):
where_filter: Optional[str] = None
include_columns: List[str] = []
exclude_columns: List[str] = []


class TDatadiffConfig(pydantic.BaseModel):
class TDatadiffConfig(BaseModel):
prod_database: Optional[str] = None
prod_schema: Optional[str] = None
prod_custom_schema: Optional[str] = None
Expand Down Expand Up @@ -213,7 +213,6 @@ def get_dbt_selection_models(self, dbt_selection: str) -> List[str]:

def get_simple_model_selection(self, dbt_selection: str):
model_nodes = dict(filter(lambda item: item[0].startswith("model."), self.dev_manifest_obj.nodes.items()))

model_unique_key_list = [k for k, v in model_nodes.items() if v.name == dbt_selection]

# name *should* always be unique, but just in case:
Expand All @@ -230,13 +229,13 @@ def get_simple_model_selection(self, dbt_selection: str):

return [model]

def get_run_results_models(self):
def get_run_results_models(self) -> List[ManifestJsonConfig.Nodes]:
with open(self.project_dir / RUN_RESULTS_PATH) as run_results:
logger.info(f"Parsing file {RUN_RESULTS_PATH}")
run_results_dict = json.load(run_results)
run_results_obj = parse_run_results(run_results=run_results_dict)
run_results_validated = RunResultsJsonConfig.parse_obj(run_results_dict)

dbt_version = parse_version(run_results_obj.metadata.dbt_version)
dbt_version = parse_version(run_results_validated.metadata.dbt_version)

if dbt_version < parse_version(LOWER_DBT_V):
raise DataDiffDbtRunResultsVersionError(
Expand All @@ -247,7 +246,8 @@ def get_run_results_models(self):
f"{dbt_version} is a recent version of dbt and may not be fully tested with data-diff! \nPlease report any issues to https://github.com/datafold/data-diff/issues"
)

success_models = [x.unique_id for x in run_results_obj.results if x.status.name == "success"]
success_models = [x.unique_id for x in run_results_validated.results if x.status == x.Status.success]

models = [self.dev_manifest_obj.nodes.get(x) for x in success_models]
if not models:
raise DataDiffDbtNoSuccessfulModelsInRunError(
Expand All @@ -256,11 +256,11 @@ def get_run_results_models(self):

return models

def get_manifest_obj(self, path: Path):
def get_manifest_obj(self, path: Path) -> ManifestJsonConfig:
with open(path) as manifest:
logger.info(f"Parsing file {path}")
manifest_dict = json.load(manifest)
manifest_obj = parse_manifest(manifest=manifest_dict)
manifest_obj = ManifestJsonConfig.parse_obj(manifest_dict)
return manifest_obj

def get_project_dict(self):
Expand Down Expand Up @@ -433,7 +433,6 @@ def get_pk_from_model(self, node, unique_columns: dict, pk_tag: str) -> List[str
if from_tags:
logger.debug("Found PKs via Tags: " + str(from_tags))
return from_tags

if node.unique_id in unique_columns:
from_uniq = unique_columns.get(node.unique_id)
if from_uniq is not None:
Expand All @@ -451,7 +450,7 @@ def get_unique_columns(self) -> Dict[str, Set[str]]:
cols_by_uid = defaultdict(set)
for node in manifest.nodes.values():
try:
if not (node.resource_type.value == "test" and hasattr(node, "test_metadata")):
if not (node.resource_type == "test" and hasattr(node, "test_metadata")):
continue

if not node.depends_on or not node.depends_on.nodes:
Expand All @@ -465,7 +464,6 @@ def get_unique_columns(self) -> Dict[str, Set[str]]:
continue

model_node = manifest.nodes[uid]

if node.test_metadata.name == "unique":
column_name: str = node.test_metadata.kwargs["column_name"]
for col in self._parse_concat_pk_definition(column_name):
Expand Down
1 change: 1 addition & 0 deletions datafold-demo-sung
Submodule datafold-demo-sung added at 0f8322
Loading