Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Refactor Artifacts Parser to be Native so it's less brittle with each dbt version change #688

Merged
merged 40 commits into from
Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
9a85329
helpful notes for sung
Aug 25, 2023
e25163d
v1 of native run results parser
Aug 28, 2023
13fead8
remove debug comments
Aug 28, 2023
546f991
remove from import
Aug 28, 2023
13ef7bb
Update data_diff/dbt_parser.py
sungchun12 Aug 28, 2023
878be3c
Update data_diff/dbt_parser.py
sungchun12 Aug 28, 2023
71a0333
remove an import
Aug 28, 2023
8e2d9f2
remove another print
Aug 28, 2023
b262646
add schema validation for specific fields
Aug 29, 2023
5375832
stricter validation
Aug 30, 2023
aae3d72
replaced manifest parser with native one
Aug 30, 2023
224028b
Apply suggestions from code review for spacing
sungchun12 Aug 30, 2023
8a30c6e
Apply suggestions from code review for double quotes
sungchun12 Aug 30, 2023
1aeb072
create space
Aug 30, 2023
f60afc6
Apply suggestions from code review for more formatting
sungchun12 Aug 30, 2023
ad8f16b
add more necessary fields
Aug 30, 2023
69c0354
something to think through
Aug 30, 2023
dbafe21
better type hints
Aug 31, 2023
5307a42
remove comment
Aug 31, 2023
4aa40dd
separation of duties
Aug 31, 2023
e0d98fe
remove mock call
Aug 31, 2023
da727a9
draft unit tests
Sep 1, 2023
5fe47e9
first draft of unit tests
Sep 1, 2023
f3c4cb1
passing tests
Sep 1, 2023
01b59d9
more pythonic
Sep 1, 2023
545ef0e
remove nested git repo
Sep 1, 2023
f0a7b01
require name
Sep 1, 2023
5c72749
add strictness
Sep 1, 2023
0bfe69f
black formatting
Sep 5, 2023
a092a93
reduce scope of changes
Sep 6, 2023
ab3b89b
fix imports
Sep 6, 2023
8246f18
update patches
Sep 6, 2023
d62ab3c
fix mocking
Sep 6, 2023
b5370fa
fix test failure
Sep 6, 2023
1d4a34b
fix mock tests
Sep 6, 2023
3b4844b
Merge branch 'master' into refactor/native-manifest-parser
dlawin Sep 12, 2023
bef7a73
remove submodule
Sep 12, 2023
2e682e2
update toml
Sep 12, 2023
385d96e
remove submodule again
Sep 12, 2023
2e938db
add pydantic back in
Sep 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 30 additions & 13 deletions data_diff/dbt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import yaml

from packaging.version import parse as parse_version
import pydantic
from dbt_artifacts_parser.parser import parse_run_results, parse_manifest
from pydantic import BaseModel, Field
from dbt_artifacts_parser.parser import parse_manifest # TODO: remove this import
from dbt.config.renderer import ProfileRenderer

from data_diff.errors import (
Expand Down Expand Up @@ -81,19 +81,31 @@ def legacy_profiles_dir() -> Path:
return Path.home() / ".dbt"


class TDatadiffModelConfig(pydantic.BaseModel):
class TDatadiffModelConfig(BaseModel):
where_filter: Optional[str] = None
include_columns: List[str] = []
exclude_columns: List[str] = []


class TDatadiffConfig(pydantic.BaseModel):
class TDatadiffConfig(BaseModel):
prod_database: Optional[str] = None
prod_schema: Optional[str] = None
prod_custom_schema: Optional[str] = None
datasource_id: Optional[int] = None


class RunResultsJsonConfig(BaseModel):
    """Validation model for the subset of dbt's `run_results.json` read by the parser.

    Only the fields that `get_run_results_models` accesses are declared:
    `metadata.dbt_version` and, for every entry in `results`, `status` and
    `unique_id`.
    """

    class Metadata(BaseModel):
        """The `metadata` object of `run_results.json`."""

        # Must be three dot-separated digit groups, optionally followed
        # directly by an alphanumeric suffix (e.g. "1.6.0" or "1.6.0rc1").
        dbt_version: str = Field(..., regex=r'^\d+\.\d+\.\d+([a-zA-Z0-9]+)?$')

    class Results(BaseModel):
        """One entry of the `results` array of `run_results.json`."""

        # Compared against the literal "success" by the caller.
        status: str
        # Used by the caller as the key into the manifest's `nodes` mapping.
        unique_id: str

    metadata: Metadata
    results: List[Results]


class DbtParser:
def __init__(
self,
Expand All @@ -106,10 +118,10 @@ def __init__(
self.project_dir = Path(project_dir_override or default_project_dir())
self.connection = {}
self.project_dict = self.get_project_dict()
self.dev_manifest_obj = self.get_manifest_obj(self.project_dir / MANIFEST_PATH)
self.dev_manifest_obj = self.get_manifest_obj(self.project_dir / MANIFEST_PATH) # TODO: this is where the manifest object gets called for dev
self.prod_manifest_obj = None
if state:
self.prod_manifest_obj = self.get_manifest_obj(Path(state))
self.prod_manifest_obj = self.get_manifest_obj(Path(state)) # TODO: this is where the manifest object gets called for prod based on a state parameter

self.dbt_user_id = self.dev_manifest_obj.metadata.user_id
self.dbt_version = self.dev_manifest_obj.metadata.dbt_version
Expand Down Expand Up @@ -230,13 +242,16 @@ def get_simple_model_selection(self, dbt_selection: str):

return [model]

def get_run_results_models(self):
# TODO: add pydantic to validate a subset of the run_results.json schema, example: /Users/sung/Desktop/data-diff/data_diff_demo/pydantic_example.py
# TODO: raise an exception that `run_results.json` is malformed based on the pydantic validation
def get_run_results_models(self) -> List[str]:
with open(self.project_dir / RUN_RESULTS_PATH) as run_results:
logger.info(f"Parsing file {RUN_RESULTS_PATH}")
run_results_dict = json.load(run_results)
run_results_obj = parse_run_results(run_results=run_results_dict)

run_results_validated = RunResultsJsonConfig.parse_obj(run_results_dict)

dbt_version = parse_version(run_results_obj.metadata.dbt_version)
dbt_version = parse_version(run_results_validated.metadata.dbt_version)

if dbt_version < parse_version(LOWER_DBT_V):
raise DataDiffDbtRunResultsVersionError(
Expand All @@ -247,8 +262,10 @@ def get_run_results_models(self):
f"{dbt_version} is a recent version of dbt and may not be fully tested with data-diff! \nPlease report any issues to https://github.com/datafold/data-diff/issues"
)

success_models = [x.unique_id for x in run_results_obj.results if x.status.name == "success"]
success_models = [x.unique_id for x in run_results_validated.results if x.status == "success"]

models = [self.dev_manifest_obj.nodes.get(x) for x in success_models]
print(type(models[0])) # TODO this prints a class object type, I'll need to understand what other attributes are accessed before assuming getting a list of strings is enough
if not models:
raise DataDiffDbtNoSuccessfulModelsInRunError(
"Expected > 0 successful models runs from the last dbt command."
Expand All @@ -260,7 +277,7 @@ def get_manifest_obj(self, path: Path):
with open(path) as manifest:
logger.info(f"Parsing file {path}")
manifest_dict = json.load(manifest)
manifest_obj = parse_manifest(manifest=manifest_dict)
manifest_obj = parse_manifest(manifest=manifest_dict) # TODO: replace this
return manifest_obj

def get_project_dict(self):
Expand Down Expand Up @@ -447,9 +464,9 @@ def get_pk_from_model(self, node, unique_columns: dict, pk_tag: str) -> List[str
return []

def get_unique_columns(self) -> Dict[str, Set[str]]:
manifest = self.dev_manifest_obj
manifest = self.dev_manifest_obj #TODO: need to refactor this for dictionary calls
cols_by_uid = defaultdict(set)
for node in manifest.nodes.values():
for node in manifest.nodes.values():#TODO: example: manifest["nodes"].values()
try:
if not (node.resource_type.value == "test" and hasattr(node, "test_metadata")):
continue
Expand Down
1 change: 1 addition & 0 deletions data_diff_demo
Submodule data_diff_demo added at d0784e