
Commit dfa297f

improves shapes of traces

1 parent 0c5df47 commit dfa297f

8 files changed: +197 -41 lines changed

dlt/common/pipeline.py

Lines changed: 97 additions & 24 deletions
@@ -41,12 +41,22 @@
 from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContract
 from dlt.common.source import get_current_pipe_name
 from dlt.common.storages.load_storage import LoadPackageInfo
-from dlt.common.typing import DictStrAny, REPattern, SupportsHumanize
+from dlt.common.typing import DictStrAny, REPattern, StrAny, SupportsHumanize
 from dlt.common.jsonpath import delete_matches, TAnyJsonPath
 from dlt.common.data_writers.writers import DataWriterMetrics, TLoaderFileFormat
 from dlt.common.utils import RowCounts, merge_row_counts


+class _StepInfo(NamedTuple):
+    pipeline: "SupportsPipeline"
+    loads_ids: List[str]
+    """ids of the loaded packages"""
+    load_packages: List[LoadPackageInfo]
+    """Information on loaded packages"""
+    started_at: datetime.datetime
+    first_run: bool
+
+
 class StepInfo(SupportsHumanize):
     pipeline: "SupportsPipeline"
     loads_ids: List[str]
@@ -56,9 +66,34 @@ class StepInfo(SupportsHumanize):
     started_at: datetime.datetime
     first_run: bool

+    def asdict(self) -> DictStrAny:
+        # to be mixed with NamedTuple
+        d: DictStrAny = self._asdict()  # type: ignore
+        d["pipeline"] = {"pipeline_name": self.pipeline.pipeline_name}
+        d["load_packages"] = [package.asdict() for package in self.load_packages]
+        return d
+
     def __str__(self) -> str:
         return self.asstr(verbosity=0)

+    @staticmethod
+    def job_metrics_asdict(
+        job_metrics: Dict[str, DataWriterMetrics], key_name: str = "job_id", extend: StrAny = None
+    ) -> List[DictStrAny]:
+        jobs = []
+        for job_id, metrics in job_metrics.items():
+            d = metrics._asdict()
+            if extend:
+                d.update(extend)
+            d[key_name] = job_id
+            jobs.append(d)
+        return jobs
+
+    def _astuple(self) -> _StepInfo:
+        return _StepInfo(
+            self.pipeline, self.loads_ids, self.load_packages, self.started_at, self.first_run
+        )
+

 class ExtractDataInfo(TypedDict):
     name: str
@@ -82,6 +117,7 @@ class ExtractMetrics(TypedDict):
 class _ExtractInfo(NamedTuple):
     pipeline: "SupportsPipeline"
     metrics: Dict[str, List[ExtractMetrics]]
+    """Metrics per load id. If many sources with the same name were extracted, there will be more than 1 element in the list"""
     extract_data_info: List[ExtractDataInfo]
     loads_ids: List[str]
     """ids of the loaded packages"""
@@ -96,12 +132,46 @@ class ExtractInfo(StepInfo, _ExtractInfo):

     def asdict(self) -> DictStrAny:
         """A dictionary representation of ExtractInfo that can be loaded with `dlt`"""
-        d = self._asdict()
-        d["pipeline"] = {"pipeline_name": self.pipeline.pipeline_name}
-        d["load_packages"] = [package.asdict() for package in self.load_packages]
-        # TODO: transform and leave metrics when we have them implemented
-        # d.pop("metrics")
+        d = super().asdict()
         d.pop("extract_data_info")
+        # transform metrics
+        d.pop("metrics")
+        load_metrics: Dict[str, List[Any]] = {
+            "job_metrics": [],
+            "table_metrics": [],
+            "resource_metrics": [],
+            "dag": [],
+            "hints": [],
+        }
+        for load_id, metrics_list in self.metrics.items():
+            for idx, metrics in enumerate(metrics_list):
+                extend = {"load_id": load_id, "extract_idx": idx}
+                load_metrics["job_metrics"].extend(
+                    self.job_metrics_asdict(metrics["job_metrics"], extend=extend)
+                )
+                load_metrics["table_metrics"].extend(
+                    self.job_metrics_asdict(
+                        metrics["table_metrics"], key_name="table_name", extend=extend
+                    )
+                )
+                load_metrics["resource_metrics"].extend(
+                    self.job_metrics_asdict(
+                        metrics["resource_metrics"], key_name="resource_name", extend=extend
+                    )
+                )
+                load_metrics["dag"].extend(
+                    [
+                        {**extend, "parent_name": edge[0], "resource_name": edge[1]}
+                        for edge in metrics["dag"]
+                    ]
+                )
+                load_metrics["hints"].extend(
+                    [
+                        {**extend, "resource_name": name, **hints}
+                        for name, hints in metrics["hints"].items()
+                    ]
+                )
+        d.update(load_metrics)
         return d

     def asstr(self, verbosity: int = 0) -> str:
@@ -143,19 +213,25 @@ def row_counts(self) -> RowCounts:

     def asdict(self) -> DictStrAny:
         """A dictionary representation of NormalizeInfo that can be loaded with `dlt`"""
-        d = self._asdict()
-        d["pipeline"] = {"pipeline_name": self.pipeline.pipeline_name}
-        d["load_packages"] = [package.asdict() for package in self.load_packages]
-        # list representation creates a nice table
-        d["row_counts"] = []
-        for load_id, metrics in self.metrics.items():
-            assert len(metrics) == 1, "Cannot deal with more than 1 normalize metric per load_id"
-            d["row_counts"].extend(
-                [
-                    {"load_id": load_id, "table_name": k, "count": v.items_count}
-                    for k, v in metrics[0]["table_metrics"].items()
-                ]
-            )
+        d = super().asdict()
+        # transform metrics
+        d.pop("metrics")
+        load_metrics: Dict[str, List[Any]] = {
+            "job_metrics": [],
+            "table_metrics": [],
+        }
+        for load_id, metrics_list in self.metrics.items():
+            for idx, metrics in enumerate(metrics_list):
+                extend = {"load_id": load_id, "extract_idx": idx}
+                load_metrics["job_metrics"].extend(
+                    self.job_metrics_asdict(metrics["job_metrics"], extend=extend)
+                )
+                load_metrics["table_metrics"].extend(
+                    self.job_metrics_asdict(
+                        metrics["table_metrics"], key_name="table_name", extend=extend
+                    )
+                )
+        d.update(load_metrics)
         return d

     def asstr(self, verbosity: int = 0) -> str:
@@ -192,10 +268,7 @@ class LoadInfo(StepInfo, _LoadInfo):

     def asdict(self) -> DictStrAny:
         """A dictionary representation of LoadInfo that can be loaded with `dlt`"""
-        d = self._asdict()
-        d["pipeline"] = {"pipeline_name": self.pipeline.pipeline_name}
-        d["load_packages"] = [package.asdict() for package in self.load_packages]
-        return d
+        return super().asdict()

     def asstr(self, verbosity: int = 0) -> str:
         msg = f"Pipeline {self.pipeline.pipeline_name} completed in "
@@ -273,7 +346,7 @@ def __init__(self) -> None:

     def _step_info_start_load_id(self, load_id: str) -> None:
         self._current_load_id = load_id
-        self._load_id_metrics[load_id] = []
+        self._load_id_metrics.setdefault(load_id, [])

     def _step_info_complete_load_id(self, load_id: str, metrics: TStepMetrics) -> None:
         assert self._current_load_id == load_id, (
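
As a rough illustration of the reshaping that the new StepInfo.asdict / job_metrics_asdict pair performs, here is a minimal, self-contained sketch. FakeMetrics and the sample job id are hypothetical stand-ins that only mimic the _asdict() interface of DataWriterMetrics:

from typing import Dict, List, NamedTuple


class FakeMetrics(NamedTuple):
    # hypothetical stand-in for DataWriterMetrics; only _asdict() is relevant here
    items_count: int
    file_size: int


def job_metrics_asdict(
    job_metrics: Dict[str, FakeMetrics], key_name: str = "job_id", extend: dict = None
) -> List[dict]:
    # same flattening idea as in the commit: one flat row per job,
    # enriched with the job id and any extra keys (load_id, extract_idx, ...)
    jobs = []
    for job_id, metrics in job_metrics.items():
        d = metrics._asdict()
        if extend:
            d.update(extend)
        d[key_name] = job_id
        jobs.append(d)
    return jobs


rows = job_metrics_asdict(
    {"file_1.jsonl": FakeMetrics(items_count=100, file_size=2048)},
    extend={"load_id": "1700000000.0", "extract_idx": 0},
)
# rows == [{"items_count": 100, "file_size": 2048, "load_id": "1700000000.0",
#           "extract_idx": 0, "job_id": "file_1.jsonl"}]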

dlt/common/storages/data_item_storage.py

Lines changed: 7 additions & 0 deletions
@@ -76,13 +76,20 @@ def close_writers(self, load_id: str) -> None:
                 writer.close()

     def closed_files(self, load_id: str) -> List[DataWriterMetrics]:
+        """Return metrics for all fully processed (closed) files"""
         files: List[DataWriterMetrics] = []
         for name, writer in self.buffered_writers.items():
             if name.startswith(load_id):
                 files.extend(writer.closed_files)

         return files

+    def remove_closed_files(self, load_id: str) -> None:
+        """Remove metrics for closed files in a given `load_id`"""
+        for name, writer in self.buffered_writers.items():
+            if name.startswith(load_id):
+                writer.closed_files.clear()
+
     def _write_temp_job_file(
         self,
         load_id: str,
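
A small sketch of the bookkeeping this adds, with a hypothetical FakeWriter standing in for the buffered writers: closed_files() collects per-writer metrics for a load id, and remove_closed_files() clears them so a later run for the same load id does not report the previous run's files again:

from typing import Dict, List


class FakeWriter:
    # hypothetical stand-in for a buffered writer; real writers track DataWriterMetrics here
    def __init__(self) -> None:
        self.closed_files: List[str] = []


buffered_writers: Dict[str, FakeWriter] = {"1700000000.0.items": FakeWriter()}


def closed_files(load_id: str) -> List[str]:
    files: List[str] = []
    for name, writer in buffered_writers.items():
        if name.startswith(load_id):
            files.extend(writer.closed_files)
    return files


def remove_closed_files(load_id: str) -> None:
    for name, writer in buffered_writers.items():
        if name.startswith(load_id):
            writer.closed_files.clear()


buffered_writers["1700000000.0.items"].closed_files.append("file_1.jsonl")
assert closed_files("1700000000.0") == ["file_1.jsonl"]
remove_closed_files("1700000000.0")
# the next extract run for the same load id starts with a clean slate
assert closed_files("1700000000.0") == []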

dlt/destinations/impl/filesystem/filesystem.py

Lines changed: 3 additions & 3 deletions
@@ -145,15 +145,15 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:
             logger.info(f"Will truncate tables in {truncate_dir}")
             try:
                 all_files = self.fs_client.ls(truncate_dir, detail=False, refresh=True)
-                # logger.info(f"Found {len(all_files)} CANDIDATE files in {truncate_dir}")
-                # print(f"in truncate dir {truncate_dir}: {all_files}")
+                logger.info(f"Found {len(all_files)} CANDIDATE files in {truncate_dir}")
+                print(f"in truncate dir {truncate_dir}: {all_files}")
                 for item in all_files:
                     # check every file against all the prefixes
                     for search_prefix in truncate_prefixes:
                         if item.startswith(search_prefix):
                             # NOTE: deleting in chunks on s3 does not raise on access denied, file non existing and probably other errors
                             # logger.info(f"DEL {item}")
-                            # print(f"DEL {item}")
+                            print(f"DEL {item}")
                             self.fs_client.rm(item)
             except FileNotFoundError:
                 logger.info(

dlt/extract/extract.py

Lines changed: 7 additions & 3 deletions
@@ -217,9 +217,10 @@ def _compute_metrics(self, load_id: str, source: DltSource) -> ExtractMetrics:
                     hints[name] = get_callable_name(hint)
                     continue
                 if name == "columns":
-                    hints[name] = yaml.dump(
-                        hint, allow_unicode=True, default_flow_style=False, sort_keys=False
-                    )
+                    if hint:
+                        hints[name] = yaml.dump(
+                            hint, allow_unicode=True, default_flow_style=False, sort_keys=False
+                        )
                     continue
                 hints[name] = hint

@@ -307,6 +308,9 @@ def _extract_single_source(
         self.extract_storage.close_writers(load_id)
         # gather metrics
         self._step_info_complete_load_id(load_id, self._compute_metrics(load_id, source))
+        # remove the metrics of files processed in this extract run
+        # NOTE: there may be more than one extract run per load id: ie. the resource and then dlt state
+        self.extract_storage.remove_closed_files(load_id)

     def extract(
         self,
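
For context on the new `if hint:` guard in _compute_metrics, a quick sketch (assuming PyYAML, which the surrounding code already uses as `yaml`) of what an empty `columns` hint would otherwise contribute to the hints metrics; the column definition below is a made-up example:

import yaml

# a falsy hint (empty dict) serializes to the literal string "{}\n" - pure noise in the trace
assert yaml.dump({}, allow_unicode=True, default_flow_style=False, sort_keys=False) == "{}\n"

# a non-empty columns hint renders as a readable block, which is what the hints metrics keep
columns = {"id": {"data_type": "bigint", "nullable": False}}
print(yaml.dump(columns, allow_unicode=True, default_flow_style=False, sort_keys=False))
# id:
#   data_type: bigint
#   nullable: false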

dlt/extract/storage.py

Lines changed: 4 additions & 0 deletions
@@ -94,6 +94,10 @@ def closed_files(self, load_id: str) -> List[DataWriterMetrics]:
             files.extend(storage.closed_files(load_id))
         return files

+    def remove_closed_files(self, load_id: str) -> None:
+        for storage in self._item_storages.values():
+            storage.remove_closed_files(load_id)
+
     def commit_new_load_package(self, load_id: str, schema: Schema) -> None:
         self.new_packages.save_schema(load_id, schema)
         self.storage.rename_tree(

dlt/pipeline/state_sync.py

Lines changed: 0 additions & 1 deletion
@@ -106,7 +106,6 @@ def load_state_from_destination(pipeline_name: str, client: WithStateSync) -> TP
     if not state:
         return None
     s = decompress_state(state.state)
-    print(f"BEFORE M {s}")
     return migrate_state(pipeline_name, s, s["_state_engine_version"], STATE_ENGINE_VERSION)

dlt/pipeline/trace.py

Lines changed: 7 additions & 2 deletions
@@ -98,7 +98,11 @@ def asdict(self) -> DictStrAny:
         d = self._asdict()
         if self.step_info:
             # name property depending on step name - generates nicer data
-            d[f"{self.step}_info"] = d.pop("step_info")
+            d[f"{self.step}_info"] = step_info_dict = d.pop("step_info").asdict()
+            d["step_info"] = {}
+            # take only the base keys
+            for prop in self.step_info._astuple()._asdict():
+                d["step_info"][prop] = step_info_dict.pop(prop)
         # replace the attributes in exception traces with json dumps
         if self.exception_traces:
             # do not modify original traces
@@ -161,7 +165,8 @@ def last_pipeline_step_trace(self, step_name: TPipelineStep) -> PipelineStepTrac
     def asdict(self) -> DictStrAny:
         """A dictionary representation of PipelineTrace that can be loaded with `dlt`"""
         d = self._asdict()
-        d["steps"] = [step.asdict() for step in self.steps]
+        # run step is the same as load step
+        d["steps"] = [step.asdict() for step in self.steps]  # if step.step != "run"
         return d

     @property
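
A rough sketch of the step-trace shape this change produces, with hypothetical values: the full step-specific dictionary (including the metrics flattened in pipeline.py) moves under an f"{step}_info" key, while step_info keeps only the base keys defined by the new _StepInfo tuple:

# hypothetical extract step info, already converted with asdict()
full_info = {
    "pipeline": {"pipeline_name": "my_pipeline"},
    "loads_ids": ["1700000000.0"],
    "load_packages": [],
    "started_at": "2023-11-14T00:00:00+00:00",
    "first_run": True,
    "job_metrics": [{"job_id": "file_1.jsonl", "load_id": "1700000000.0", "extract_idx": 0}],
}
# the base keys are exactly the fields of the new _StepInfo NamedTuple
base_keys = ["pipeline", "loads_ids", "load_packages", "started_at", "first_run"]

d = {"step": "extract"}
d["extract_info"] = step_info_dict = dict(full_info)
d["step_info"] = {prop: step_info_dict.pop(prop) for prop in base_keys}
# d["step_info"] now holds only the generic keys shared by all steps;
# d["extract_info"] keeps the step-specific metrics tables (job_metrics, table_metrics, ...)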
