Commit 7e6d35a

improved test: the no-boundary-dedup edge case now resets unique hashes
1 parent 4441162

2 files changed: +26, -19 lines


dlt/extract/incremental/__init__.py

Lines changed: 6 additions & 4 deletions
@@ -618,10 +618,6 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]:
         # writing back state
         self._cached_state["last_value"] = transformer.last_value
 
-        initial_hash_list = self._cached_state.get("unique_hashes")
-        initial_hash_count = len(initial_hash_list) if initial_hash_list else 0
-        self.custom_metrics["initial_unique_hashes_count"] = initial_hash_count
-
         if transformer.boundary_deduplication:
             # compute hashes for new last rows
             # NOTE: object transform uses last_rows to pass rows to dedup, arrow computes
@@ -630,13 +626,19 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]:
                 transformer.compute_unique_value(row, self.primary_key)
                 for row in transformer.last_rows
             )
+            initial_hash_list = self._cached_state.get("unique_hashes")
+            initial_hash_count = len(initial_hash_list) if initial_hash_list else 0
+            self.custom_metrics["initial_unique_hashes_count"] = initial_hash_count
+
             # add directly computed hashes
             unique_hashes.update(transformer.unique_hashes)
             self._cached_state["unique_hashes"] = list(unique_hashes)
             final_hash_count = len(self._cached_state["unique_hashes"])
             self.custom_metrics["final_unique_hashes_count"] = final_hash_count
 
             self._check_duplicate_cursor_threshold(initial_hash_count, final_hash_count)
+        else:
+            self._cached_state["unique_hashes"] = []
         return rows
 
     def _check_duplicate_cursor_threshold(
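
The edge case is easier to see outside the diff. Below is a toy sketch of boundary deduplication, not dlt's implementation: the plain state dict and the `process` and `hash_of` functions are illustrative stand-ins for dlt's incremental state and primary-key hashing. It shows why the new `else` branch must reset `unique_hashes`: with deduplication disabled, hashes recorded under an old boundary would otherwise linger in state, skewing the initial/final hash metrics and, at worst, deduplicating rows against an unrelated boundary once deduplication is re-enabled.

    # Toy model of boundary deduplication; illustrative only, not dlt's classes.
    def hash_of(row: dict) -> str:
        # stand-in for the primary-key hash dlt computes per row
        return f'{row["id"]}-{row["value"]}'

    def process(state: dict, rows: list, boundary_dedup: bool) -> list:
        last = state.get("last_value")
        seen = set(state.get("unique_hashes", []))
        kept = []
        for row in rows:
            if last is not None and row["id"] < last:
                continue  # strictly below the boundary: always filtered
            if boundary_dedup and row["id"] == last and hash_of(row) in seen:
                continue  # duplicate at the boundary: filtered
            kept.append(row)
        if kept:
            state["last_value"] = max(row["id"] for row in kept)
        if boundary_dedup:
            boundary = state.get("last_value")
            hashes = {hash_of(r) for r in kept if r["id"] == boundary}
            if boundary == last:
                hashes |= seen  # boundary did not move, so old hashes stay valid
            state["unique_hashes"] = list(hashes)
        else:
            # the fix in this commit: reset, so stale hashes cannot survive a
            # dedup-disabled run and leak into a later deduplicating run
            state["unique_hashes"] = []
        return kept

In this sketch, a run with `boundary_dedup=False` now leaves `state["unique_hashes"]` empty, which is consistent with the updated test assertions below, where a hash count falls from 3 to 0 in steps 5 through 7.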

tests/extract/test_incremental.py

Lines changed: 20 additions & 15 deletions
@@ -4444,34 +4444,39 @@ def _run_with_items(items: TDataItems, as_batch: bool) -> str:
     load_id = _run_with_items([{"id": 3, "value": "3"}, {"id": 4, "value": "4"}], False)
     _assert_custom_metrics(load_id, 5, 4, 1, 1, 1)
 
-    # 4. run with duplicate cursor field values, but different hashes, as batch
+    # 4. run with duplicate cursor field values, but different hashes, as a single batch
     load_id = _run_with_items(
         [{"id": 5, "value": "5.1"}, {"id": 5, "value": "5.2"}, {"id": 5, "value": "5.3"}], True
     )
     _assert_custom_metrics(load_id, 8, 5, 1, 3, 3)
 
-    # 5. run with the same values as batch from previous run, but with no boundary deduplication
+    # 5. reset incremental with no boundary deduplication (primary_key=()) and run with the same
+    # values as the previous run; should be loaded as a single batch with 3 items
     resource_with_metrics.apply_hints(
         incremental=dlt.sources.incremental(cursor_path="id", initial_value=-1, primary_key=())
     )
     load_id = _run_with_items(
         [{"id": 5, "value": "5.1"}, {"id": 5, "value": "5.2"}, {"id": 5, "value": "5.3"}], True
     )
-    _assert_custom_metrics(load_id, 3, 1, 3, 0, 3)
+    _assert_custom_metrics(load_id, 3, 1, 0, 0, 3)
 
-    # 6. run with two new items as a single batch
-    load_id = _run_with_items([{"id": 6, "value": "6.1"}, {"id": 6, "value": "6.2"}], True)
-    _assert_custom_metrics(load_id, 5, 2, 3, 0, 2)
+    # 6. run with one old and one new item as a single batch (still no boundary deduplication)
+    # should be loaded as a single batch with 2 items
+    load_id = _run_with_items([{"id": 5, "value": "5.1"}, {"id": 6, "value": "6.1"}], True)
+    _assert_custom_metrics(load_id, 5, 2, 0, 0, 2)
 
-    # 7. run with two new items as a single batch, with boundary deduplication
+    # 7. enable boundary deduplication and run with one old and one new item as a single batch
+    # should be loaded as a single batch with 2 items
     resource_with_metrics.incremental.primary_key = "id"
-    load_id = _run_with_items({"id": 7, "value": "7"}, True)
-    _assert_custom_metrics(load_id, 6, 3, 3, 1, 1)
+    load_id = _run_with_items([{"id": 6, "value": "6.1"}, {"id": 7, "value": "7"}], True)
+    _assert_custom_metrics(load_id, 7, 3, 0, 1, 2)
 
-    # 8. run with None within a batch -> should increment unfiltered_items_count
-    load_id = _run_with_items([None, {"id": 8, "value": "8"}], True)
-    _assert_custom_metrics(load_id, 8, 4, 1, 1, 1)
-
-    # 9. run with None as a single batch -> should not increment unfiltered_items_count
-    load_id = _run_with_items([None, {"id": 9, "value": "9"}], False)
+    # 8. run with one old and one new item, each as its own batch
+    # only the new item should be loaded
+    load_id = _run_with_items([{"id": 7, "value": "7"}, {"id": 8, "value": "8"}], False)
     _assert_custom_metrics(load_id, 9, 5, 1, 1, 1)
+
+    # 9. run with None items and one new item as a single batch
+    # None items should increment unfiltered_items_count
+    load_id = _run_with_items([None, None, {"id": 9, "value": "9"}], True)
+    _assert_custom_metrics(load_id, 12, 6, 1, 1, 1)
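
For context on the `primary_key=()` pattern the test relies on: passing an empty tuple as `primary_key` to `dlt.sources.incremental` turns boundary deduplication off for that resource. A minimal usage sketch follows; the resource name, cursor column, and rows are made up for illustration:

    import dlt

    @dlt.resource(primary_key="id")
    def events(
        created_at=dlt.sources.incremental(
            "created_at",
            initial_value="2024-01-01",
            primary_key=(),  # empty tuple disables boundary deduplication
        ),
    ):
        # hypothetical rows; the two sharing the boundary cursor value
        # will not be deduplicated on the next run
        yield [
            {"id": 1, "created_at": "2024-01-02"},
            {"id": 2, "created_at": "2024-01-02"},
        ]

With this commit, a resource configured this way also clears any unique_hashes left in incremental state by an earlier deduplicating run, rather than carrying them forward.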
