Skip to content

Commit 1e73d67

Browse files
authored
Refactor boundary timestamp handling in SqlMergeFollowupJob and SqlalchemyMergeFollowupJob to ensure current load package creation time is used when no boundary timestamp is provided. Update DltResourceHints class to streamline timestamp validation for active_record_timestamp and boundary_timestamp. Adjust tests accordingly. (#3378)
1 parent 382eb6b commit 1e73d67

File tree

5 files changed

+159
-68
lines changed

5 files changed

+159
-68
lines changed

dlt/destinations/impl/sqlalchemy/merge_job.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
from typing import Sequence, Tuple, Optional, List, Union
1+
from typing import Sequence, Tuple, Optional, List, Union, cast
22
import operator
33
import sqlalchemy as sa
44

5+
from dlt.common.typing import TAnyDateTime
56
from dlt.common.utils import uniq_id
67
from dlt.common.destination import PreparedTableSchema, DestinationCapabilitiesContext
78
from dlt.common.schema.utils import (
@@ -374,10 +375,13 @@ def gen_scd2_sql(
374375
format_datetime_literal = (
375376
DestinationCapabilitiesContext.generic_capabilities().format_datetime_literal
376377
)
377-
378-
boundary_ts = ensure_pendulum_datetime_utc(
379-
root_table.get("x-boundary-timestamp", current_load_package()["state"]["created_at"]) # type: ignore[arg-type]
378+
_boundary_ts = cast(Optional[TAnyDateTime], root_table.get("x-boundary-timestamp"))
379+
boundary_ts: TAnyDateTime = (
380+
_boundary_ts
381+
if _boundary_ts is not None
382+
else current_load_package()["state"]["created_at"]
380383
)
384+
boundary_ts = ensure_pendulum_datetime_utc(boundary_ts)
381385

382386
boundary_literal = format_datetime_literal(boundary_ts, caps.timestamp_precision)
383387

dlt/destinations/sql_jobs.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from dlt.common.time import ensure_pendulum_datetime_utc
55
from dlt.common.destination import PreparedTableSchema
66
from dlt.common.destination.utils import resolve_merge_strategy
7-
from dlt.common.typing import TypedDict
7+
from dlt.common.typing import TAnyDateTime, TypedDict
88

99
from dlt.common.schema.typing import (
1010
TSortOrder,
@@ -845,12 +845,14 @@ def gen_scd2_sql(
845845
DestinationCapabilitiesContext.generic_capabilities().format_datetime_literal
846846
)
847847

848-
boundary_ts = ensure_pendulum_datetime_utc(
849-
root_table.get( # type: ignore[arg-type]
850-
"x-boundary-timestamp",
851-
current_load_package()["state"]["created_at"],
852-
)
848+
_boundary_ts = cast(Optional[TAnyDateTime], root_table.get("x-boundary-timestamp"))
849+
boundary_ts: TAnyDateTime = (
850+
_boundary_ts
851+
if _boundary_ts is not None
852+
else current_load_package()["state"]["created_at"]
853853
)
854+
boundary_ts = ensure_pendulum_datetime_utc(boundary_ts)
855+
854856
boundary_literal = format_datetime_literal(
855857
boundary_ts,
856858
caps.timestamp_precision,

dlt/extract/hints.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,8 @@ def validate_write_disposition_hint(template: TResourceHints) -> None:
830830
):
831831
continue # None is allowed for active_record_timestamp
832832
if ts in wd:
833+
if wd[ts] is None: # type: ignore[literal-required]
834+
continue
833835
try:
834836
ensure_pendulum_datetime_utc(wd[ts]) # type: ignore[literal-required]
835837
except Exception:

docs/website/docs/general-usage/merge-loading.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,35 @@ def dim_customer():
567567
...
568568
```
569569

570+
#### Reset boundary timestamp to the current load time
571+
To stop using a previously set `boundary_timestamp` and revert to the default (the current load package creation time), set `boundary_timestamp` to `None`. You can do this either at definition time or dynamically with `apply_hints` before a run.
572+
573+
Definition-time (always use current load time):
574+
```py
575+
@dlt.resource(
576+
write_disposition={
577+
"disposition": "merge",
578+
"strategy": "scd2",
579+
"boundary_timestamp": None, # reset to current load time
580+
}
581+
)
582+
def dim_customer():
583+
...
584+
```
585+
586+
Per-run reset (override just for this run):
587+
```py
588+
r.apply_hints(
589+
write_disposition={
590+
"disposition": "merge",
591+
"strategy": "scd2",
592+
"boundary_timestamp": None, # reset to current load time for this run
593+
}
594+
)
595+
pipeline.run(r(...))
596+
```
597+
When `boundary_timestamp` is `None` (or omitted), `dlt` uses the load package's creation timestamp as the boundary for both retiring existing versions and creating new versions.
598+
570599
### Example: Use your own row hash
571600
By default, `dlt` generates a row hash based on all columns provided by the resource and stores it in `_dlt_id`. You can use your own hash instead by specifying `row_version_column_name` in the `write_disposition` dictionary. You might already have a column present in your resource that can naturally serve as a row hash, in which case it's more efficient to use those pre-existing hash values than to generate new artificial ones. This option also allows you to use hashes based on a subset of columns, in case you want to ignore changes in some of the columns. When using your own hash, values for `_dlt_id` are randomly generated.
572601
```py

tests/load/pipeline/test_scd2.py

Lines changed: 112 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# timezone is removed from all datetime objects in these tests to simplify comparison
22

3+
from unittest import mock
34
import pytest
45
from typing import List, Dict, Any, Optional
56
from datetime import date, datetime, timezone # noqa: I251
@@ -633,7 +634,7 @@ def r():
633634

634635
@pytest.mark.parametrize(
635636
"destination_config",
636-
destinations_configs(default_sql_configs=True, subset=["duckdb"]),
637+
destinations_configs(default_sql_configs=True, subset=["sqlalchemy", "duckdb"]),
637638
ids=lambda x: x.name,
638639
)
639640
def test_boundary_timestamp(
@@ -645,6 +646,7 @@ def test_boundary_timestamp(
645646
ts2 = "2024-08-22"
646647
ts3 = date(2024, 8, 20) # earlier than ts1 and ts2
647648
ts4 = "i_am_not_a_timestamp"
649+
ts5 = pendulum.datetime(2025, 8, 21, 12, 15, tz="UTC").timestamp()
648650

649651
@dlt.resource(
650652
table_name="dim_test",
@@ -657,75 +659,127 @@ def test_boundary_timestamp(
657659
def r(data):
658660
yield data
659661

662+
# normalize timestamps once for assertions
663+
ts1_dt = strip_timezone(ts1)
664+
ts2_dt = strip_timezone(ts2)
665+
ts3_dt = strip_timezone(ts3)
666+
ts5_dt = strip_timezone(ts5)
667+
660668
# load 1 — initial load
661669
dim_snap = [
662670
l1_1 := {"nk": 1, "foo": "foo"},
663671
l1_2 := {"nk": 2, "foo": "foo"},
664672
]
665-
info = p.run(r(dim_snap), **destination_config.run_kwargs)
666-
assert_load_info(info)
667-
assert load_table_counts(p, "dim_test")["dim_test"] == 2
668-
expected = [
669-
{**{FROM: strip_timezone(ts1), TO: None}, **l1_1},
670-
{**{FROM: strip_timezone(ts1), TO: None}, **l1_2},
671-
]
672-
assert get_table(p, "dim_test", "nk") == expected
673-
674-
# load 2 — different source records, different boundary timestamp
675-
r.apply_hints(
676-
write_disposition={
677-
"disposition": "merge",
678-
"strategy": "scd2",
679-
"boundary_timestamp": ts2,
680-
}
681-
)
682-
dim_snap = [
683-
l2_1 := {"nk": 1, "foo": "bar"}, # natural key 1 updated
684-
# l1_2, # natural key 2 no longer present
685-
l2_3 := {"nk": 3, "foo": "foo"}, # new natural key
686-
]
687-
info = p.run(r(dim_snap), **destination_config.run_kwargs)
688-
assert_load_info(info)
689-
assert load_table_counts(p, "dim_test")["dim_test"] == 4
690-
expected = [
691-
{**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_1}, # retired
692-
{**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_2}, # retired
693-
{**{FROM: strip_timezone(ts2), TO: None}, **l2_1}, # new
694-
{**{FROM: strip_timezone(ts2), TO: None}, **l2_3}, # new
695-
]
696-
assert_records_as_set(get_table(p, "dim_test"), expected)
697-
698-
# load 3 — earlier boundary timestamp
699-
# we naively apply any valid timestamp
700-
# may lead to "valid from" > "valid to", as in this test case
701-
r.apply_hints(
702-
write_disposition={
703-
"disposition": "merge",
704-
"strategy": "scd2",
705-
"boundary_timestamp": ts3,
706-
}
707-
)
708-
dim_snap = [l2_1] # natural key 3 no longer present
709-
info = p.run(r(dim_snap), **destination_config.run_kwargs)
710-
assert_load_info(info)
711-
assert load_table_counts(p, "dim_test")["dim_test"] == 4
712-
expected = [
713-
{**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_1}, # unchanged
714-
{**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_2}, # unchanged
715-
{**{FROM: strip_timezone(ts2), TO: None}, **l2_1}, # unchanged
716-
{**{FROM: strip_timezone(ts2), TO: strip_timezone(ts3)}, **l2_3}, # retired
717-
]
718-
assert_records_as_set(get_table(p, "dim_test"), expected)
673+
current_time: Dict[str, Optional[float]] = {"ts": None}
674+
with mock.patch(
675+
"dlt.common.storages.load_package.precise_time",
676+
side_effect=lambda: current_time["ts"],
677+
):
678+
# load 1 — initial load
679+
current_time["ts"] = pendulum.datetime(2024, 8, 21, 12, 15, tz="UTC").timestamp()
680+
r.apply_hints(
681+
write_disposition={
682+
"disposition": "merge",
683+
"strategy": "scd2",
684+
"boundary_timestamp": ts1,
685+
}
686+
)
687+
info = p.run(r(dim_snap), **destination_config.run_kwargs)
688+
assert_load_info(info)
689+
assert load_table_counts(p, "dim_test")["dim_test"] == 2
690+
expected = [
691+
{**{FROM: ts1_dt, TO: None}, **l1_1},
692+
{**{FROM: ts1_dt, TO: None}, **l1_2},
693+
]
694+
assert get_table(p, "dim_test", "nk", ts_columns=[FROM, TO]) == expected
695+
696+
# load 2 — different source records, different boundary timestamp
697+
current_time["ts"] = pendulum.datetime(2024, 8, 22, tz="UTC").timestamp()
698+
dim_snap = [
699+
l2_1 := {"nk": 1, "foo": "bar"}, # natural key 1 updated
700+
# l1_2, # natural key 2 no longer present
701+
l2_3 := {"nk": 3, "foo": "foo"}, # new natural key
702+
]
703+
r.apply_hints(
704+
write_disposition={
705+
"disposition": "merge",
706+
"strategy": "scd2",
707+
"boundary_timestamp": ts2,
708+
}
709+
)
710+
info = p.run(r(dim_snap), **destination_config.run_kwargs)
711+
assert_load_info(info)
712+
assert load_table_counts(p, "dim_test")["dim_test"] == 4
713+
expected = [
714+
{**{FROM: ts1_dt, TO: ts2_dt}, **l1_1}, # retired
715+
{**{FROM: ts1_dt, TO: ts2_dt}, **l1_2}, # retired
716+
{**{FROM: ts2_dt, TO: None}, **l2_1}, # new
717+
{**{FROM: ts2_dt, TO: None}, **l2_3}, # new
718+
]
719+
assert_records_as_set(get_table(p, "dim_test", ts_columns=[FROM, TO]), expected)
720+
721+
# load 3 — earlier boundary timestamp
722+
# we naively apply any valid timestamp
723+
# may lead to "valid from" > "valid to", as in this test case
724+
current_time["ts"] = pendulum.datetime(2024, 8, 22, 0, 0, 1, tz="UTC").timestamp()
725+
dim_snap = [l2_1] # natural key 3 no longer present
726+
r.apply_hints(
727+
write_disposition={
728+
"disposition": "merge",
729+
"strategy": "scd2",
730+
"boundary_timestamp": ts3,
731+
}
732+
)
733+
info = p.run(r(dim_snap), **destination_config.run_kwargs)
734+
assert_load_info(info)
735+
assert load_table_counts(p, "dim_test")["dim_test"] == 4
736+
expected = [
737+
{**{FROM: ts1_dt, TO: ts2_dt}, **l1_1}, # unchanged
738+
{**{FROM: ts1_dt, TO: ts2_dt}, **l1_2}, # unchanged
739+
{**{FROM: ts2_dt, TO: None}, **l2_1}, # unchanged
740+
{**{FROM: ts2_dt, TO: ts3_dt}, **l2_3}, # retired
741+
]
742+
assert_records_as_set(get_table(p, "dim_test", ts_columns=[FROM, TO]), expected)
743+
744+
# invalid boundary timestamp should raise error
745+
with pytest.raises(ValueError):
746+
r.apply_hints(
747+
write_disposition={
748+
"disposition": "merge",
749+
"strategy": "scd2",
750+
"boundary_timestamp": ts4,
751+
}
752+
)
719753

720-
# invalid boundary timestamp should raise error
721-
with pytest.raises(ValueError):
754+
# run 4 — no boundary timestamp (use current precise_time)
755+
current_time["ts"] = ts5
756+
dim_snap = [
757+
l3_1 := {"nk": 1, "foo": "foobar"}, # updated
758+
]
722759
r.apply_hints(
723760
write_disposition={
724761
"disposition": "merge",
725762
"strategy": "scd2",
726-
"boundary_timestamp": ts4,
763+
"boundary_timestamp": None,
727764
}
728765
)
766+
info = p.run(r(dim_snap), **destination_config.run_kwargs)
767+
assert_load_info(info)
768+
assert load_table_counts(p, "dim_test")["dim_test"] == 5
769+
expected = [
770+
{**{FROM: ts1_dt, TO: ts2_dt}, **l1_1}, # unchanged
771+
{**{FROM: ts1_dt, TO: ts2_dt}, **l1_2}, # unchanged
772+
{
773+
**{FROM: ts2_dt, TO: ts5_dt},
774+
**l2_1,
775+
}, # retired in this run
776+
{
777+
**{FROM: ts2_dt, TO: ts3_dt},
778+
**l2_3,
779+
}, # unchanged (already retired in load 3)
780+
{**{FROM: ts5_dt, TO: None}, **l3_1},
781+
]
782+
assert_records_as_set(get_table(p, "dim_test", ts_columns=[FROM, TO]), expected)
729783

730784

731785
@pytest.mark.essential

0 commit comments

Comments (0)