
Commit 283e6fc

root key WIP
1 parent fdbcd49 commit 283e6fc

File tree: 15 files changed, +292 -43 lines


dlt/common/schema/schema.py

Lines changed: 9 additions & 4 deletions

@@ -357,10 +357,14 @@ def update_schema(self, schema: "Schema") -> None:
         self._settings = deepcopy(schema.settings)
         # make shallow copy of normalizer settings
         self._configure_normalizers(copy(schema._normalizers_config))
+        self.data_item_normalizer.extend_schema()
         self._compile_settings()
-        # update all tables
-        for table in schema.tables.values():
-            self.update_table(table)
+        # update all tables starting from parents and then nested tables in order
+        tables = list(schema.tables.values())
+        for table in tables:
+            if not utils.is_nested_table(table):
+                for chain_table in utils.get_nested_tables(schema._schema_tables, table["name"]):
+                    self.update_table(chain_table)

     def drop_tables(
         self, table_names: Sequence[str], seen_data_only: bool = False

@@ -766,6 +770,7 @@ def update_normalizers(self) -> None:
         as textual parts can be extracted from an expression.
         """
         self._configure_normalizers(configured_normalizers(schema_name=self._schema_name))
+        self.data_item_normalizer.extend_schema()
         self._compile_settings()

     def will_update_normalizers(self) -> bool:

@@ -1042,7 +1047,6 @@ def _configure_normalizers(self, explicit_normalizers: TNormalizersConfig) -> No
         self._replace_and_apply_naming(normalizers_config, to_naming, self.naming)
         # data item normalization function
         self.data_item_normalizer = item_normalizer_class(self)
-        self.data_item_normalizer.extend_schema()

     def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> None:
         self._schema_tables: TSchemaTables = {}

@@ -1072,6 +1076,7 @@ def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> No
         if not normalizers:
             normalizers = configured_normalizers(schema_name=self._schema_name)
         self._configure_normalizers(normalizers)
+        self.data_item_normalizer.extend_schema()  # type: ignore[attr-defined]
        # add version tables
         self._add_standard_tables()
         # compile all known regexes
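
The reworked `update_schema` hunk walks root tables first and only then their nested tables, so a nested table is never merged before its parent. Below is a minimal sketch of that ordering on plain dicts; it is not dlt code and only assumes, as the diff suggests, that nested tables carry a `parent` hint which `utils.is_nested_table` checks:

```py
from typing import Dict, List

def order_parent_first(tables: Dict[str, dict]) -> List[dict]:
    """Return tables so that every root table is followed by its nested chain."""

    def root_of(name: str) -> str:
        # walk the "parent" hints up to the root table
        while tables[name].get("parent") is not None:
            name = tables[name]["parent"]
        return name

    ordered: List[dict] = []
    for name, table in tables.items():
        if table.get("parent") is None:  # a root table: emit it and its whole chain
            ordered.append(table)
            ordered.extend(t for n, t in tables.items() if n != name and root_of(n) == name)
    return ordered

# the nested table is listed before its parent on purpose; the ordering fixes that
tables = {
    "issues__labels": {"name": "issues__labels", "parent": "issues"},
    "issues": {"name": "issues"},
}
print([t["name"] for t in order_parent_first(tables)])  # ['issues', 'issues__labels']
```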

dlt/destinations/impl/mssql/sql_client.py

Lines changed: 0 additions & 3 deletions

@@ -180,9 +180,6 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB
                 pass
             raise outer
         finally:
-            # clear all pending result sets
-            while curr.nextset():
-                pass
             # always close cursor
             curr.close()

dlt/extract/items_transform.py

Lines changed: 7 additions & 3 deletions

@@ -167,20 +167,24 @@ def bind(self, pipe: SupportsPipe) -> "LimitItem":

     def limit(self, chunk_size: int) -> Optional[int]:
         """Calculate the maximum number of rows to which result is limited. Limit works in chunks
-        that controlled by the data source and this must be provided in `chunk_size`
+        that are controlled by the data source and this must be provided in `chunk_size`.
+        `chunk_size` will be ignored if counting rows (`count_rows` is `True`). Mind that
+        this count method will not split batches so you may get more items (up to the full last batch)
+        than the `limit` method indicates.
         """
         if self.max_items in (None, -1):
             return None
-        return self.max_items * chunk_size
+        return self.max_items * (1 if self.count_rows else chunk_size)

     def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]:
         row_count = count_rows_in_items(item)
         if row_count > 0:
             self.count += row_count if self.count_rows else 1

         # detect when the limit is reached, max time or yield count
+        # self.max_items < 0 disables the limit on max items (legacy)
         if (
-            (self.count >= self.max_items)
+            (self.count >= self.max_items and self.max_items >= 0)
             or (self.max_time and time.time() - self.start_time > self.max_time)
             or self.max_items == 0
         ):
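
The new docstring warns that when `count_rows` is on, the limit counts rows but never splits a batch, so the final batch can push the total past the requested limit. A tiny standalone simulation (no dlt imports) of that behavior:

```py
def take_limited(batches, max_rows):
    """Yield whole batches until at least max_rows rows have been emitted."""
    count = 0
    for batch in batches:
        yield batch
        count += len(batch)
        if count >= max_rows:  # same shape as `self.count >= self.max_items`
            return

batches = [list(range(100)) for _ in range(5)]  # five 100-row batches
emitted = sum(len(b) for b in take_limited(batches, max_rows=250))
print(emitted)  # 300: the third batch is yielded whole, overshooting the 250-row limit
```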

dlt/extract/source.py

Lines changed: 21 additions & 13 deletions

@@ -21,6 +21,7 @@
 from dlt.common.configuration.specs.base_configuration import ContainerInjectableContext, configspec
 from dlt.common.configuration.specs.config_section_context import ConfigSectionContext
 from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer
+from dlt.common.normalizers.json.typing import RelationalNormalizerConfig
 from dlt.common.schema import Schema
 from dlt.common.schema.typing import TColumnName, TSchemaContract
 from dlt.common.schema.utils import normalize_table_identifiers

@@ -390,7 +391,6 @@ def from_data(cls, schema: Schema, section: str, data: Any) -> Self:
     def name(self) -> str:
         return self._schema.name

-    # TODO: max_table_nesting/root_key below must go somewhere else ie. into RelationalSchema which is Schema + Relational normalizer.
     @property
     def max_table_nesting(self) -> int:
         """A schema hint that sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON."""

@@ -412,30 +412,38 @@ def root_key(self) -> Optional[bool]:

         """
         # this also check the normalizer type
-        config = RelationalNormalizer.get_normalizer_config(self._schema).get("propagation")
-        data_normalizer = self._schema.data_item_normalizer
-        assert isinstance(data_normalizer, RelationalNormalizer)
-        return config.get("root_key_propagation")  # type: ignore[return-value]
+        config = RelationalNormalizer.get_normalizer_config(self._schema)
+        is_root_key = config.get("root_key_propagation")
+        if is_root_key is None:
+            # if not found get legacy value
+            is_root_key = self._get_root_key_legacy(config)
+            if is_root_key:
+                # set the root key if legacy value set
+                self.root_key = True
+        return is_root_key

     @root_key.setter
     def root_key(self, value: bool) -> None:
         # this also check the normalizer type
         config = RelationalNormalizer.get_normalizer_config(self._schema)
+        if value is None:
+            value = self._get_root_key_legacy(config)
+        if value is not None:
+            RelationalNormalizer.update_normalizer_config(
+                self._schema,
+                {"root_key_propagation": value},
+            )
+
+    def _get_root_key_legacy(self, config: RelationalNormalizerConfig) -> Optional[bool]:
         data_normalizer = self._schema.data_item_normalizer
         assert isinstance(data_normalizer, RelationalNormalizer)
-
         # we must remove root key propagation
         with contextlib.suppress(KeyError):
             propagation_config = config["propagation"]
             propagation_config["root"].pop(data_normalizer.c_dlt_id)
             # and set the value below
-            value = True
-
-        if value is not None:
-            RelationalNormalizer.update_normalizer_config(
-                self._schema,
-                {"root_key_propagation": value},
-            )
+            return True
+        return None

     @property
     def schema_contract(self) -> TSchemaContract:
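
The new `root_key` getter prefers the explicit `root_key_propagation` flag and only falls back to detecting the legacy propagation entry for the `_dlt_id` column, which `_get_root_key_legacy` removes while the setter writes the new flag. The sketch below shows just the lookup order on plain dicts; the `_dlt_root_id` target value is an assumption about how a legacy config may look, not something taken from this diff:

```py
from typing import Optional

def resolve_root_key(normalizer_config: dict, dlt_id_column: str = "_dlt_id") -> Optional[bool]:
    # 1. the explicit flag wins
    explicit = normalizer_config.get("root_key_propagation")
    if explicit is not None:
        return explicit
    # 2. legacy configs expressed root key propagation via the root table's propagation map
    legacy_root = normalizer_config.get("propagation", {}).get("root", {})
    if dlt_id_column in legacy_root:
        return True
    return None

print(resolve_root_key({"root_key_propagation": False}))                         # False
print(resolve_root_key({"propagation": {"root": {"_dlt_id": "_dlt_root_id"}}}))  # True
print(resolve_root_key({}))                                                      # None
```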

dlt/sources/filesystem/__init__.py

Lines changed: 4 additions & 1 deletion

@@ -111,11 +111,14 @@ def filesystem( # noqa DOC
     # NOTE: fsspec glob for buckets reads all files before running iterator
     # so below we do not have real batching anyway
     if incremental and incremental.row_order:
+        reverse = (incremental.row_order == "asc" and incremental.last_value_func is min) or (
+            incremental.row_order == "desc" and incremental.last_value_func is max
+        )
         iter_ = iter(
             sorted(
                 list(glob_files(fs_client, bucket_url, file_glob)),
                 key=lambda f_: f_[incremental.cursor_path],  # type: ignore[literal-required]
-                reverse=incremental.row_order == "desc",
+                reverse=reverse,
             )
         )
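
The added `reverse` expression now combines `row_order` with the incremental's `last_value_func` instead of checking `row_order` alone. The snippet below simply evaluates the same condition outside of dlt to show which combinations end up reversing the sorted file listing:

```py
def needs_reverse(row_order: str, last_value_func) -> bool:
    # same condition as the added lines in the diff above
    return (row_order == "asc" and last_value_func is min) or (
        row_order == "desc" and last_value_func is max
    )

for order in ("asc", "desc"):
    for func in (min, max):
        print(f"{order:>4} + {func.__name__} -> reverse={needs_reverse(order, func)}")
# asc + min -> reverse=True
# asc + max -> reverse=False
# desc + min -> reverse=False
# desc + max -> reverse=True
```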

docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md

Lines changed: 2 additions & 0 deletions

@@ -439,6 +439,8 @@ print(load_info)
 ### 6. Split large incremental loads
 If you have many files to process or they are large you may choose to split pipeline runs into smaller chunks (where single file is the smallest). There are
 two methods to do that:
+* Partitioning, where you divide your source data into several ranges, load them (possibly in parallel) and then continue to load data incrementally.
+* Split, where you load data sequentially in small chunks.

 Partitioning works as follows:
 1. Obtain a list of files ie. by just listing your resource `files = list(filesystem(...))`
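
For the partitioning option named above, here is a hedged sketch of how such a run could look with the `filesystem` source: list the files once, pick a cutoff, and load each range separately before continuing incrementally. The bucket URL, glob, and table name are made up, and splitting via `add_filter` is one possible way to express the ranges, not the documented recipe:

```py
import dlt
from dlt.sources.filesystem import filesystem, read_csv

pipeline = dlt.pipeline("files_backfill", destination="duckdb")

# 1. obtain the full listing once and pick a cutoff in the middle
files = list(filesystem(bucket_url="s3://my-bucket", file_glob="data/*.csv"))
files.sort(key=lambda f: f["modification_date"])
cutoff = files[len(files) // 2]["modification_date"]

# 2. load both ranges; they are independent, so they could also run in parallel
older = filesystem(bucket_url="s3://my-bucket", file_glob="data/*.csv").add_filter(
    lambda f: f["modification_date"] <= cutoff
)
newer = filesystem(bucket_url="s3://my-bucket", file_glob="data/*.csv").add_filter(
    lambda f: f["modification_date"] > cutoff
)
for part in (older, newer):
    pipeline.run((part | read_csv()).with_name("records"), table_name="records")
```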

docs/website/docs/general-usage/incremental/cursor.md

Lines changed: 43 additions & 3 deletions

@@ -146,7 +146,7 @@ Note that dlt's incremental filtering considers the ranges half-closed. `initial
 With the `row_order` argument set, dlt will stop retrieving data from the data source (e.g., GitHub API) if it detects that the values of the cursor field are out of the range of **start** and **end** values.

 In particular:
-* dlt stops processing when the resource yields any item with a cursor value _equal to or greater than_ the `end_value` and `row_order` is set to **asc**. (`end_value` is not included)
+* dlt stops processing when the resource yields any item with a cursor value _equal to or greater than_ the `end_value` and `row_order` is set to **asc**. (`end_value` is not included, also see )
 * dlt stops processing when the resource yields any item with a cursor value _lower_ than the `last_value` and `row_order` is set to **desc**. (`last_value` is included)

 :::note

@@ -215,7 +215,6 @@ def tickets(
             "updated_at",
             initial_value="2023-01-01T00:00:00Z",
             end_value="2023-02-01T00:00:00Z",
-            row_order="asc"
         ),
     ):
         for page in zendesk_client.get_pages(

@@ -229,7 +228,48 @@
 ```
 :::

-## Deduplicate overlapping ranges with primary key
+## Partition large loads
+You can execute a backfill on a large amount of data by partitioning it into smaller fragments. Best case is if you can partition.
+
+
+:::Note
+
+
+## Split large loads into chunks
+You can split large incremental resources into smaller chunks and load them sequentially. This way you'll see the data quicker and
+in case of a loading error you are able to retry a single chunk. **This method works only if your source returns data in deterministic order**, for example:
+* you can request your REST API endpoint to return data ordered by `updated_at`.
+* you use `row_order` on one of the supported sources like `sql_database` or `filesystem`.
+
+Below we go for the second option and load data from a messages table that we order on the `created_at` column.
+```py
+import dlt
+from dlt.sources.sql_database import sql_table
+
+pipeline = dlt.pipeline("test_load_sql_table_split_loading", destination="duckdb")
+
+messages = sql_table(
+    table="chat_message",
+    incremental=dlt.sources.incremental(
+        "created_at",
+        row_order="asc",  # critical to set row_order when doing split loading
+        range_start="open",  # use open range to disable deduplication
+    ),
+)
+
+# produce a chunk each minute
+while pipeline.run(messages.add_limit(max_time=60)).has_data:
+    pass
+```
+Note how we combine `incremental` and `add_limit` to generate a chunk each minute. If you create an index on `created_at`, the database
+engine will be able to stream data using the index without the need to scan the whole table.
+
+:::caution
+If your source returns unordered data, you will most probably miss some data items or load them twice.
+:::
+
+
+## Deduplicate overlapping ranges

 `Incremental` **does not** deduplicate datasets like the **merge** write disposition does. However, it ensures that when another portion of data is extracted, records that were previously loaded **at the end of range** won't be included again. `dlt` assumes that you load a range of data, where the lower bound is inclusive by default (i.e., greater than or equal). This ensures that you never lose any data but will also re-acquire some rows. For example, if you have a database table with a cursor field on `updated_at` which has a day resolution, then there's a high chance that after you extract data on a given day, more records will still be added. When you extract on the next day, you should reacquire data from the last day to ensure all records are present; however, this will create overlap with data from the previous extract.
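
The new "Partition large loads" section is still a stub, so purely as an illustration of the idea it names: the same `chat_message` backfill can be cut into non-overlapping, half-open ranges with `initial_value`/`end_value` (the month boundaries below are made up), and each range can be loaded independently or handed to a separate worker:

```py
import dlt
import pendulum
from dlt.sources.sql_database import sql_table

pipeline = dlt.pipeline("chat_backfill", destination="duckdb")

# month-long, half-open partitions: [Jan 1, Feb 1), [Feb 1, Mar 1), [Mar 1, Apr 1)
months = [pendulum.datetime(2023, m, 1) for m in (1, 2, 3, 4)]
for start, end in zip(months, months[1:]):
    messages = sql_table(
        table="chat_message",
        incremental=dlt.sources.incremental("created_at", initial_value=start, end_value=end),
    )
    pipeline.run(messages)
```

Once the partitions are done, a regular incremental run without `end_value` can pick up new data, matching the "then continue to load data incrementally" wording in the filesystem docs hunk above.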

docs/website/docs/general-usage/resource.md

Lines changed: 1 addition & 1 deletion

@@ -504,7 +504,7 @@ You can also set the limit to `0` for the resource to not yield any items.
 You can use `add_limit` to split incremental resources that process large data into manageable chunks:
 ```py
 ```
-splits loading of `issues` table into one hour chunks that are loaded in a loop. You'll see your data quicker without impacting the performance.
+splits loading of `issues` table into 10 minute chunks that are loaded in a loop. You'll see your data quicker without impacting the performance.
 Note **row_order** above! this makes sure that your table rows are returned deterministically so `dlt` can process consecutive chunks without
 losing data. Mind that ordering results may increase load on the database server. [Please read about other backfill strategies]

tests/common/runtime/test_telemetry.py

Lines changed: 2 additions & 3 deletions

@@ -133,19 +133,18 @@ def test_telemetry_endpoint_exceptions(
 def test_track_anon_event(
     mocker: MockerFixture, disable_temporary_telemetry: RuntimeConfiguration
 ) -> None:
-    from dlt.sources.helpers import requests
     from dlt.common.runtime import anon_tracker

     mock_github_env(os.environ)
     mock_pod_env(os.environ)
     SENT_ITEMS.clear()
     config = SentryLoggerConfiguration()

-    requests_post = mocker.spy(requests, "post")
-
     props = {"destination_name": "duckdb", "elapsed_time": 712.23123, "success": True}
     with patch("dlt.common.runtime.anon_tracker.before_send", _mock_before_send):
         start_test_telemetry(config)
+        # requests client created on start telemetry
+        requests_post = mocker.spy(anon_tracker.requests, "post")
         track("pipeline", "run", props)
         # this will send stuff
         disable_anon_tracker()

tests/common/schema/test_normalize_identifiers.py

Lines changed: 1 addition & 1 deletion

@@ -453,7 +453,7 @@ def test_update_schema_normalizer_props() -> None:
     schema = make_issues_schema_for_normalizers_update()
     schema_2 = make_issues_schema_for_normalizers_update()
     # remove issues table
-    del schema_2._schema_tables["issues"]
+    schema_2.drop_tables(["issues"])
     schema_2.update_schema(schema)

     os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.sql_upper"
