Commit fe4ceee

Browse files
authored
allows setting table name via property on DltResource (#593)
* allows setting table name in resource via property
* incremental load snippets cleanup and simplification
* improves exceptions on wrong datatypes in run
* bumps to version 0.3.12
1 parent 784c1cf commit fe4ceee

13 files changed: +128 additions, -29 deletions

dlt/common/destination/reference.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ class DestinationClientDwhConfiguration(DestinationClientConfiguration):
    def normalize_dataset_name(self, schema: Schema) -> str:
        """Builds full db dataset (schema) name out of configured dataset name and schema name: {dataset_name}_{schema.name}. The resulting name is normalized.

-        If default schema name equals schema.name, the schema suffix is skipped.
+        If default schema name is None or equals schema.name, the schema suffix is skipped.
        """
        if not schema.name:
            raise ValueError("schema_name is None or empty")
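A minimal sketch of the naming rule the updated docstring describes; the helper below is an illustration of that rule only, not the actual implementation (which also normalizes the result):

```python
from typing import Optional

def _dataset_name_for(dataset_name: str, schema_name: str, default_schema_name: Optional[str]) -> str:
    # the schema suffix is skipped when the default schema name is None
    # or equal to the schema name (the behavior documented in this commit)
    if default_schema_name is None or schema_name == default_schema_name:
        return dataset_name
    return f"{dataset_name}_{schema_name}"

assert _dataset_name_for("dlt_data", "github", None) == "dlt_data"
assert _dataset_name_for("dlt_data", "github", "zendesk") == "dlt_data_github"
```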

dlt/extract/exceptions.py

Lines changed: 14 additions & 0 deletions
@@ -62,6 +62,20 @@ def __init__(self, pipe_name: str, gen: Any, msg: str, kind: str) -> None:
        super().__init__(pipe_name, f"extraction of resource {pipe_name} in {kind} {self.func_name} caused an exception: {msg}")


+class PipeGenInvalid(PipeException):
+    def __init__(self, pipe_name: str, gen: Any) -> None:
+        msg = "A pipe generator element must be an Iterator (ie. list or generator function). Generator element is typically created from a `data` argument to pipeline.run or extract method."
+        msg += "dlt will evaluate functions that were passed as data argument. If you passed a function the returned data type is not iterable. "
+        type_name = str(type(gen))
+        msg += f" Generator type is {type_name}."
+        if "DltSource" in type_name:
+            msg += " Did you pass a @dlt.source decorated function without calling it?"
+        if "DltResource" in type_name:
+            msg += " Did you pass a function that returns dlt.resource without calling it?"
+
+        super().__init__(pipe_name, msg)
+
+
class ResourceNameMissing(DltResourceException):
    def __init__(self) -> None:
        super().__init__(None, """Resource name is missing. If you create a resource directly from data ie. from a list you must pass the name explicitly in `name` argument.
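A quick sketch of the new exception's message, based only on the code added above; `_FakeDltSource` is a made-up stand-in whose type name happens to contain "DltSource" so that the extra hint is appended:

```python
from dlt.extract.exceptions import PipeGenInvalid

class _FakeDltSource:
    """Stand-in object whose type name contains "DltSource"."""

err = PipeGenInvalid("my_pipe", _FakeDltSource())
# the message explains that the pipe head is not an Iterator and, because the
# type name contains "DltSource", adds the hint about calling the decorated function
print(str(err))
```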

dlt/extract/extract.py

Lines changed: 1 addition & 1 deletion
@@ -142,7 +142,7 @@ def _write_static_table(resource: DltResource, table_name: str) -> None:
            _write_dynamic_table(resource, pipe_item.item)
        else:
            # write item belonging to table with static name
-            table_name = resource.table_name
+            table_name = resource.table_name  # type: ignore
            _write_static_table(resource, table_name)
            _write_item(table_name, resource.name, pipe_item.item)

dlt/extract/pipe.py

Lines changed: 5 additions & 3 deletions
@@ -18,7 +18,7 @@
from dlt.common.typing import AnyFun, AnyType, TDataItems
from dlt.common.utils import get_callable_name

-from dlt.extract.exceptions import CreatePipeException, DltSourceException, ExtractorException, InvalidResourceDataTypeFunctionNotAGenerator, InvalidStepFunctionArguments, InvalidTransformerGeneratorFunction, ParametrizedResourceUnbound, PipeException, PipeItemProcessingError, PipeNotBoundToData, ResourceExtractionError
+from dlt.extract.exceptions import CreatePipeException, DltSourceException, ExtractorException, InvalidResourceDataTypeFunctionNotAGenerator, InvalidStepFunctionArguments, InvalidTransformerGeneratorFunction, ParametrizedResourceUnbound, PipeException, PipeGenInvalid, PipeItemProcessingError, PipeNotBoundToData, ResourceExtractionError
from dlt.extract.typing import DataItemWithMeta, ItemTransform, SupportsPipe, TPipedDataItems

if TYPE_CHECKING:
@@ -454,7 +454,8 @@ def from_pipe(cls, pipe: Pipe, *, max_parallel_items: int = 20, workers: int = 5
        pipe = pipe._clone()
        # head must be iterator
        pipe.evaluate_gen()
-        assert isinstance(pipe.gen, Iterator)
+        if not isinstance(pipe.gen, Iterator):
+            raise PipeGenInvalid(pipe.name, pipe.gen)
        # create extractor
        extract = cls(max_parallel_items, workers, futures_poll_interval, next_item_mode)
        # add as first source
@@ -495,7 +496,8 @@ def _fork_pipeline(pipe: Pipe) -> None:
        else:
            # head of independent pipe must be iterator
            pipe.evaluate_gen()
-            assert isinstance(pipe.gen, Iterator)
+            if not isinstance(pipe.gen, Iterator):
+                raise PipeGenInvalid(pipe.name, pipe.gen)
            # add every head as source only once
            if not any(i.pipe == pipe for i in extract._sources):
                extract._sources.append(SourcePipeItem(pipe.gen, 0, pipe, None))
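The practical effect of replacing the bare `assert` is a friendlier error for a common mistake. A hedged sketch of how that mistake typically looks (the pipeline and source names below are made up):

```python
import dlt

@dlt.source
def github_data():  # hypothetical source
    @dlt.resource
    def issues():
        yield {"id": 1}
    return issues

pipeline = dlt.pipeline(pipeline_name="pipe_gen_demo", destination="duckdb")

# correct usage: call the source so the pipe head evaluates to an iterator
pipeline.run(github_data())

# passing the uncalled function would leave a non-iterator at the pipe head and
# should now surface PipeGenInvalid with a helpful hint instead of an AssertionError:
# pipeline.run(github_data)
```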

dlt/extract/schema.py

Lines changed: 8 additions & 4 deletions
@@ -34,11 +34,15 @@ def __init__(self, name: str, table_schema_template: TTableSchemaTemplate = None
        self.set_template(table_schema_template)

    @property
-    def table_name(self) -> str:
-        """Get table name to which resource loads data. Raises in case of table names derived from data."""
+    def table_name(self) -> TTableHintTemplate[str]:
+        """Get table name to which resource loads data. May return a callable."""
        if self._table_name_hint_fun:
-            raise DataItemRequiredForDynamicTableHints(self._name)
-        return self._table_schema_template["name"] if self._table_schema_template else self._name  # type: ignore
+            return self._table_name_hint_fun
+        return self._table_schema_template["name"] if self._table_schema_template else self._name
+
+    @table_name.setter
+    def table_name(self, value: TTableHintTemplate[str]) -> None:
+        self.apply_hints(table_name=value)

    @property
    def write_disposition(self) -> TWriteDisposition:
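This property/setter pair is the core of the commit: `table_name` becomes readable and writable and may hold a callable hint. A short sketch of how it can be used; the resource below is made up, and the callable form relies on `TTableHintTemplate[str]` accepting functions of the data item:

```python
import dlt

@dlt.resource(name="events")  # hypothetical resource
def events():
    yield {"type": "click"}
    yield {"type": "purchase"}

resource = events()

# reading: returns the static name, or the callable when a dynamic hint was set
print(resource.table_name)  # -> "events"

# writing: equivalent to resource.apply_hints(table_name=...)
resource.table_name = "raw_events"

# a callable hint should also be accepted, producing per-item table names
resource.table_name = lambda item: f"events_{item['type']}"
```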

dlt/pipeline/pipeline.py

Lines changed: 1 addition & 4 deletions
@@ -779,10 +779,7 @@ def apply_hint_args(resource: DltResource) -> None:
            columns_dict = {c["name"]:c for c in columns}
            # apply hints only if any of the hints is present, table_name must be always present
            if table_name or parent_table_name or write_disposition or columns or primary_key:
-                resource_table_name: str = None
-                with contextlib.suppress(DataItemRequiredForDynamicTableHints):
-                    resource_table_name = resource.table_name
-                resource.apply_hints(table_name or resource_table_name or resource.name, parent_table_name, write_disposition, columns_dict, primary_key)
+                resource.apply_hints(table_name or resource.table_name or resource.name, parent_table_name, write_disposition, columns_dict, primary_key)

        def choose_schema() -> Schema:
            """Except of explicitly passed schema, use a clone that will get discarded if extraction fails"""

docs/website/docs/dlt-ecosystem/destinations/snowflake.md

Lines changed: 2 additions & 2 deletions
@@ -75,7 +75,7 @@ The **password authentication** is not any different from other databases like P
You can also pass credentials as a database connection string. For example:
```toml
# keep it at the top of your toml file! before any section starts
-destination.postgres.snowflake="snowflake://loader:<password>@kgiotue-wn98412/dlt_data?warehouse=COMPUTE_WH&role=DLT_LOADER_ROLE"
+destination.snowflake.credentials="snowflake://loader:<password>@kgiotue-wn98412/dlt_data?warehouse=COMPUTE_WH&role=DLT_LOADER_ROLE"
```

In **key pair authentication** you replace password with a private key exported in PEM format. The key may be encrypted. In that case you must provide a passphrase.
@@ -95,7 +95,7 @@ private_key_passphrase="passphrase"
We allow to pass private key and passphrase in connection string. Please url encode the private key and passphrase.
```toml
# keep it at the top of your toml file! before any section starts
-destination.postgres.snowflake="snowflake://loader:<password>@kgiotue-wn98412/dlt_data?private_key=<url encoded pem>&private_key_passphrase=<url encoded passphrase>"
+destination.snowflake.credentials="snowflake://loader:<password>@kgiotue-wn98412/dlt_data?private_key=<url encoded pem>&private_key_passphrase=<url encoded passphrase>"
```

## Write disposition
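Besides the corrected `destination.snowflake.credentials` key in `secrets.toml`, the same connection string can presumably also be passed programmatically; a hedged sketch, assuming `dlt.pipeline` accepts explicit `credentials` (placeholders left as placeholders):

```python
import dlt

# assumption: credentials passed directly instead of via secrets.toml
pipeline = dlt.pipeline(
    pipeline_name="snowflake_demo",
    destination="snowflake",
    dataset_name="dlt_data",
    credentials="snowflake://loader:<password>@kgiotue-wn98412/dlt_data?warehouse=COMPUTE_WH&role=DLT_LOADER_ROLE",
)
```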

docs/website/docs/general-usage/incremental-loading.md

Lines changed: 26 additions & 11 deletions
@@ -168,25 +168,32 @@ def repo_issues(
    repository,
    created_at = dlt.sources.incremental("created_at", initial_value="1970-01-01T00:00:00Z")
):
-    # get issues from created from last "created_at" value
-    for page in _get_issues_page(access_token, repository, since=created_at.last_value):
+    # get issues since "created_at" stored in state on previous run (or initial_value on first run)
+    for page in _get_issues_page(access_token, repository, since=created_at.start_value):
        yield page
+        # last_value is updated after every page
+        print(created_at.last_value)
```

Here we add `created_at` argument that will receive incremental state, initialized to
`1970-01-01T00:00:00Z`. It is configured to track `created_at` field in issues returned by
`_get_issues_page` and then yielded. It will store the newest `created_at` value in `dlt`
-[state](state.md) and make it available in `created_at.last_value` on next pipeline
+[state](state.md) and make it available in `created_at.start_value` on next pipeline
run. This value is used to request only issues newer (or equal) via GitHub API.

-On the first run of this resource, all the issues (we use "1970-01-01T00:00:00Z" as initial to get
-all of them) will be loaded and the `created_at.last_value` will get the `created_at` of most recent
-issue. On the second run we'll pass this value to `_get_issues_page` to get only the newer issues.
+In essence, the `dlt.sources.incremental` instance above provides:
+* **created_at.initial_value**, which is always equal to the "1970-01-01T00:00:00Z" passed in the constructor
+* **created_at.start_value**, the maximum `created_at` value from the previous run, or the **initial_value** on the first run
+* **created_at.last_value**, a "real time" `created_at` value updated with each yielded item or page; before the first yield it equals **start_value**
+* **created_at.end_value** (not used here), [marking the end of a backfill range](#using-dltsourcesincremental-for-backfill)
+
+When paginating you probably need **start_value**, which does not change during the execution of the resource; however,
+most paginators will return a **next page** link which you should use.

Behind the scenes, `dlt` will deduplicate the results ie. in case the last issue is returned again
(`created_at` filter is inclusive) and skip already loaded ones. In the example below we
incrementally load the GitHub events, where API does not let us filter for the newest events - it
-always returns all of them. Nevertheless, `dlt` will load only the incremental part, skipping all the
+always returns all of them. Nevertheless, `dlt` will load only the new items, filtering out all the
duplicates and past issues.

```python
@@ -215,7 +222,13 @@ The `start_out_of_range` boolean flag is set when the first such element is yiel
since we know that github returns results ordered from newest to oldest, we know that all subsequent
items will be filtered out anyway and there's no need to fetch more data.

-`dlt.sources.incremental` allows to define custom `last_value` function. This lets you define
+### max, min or custom `last_value_func`
+
+`dlt.sources.incremental` allows you to choose a function that orders (compares) values coming from the items against the current `last_value`.
+* The default function is the built-in `max`, which returns the bigger of the two values.
+* Another built-in, `min`, returns the smaller value.
+
+You can pass your custom function as well. This lets you define
`last_value` on complex types i.e. dictionaries and store indexes of last values, not just simple
types. The `last_value` argument is a [JSON Path](https://github.com/json-path/JsonPath#operators)
and lets you select nested and complex data (including the whole data item when `$` is used).
@@ -244,6 +257,8 @@ def get_events(last_created_at = dlt.sources.incremental("$", last_value_func=by
        yield json.load(f)
```

+### Deduplication primary_key
+
`dlt.sources.incremental` let's you optionally set a `primary_key` that is used exclusively to
deduplicate and which does not become a table hint. The same setting lets you disable the
deduplication altogether when empty tuple is passed. Below we pass `primary_key` directly to
@@ -304,7 +319,7 @@ def repo_issues(
    created_at = dlt.sources.incremental("created_at", initial_value="1970-01-01T00:00:00Z", end_value="2022-07-01T00:00:00Z")
):
    # get issues from created from last "created_at" value
-    for page in _get_issues_page(access_token, repository, since=created_at.last_value, until=created_at.end_value):
+    for page in _get_issues_page(access_token, repository, since=created_at.start_value, until=created_at.end_value):
        yield page
```
Above we use `initial_value` and `end_value` arguments of the `incremental` to define the range of issues that we want to retrieve
@@ -345,7 +360,7 @@ def tickets(
    ),
):
    for page in zendesk_client.get_pages(
-        "/api/v2/incremental/tickets", "tickets", start_time=updated_at.last_value
+        "/api/v2/incremental/tickets", "tickets", start_time=updated_at.start_value
    ):
        yield page
```
@@ -464,7 +479,7 @@ def tickets(
    ),
):
    for page in zendesk_client.get_pages(
-        "/api/v2/incremental/tickets", "tickets", start_time=updated_at.last_value
+        "/api/v2/incremental/tickets", "tickets", start_time=updated_at.start_value
    ):
        yield page
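To complement the new "max, min or custom `last_value_func`" section added above, a small self-contained sketch of swapping the ordering function; the resource, table data and pipeline name are made up for illustration:

```python
import dlt

# three in-memory "tickets"; we want dlt to remember the *smallest* created_at
# seen so far, so we pass the built-in min as last_value_func instead of the default max
TICKETS = [
    {"id": 3, "created_at": "2023-03-01"},
    {"id": 2, "created_at": "2023-02-01"},
    {"id": 1, "created_at": "2023-01-01"},
]

@dlt.resource(primary_key="id")
def oldest_first(
    created_at=dlt.sources.incremental(
        "created_at", initial_value="2023-12-31", last_value_func=min
    )
):
    # with `min` the boundary moves downward: the stored value is the smallest
    # "created_at" seen, so later runs should skip records newer than that minimum
    yield from TICKETS

pipeline = dlt.pipeline(pipeline_name="incremental_min_demo", destination="duckdb")
info = pipeline.run(oldest_first())
print(info)
```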

docs/website/docs/general-usage/resource.md

Lines changed: 6 additions & 0 deletions
@@ -242,6 +242,12 @@ tables.users.apply_hints(
pipeline.run(tables)
```

+To just change the name of the table to which a resource will load data, do the following:
+```python
+tables = sql_database()
+tables.users.table_name = "other_users"
+```
+
## Load resources

You can pass individual resources or list of resources to the `dlt.pipeline` object. The resources
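Given the `table_name` setter added in `dlt/extract/schema.py` above, the assignment in this docs snippet is shorthand for an `apply_hints` call; a tiny self-contained sketch with a stand-in resource (the docs' `sql_database()` source is not required):

```python
import dlt

# stand-in for the docs' sql_database() resource: any resource behaves the same way
users = dlt.resource([{"id": 1}], name="users")

# the assignment shown in the snippet above...
users.table_name = "other_users"
# ...is equivalent to calling apply_hints with a table_name hint
users.apply_hints(table_name="other_users")
```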

docs/website/docs/walkthroughs/adjust-a-schema.md

Lines changed: 6 additions & 0 deletions
@@ -100,6 +100,12 @@ players_games:
Run the pipeline script again and make sure that the change is visible in export schema. Then,
[launch the Streamlit app](../dlt-ecosystem/visualizations/exploring-the-data.md) to see the changed data.

+:::note
+Do not rename the tables or columns in the yaml file. `dlt` infers those from the data, so the schema will be recreated.
+You can [adjust the schema](../general-usage/resource.md#adjust-schema) in Python before the resource is loaded.
+:::
+
### Load data as json instead of generating child table or columns from flattened dicts

In the export schema, you can see that white and black players properties got flattened into:
