Skip to content

Commit 30f0416

Browse files
sultaniman and VioletM authored
Accept :memory: mode for credentials parameter in duckdb factory (#1297)
* Accept :memory: mode for credentials parameter in duckdb factory * Check instance of native_value and add check in test for in-memory mode * Add a separate test for in-memory mode when using duckdb * Fix failing test, bind dataset_name to configuration * Adjust test, take ownership when :memory: has been passed * Revert changes * Adjust documentation of duckdb * Add a new exception for duckdb * Adjust error message * Remove backticks * Fix typo * Update docs * Update duckdb docs, merge examples * Remove the mention of :memory: from docstrings * Adjust the message in exception * Catch :memory: in DuckDbCredentials.on_resolve * Update tests * Rename exception * Update docs * Use Destination.from_reference in code snippet * Add one more test for Destination.from_reference and update docs * Format code and ignore mypy error * Use standard way to initialize destination via factory * Preserve environment * Cleanup duckdb docs code snippets * Adjust memo about :pipeline: connection string * Show tables from in-memory schema * Explicitly mention python script instead of pipeline * Fix typo * Reword the description of :pipeline: value * Update docs/website/docs/dlt-ecosystem/destinations/duckdb.md Co-authored-by: VioletM <[email protected]> * Re-arrange text blocks * Add example on how to use :pipeline: connection string * Fix typo * Adjust warning message --------- Co-authored-by: VioletM <[email protected]>
1 parent e329ab9 commit 30f0416

File tree

5 files changed

+108
-10
lines changed

5 files changed

+108
-10
lines changed

dlt/destinations/impl/duckdb/configuration.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
import os
22
import dataclasses
33
import threading
4-
from pathvalidate import is_valid_filepath
4+
55
from typing import Any, ClassVar, Dict, Final, List, Optional, Tuple, Type, Union
66

7+
from pathvalidate import is_valid_filepath
78
from dlt.common import logger
89
from dlt.common.configuration import configspec
910
from dlt.common.configuration.specs import ConnectionStringCredentials
1011
from dlt.common.configuration.specs.exceptions import InvalidConnectionString
1112
from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration
1213
from dlt.common.typing import TSecretValue
14+
from dlt.destinations.impl.duckdb.exceptions import InvalidInMemoryDuckdbCredentials
1315

1416
try:
1517
from duckdb import DuckDBPyConnection
@@ -117,6 +119,9 @@ def is_partial(self) -> bool:
117119
return self.database == ":pipeline:"
118120

119121
def on_resolved(self) -> None:
122+
if isinstance(self.database, str) and self.database == ":memory:":
123+
raise InvalidInMemoryDuckdbCredentials()
124+
120125
# do not set any paths for external database
121126
if self.database == ":external:":
122127
return
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from dlt.common.destination.exceptions import DestinationTerminalException
2+
3+
4+
class InvalidInMemoryDuckdbCredentials(DestinationTerminalException):
    """Raised when the ``:memory:`` connection string is given as duckdb credentials.

    The message instructs the user to open the in-memory connection themselves
    and pass the connection object to the destination factory instead.
    """

    def __init__(self) -> None:
        super().__init__(
            "To use in-memory instance of duckdb, "
            "please instantiate it first and then pass to destination factory\n"
            '\nconn = duckdb.connect(":memory:")\n'
            # The example call is closed with both parentheses so that the
            # snippet shown to the user is valid, copy-pasteable Python.
            'dlt.pipeline(pipeline_name="...", destination=dlt.destinations.duckdb(conn))'
        )

dlt/destinations/impl/duckdb/factory.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def __init__(
3737
3838
Args:
3939
credentials: Credentials to connect to the duckdb database. Can be an instance of `DuckDbCredentials` or
40-
a path to a database file. Use `:memory:` to create an in-memory database or :pipeline: to create a duckdb
40+
a path to a database file. Use :pipeline: to create a duckdb
4141
in the working folder of the pipeline
4242
create_indexes: Should unique indexes be created, defaults to False
4343
**kwargs: Additional arguments passed to the destination config

docs/website/docs/dlt-ecosystem/destinations/duckdb.md

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,30 +102,72 @@ p = dlt.pipeline(
102102
)
103103
```
104104

105-
The destination accepts a `duckdb` connection instance via `credentials`, so you can also open a database connection yourself and pass it to `dlt` to use. `:memory:` databases are supported.
105+
The destination accepts a `duckdb` connection instance via `credentials`, so you can also open a database connection yourself and pass it to `dlt` to use.
106+
106107
```py
107108
import duckdb
109+
108110
db = duckdb.connect()
109111
p = dlt.pipeline(
110-
pipeline_name='chess',
112+
pipeline_name="chess",
111113
destination=dlt.destinations.duckdb(db),
112-
dataset_name='chess_data',
114+
dataset_name="chess_data",
113115
full_refresh=False,
114116
)
117+
118+
# Or if you would like to use an in-memory duckdb instance
119+
db = duckdb.connect(":memory:")
120+
p = pipeline_one = dlt.pipeline(
121+
pipeline_name="in_memory_pipeline",
122+
destination=dlt.destinations.duckdb(db),
123+
dataset_name="chess_data",
124+
)
125+
126+
print(db.sql("DESCRIBE;"))
127+
128+
# Example output
129+
# ┌──────────┬───────────────┬─────────────────────┬──────────────────────┬───────────────────────┬───────────┐
130+
# │ database │ schema │ name │ column_names │ column_types │ temporary │
131+
# │ varchar │ varchar │ varchar │ varchar[] │ varchar[] │ boolean │
132+
# ├──────────┼───────────────┼─────────────────────┼──────────────────────┼───────────────────────┼───────────┤
133+
# │ memory │ chess_data │ _dlt_loads │ [load_id, schema_n… │ [VARCHAR, VARCHAR, … │ false │
134+
# │ memory │ chess_data │ _dlt_pipeline_state │ [version, engine_v… │ [BIGINT, BIGINT, VA… │ false │
135+
# │ memory │ chess_data │ _dlt_version │ [version, engine_v… │ [BIGINT, BIGINT, TI… │ false │
136+
# │ memory │ chess_data │ my_table │ [a, _dlt_load_id, … │ [BIGINT, VARCHAR, V… │ false │
137+
# └──────────┴───────────────┴─────────────────────┴──────────────────────┴───────────────────────┴───────────┘
115138
```
116139

140+
:::note
141+
Be careful! The in-memory instance of the database will be destroyed once your Python script exits.
142+
:::
143+
117144
This destination accepts database connection strings in the format used by [duckdb-engine](https://github.com/Mause/duckdb_engine#configuration).
118145

119146
You can configure a DuckDB destination with [secret / config values](../../general-usage/credentials) (e.g., using a `secrets.toml` file)
120147
```toml
121148
destination.duckdb.credentials="duckdb:///_storage/test_quack.duckdb"
122149
```
150+
123151
The **duckdb://** URL above creates a **relative** path to `_storage/test_quack.duckdb`. To define an **absolute** path, you need to specify four slashes, i.e., `duckdb:////_storage/test_quack.duckdb`.
124152

125-
A few special connection strings are supported:
126-
* **:pipeline:** creates the database in the working directory of the pipeline with the name `quack.duckdb`.
127-
* **:memory:** creates an in-memory database. This may be useful for testing.
153+
`dlt` supports a special connection string that triggers specific behavior for the duckdb destination:
154+
* **:pipeline:** creates the database in the working directory of the pipeline, naming it `quack.duckdb`.
155+
156+
Please see the code snippets below showing how to use it:
128157

158+
1. Via `config.toml`
159+
```toml
160+
destination.duckdb.credentials=":pipeline:"
161+
```
162+
163+
2. In Python code
164+
```py
165+
p = pipeline_one = dlt.pipeline(
166+
pipeline_name="my_pipeline",
167+
destination="duckdb",
168+
credentials=":pipeline:",
169+
)
170+
```
129171

130172
### Additional configuration
131173
Unique indexes may be created during loading if the following config value is set:

tests/load/duckdb/test_duckdb_client.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@
66
from dlt.common.configuration.resolve import resolve_configuration
77
from dlt.common.configuration.utils import get_resolved_traces
88

9+
from dlt.common.destination.reference import Destination
910
from dlt.destinations.impl.duckdb.configuration import (
10-
DUCK_DB_NAME,
1111
DuckDbClientConfiguration,
12-
DuckDbCredentials,
1312
DEFAULT_DUCK_DB_NAME,
1413
)
1514
from dlt.destinations import duckdb
1615

16+
from dlt.destinations.impl.duckdb.exceptions import InvalidInMemoryDuckdbCredentials
17+
from dlt.pipeline.exceptions import PipelineStepFailed
1718
from tests.load.pipeline.utils import drop_pipeline
1819
from tests.pipeline.utils import assert_table
1920
from tests.utils import patch_home_dir, autouse_test_storage, preserve_environ, TEST_STORAGE_ROOT
@@ -56,6 +57,44 @@ def test_duckdb_open_conn_default() -> None:
5657
delete_quack_db()
5758

5859

60+
def test_duckdb_in_memory_mode_via_factory(preserve_environ):
    """Check that ``:memory:`` is rejected as a credentials string.

    An externally opened in-memory connection passed to the factory must be
    accepted, while the ``:memory:`` string must fail the pipeline run with
    ``InvalidInMemoryDuckdbCredentials`` when supplied in any of three ways:
    the ``credentials`` argument, the ``DESTINATION__DUCKDB__CREDENTIALS``
    environment variable, or ``Destination.from_reference``.
    """
    delete_quack_db()
    try:
        import duckdb

        # Check if passing external duckdb connection works fine
        db = duckdb.connect(":memory:")
        dlt.pipeline(pipeline_name="booboo", destination=dlt.destinations.duckdb(db))

        # Check if passing :memory: to factory fails
        with pytest.raises(PipelineStepFailed) as exc:
            p = dlt.pipeline(pipeline_name="booboo", destination="duckdb", credentials=":memory:")
            p.run([1, 2, 3])

        assert isinstance(exc.value.exception, InvalidInMemoryDuckdbCredentials)

        # Check if :memory: coming from the environment fails as well.
        # `as exc` is required here: without it the assertion below would
        # silently re-check the exception captured by the previous block.
        os.environ["DESTINATION__DUCKDB__CREDENTIALS"] = ":memory:"
        with pytest.raises(PipelineStepFailed) as exc:
            p = dlt.pipeline(
                pipeline_name="booboo",
                destination="duckdb",
            )
            p.run([1, 2, 3])

        assert isinstance(exc.value.exception, InvalidInMemoryDuckdbCredentials)

        # Check if :memory: passed through Destination.from_reference fails
        with pytest.raises(PipelineStepFailed) as exc:
            p = dlt.pipeline(
                pipeline_name="booboo",
                destination=Destination.from_reference("duckdb", credentials=":memory:"),  # type: ignore[arg-type]
            )
            p.run([1, 2, 3], table_name="numbers")

        assert isinstance(exc.value.exception, InvalidInMemoryDuckdbCredentials)
    finally:
        delete_quack_db()
96+
97+
5998
def test_duckdb_database_path() -> None:
6099
# resolve without any path provided
61100
c = resolve_configuration(
@@ -257,6 +296,7 @@ def test_external_duckdb_database() -> None:
257296
assert c.credentials._conn_owner is False
258297
assert hasattr(c.credentials, "_conn")
259298
conn.close()
299+
assert not os.path.exists(":memory:")
260300

261301

262302
def test_default_duckdb_dataset_name() -> None:

0 commit comments

Comments
 (0)