Commit 9f17a21

1272 Support ClickHouse GCS S3 compatibility mode in filesystem destination (#1423)
* Add HMAC credentials and update Clickhouse configuration
* Revert "Add HMAC credentials and update Clickhouse configuration". This reverts commit cb80c6b.
* Refactor error handling for storage authentication in Clickhouse
* Revert "Refactor error handling for storage authentication in Clickhouse". This reverts commit f24eb1d.
* Remove GCS ClickHouse buckets in CI until named destinations are supported
* Add GCS S3 compatibility test, remove GCP credentials from Clickhouse
* Refactor ClickHouse test code for better readability
* Refactor endpoint handling and update GCS bucket configuration
* Refactor test for clickhouse gcs_s3 compatibility
* Update ClickHouse docs and tests for S3-compatible staging
* Update ClickHouse documentation on staging areas

Signed-off-by: Marcel Coetzee <[email protected]>
1 parent 829b558 commit 9f17a21

File tree

5 files changed: +70 -62 lines changed

dlt/destinations/impl/clickhouse/clickhouse.py

Lines changed: 15 additions & 13 deletions
@@ -1,6 +1,7 @@
 import os
 import re
 from copy import deepcopy
+from textwrap import dedent
 from typing import ClassVar, Optional, Dict, List, Sequence, cast, Tuple
 from urllib.parse import urlparse
 
@@ -201,22 +202,23 @@ def __init__(
         compression = "none" if config.get("data_writer.disable_compression") else "gz"
 
         if bucket_scheme in ("s3", "gs", "gcs"):
-            # get auth and bucket url
-            bucket_http_url = convert_storage_to_http_scheme(bucket_url)
-            access_key_id: str = None
-            secret_access_key: str = None
             if isinstance(staging_credentials, AwsCredentialsWithoutDefaults):
+                bucket_http_url = convert_storage_to_http_scheme(
+                    bucket_url, endpoint=staging_credentials.endpoint_url
+                )
                 access_key_id = staging_credentials.aws_access_key_id
                 secret_access_key = staging_credentials.aws_secret_access_key
-            elif isinstance(staging_credentials, GcpCredentials):
-                access_key_id = client.credentials.gcp_access_key_id
-                secret_access_key = client.credentials.gcp_secret_access_key
-                if not access_key_id or not secret_access_key:
-                    raise DestinationTransientException(
-                        "You have tried loading from gcs with clickhouse. Please provide valid"
-                        " 'gcp_access_key_id' and 'gcp_secret_access_key' to connect to gcs as"
-                        " outlined in the dlthub docs."
-                    )
+            else:
+                raise LoadJobTerminalException(
+                    file_path,
+                    dedent(
+                        """
+                        Google Cloud Storage buckets must be configured using the S3 compatible access pattern.
+                        Please provide the necessary S3 credentials (access key ID and secret access key) to access the GCS bucket through the S3 API.
+                        Refer to https://dlthub.com/docs/dlt-ecosystem/destinations/filesystem#using-s3-compatible-storage.
+                        """,
+                    ).strip(),
+                )
 
         auth = "NOSIGN"
         if access_key_id and secret_access_key:
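In effect, for `s3`, `gs`, and `gcs` bucket schemes the destination now accepts only AWS-style (HMAC) staging credentials and fails fast on anything else, instead of falling back to GCP-native credentials. A minimal standalone sketch of that dispatch, using a simplified stand-in for dlt's `AwsCredentialsWithoutDefaults` (the helper `resolve_staging_auth` is hypothetical, not part of this diff):

```py
from dataclasses import dataclass
from typing import Tuple


@dataclass
class AwsCredentialsWithoutDefaults:
    # Simplified stand-in for dlt's credentials class of the same name.
    aws_access_key_id: str
    aws_secret_access_key: str
    endpoint_url: str = "https://storage.googleapis.com"


def resolve_staging_auth(creds: object) -> Tuple[str, str]:
    # Mirrors the hunk above: AWS-style HMAC credentials are the only
    # accepted path; any other credential type raises immediately.
    if isinstance(creds, AwsCredentialsWithoutDefaults):
        return creds.aws_access_key_id, creds.aws_secret_access_key
    raise ValueError(
        "Google Cloud Storage buckets must be configured using the S3"
        " compatible access pattern: provide an access key id and secret"
        " access key."
    )


# HMAC keys pass through; e.g. a GCP service-account credential would raise.
key, secret = resolve_staging_auth(
    AwsCredentialsWithoutDefaults("my-hmac-key-id", "my-hmac-secret")
)
```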

dlt/destinations/impl/clickhouse/utils.py

Lines changed: 1 addition & 2 deletions
@@ -25,11 +25,10 @@ def convert_storage_to_http_scheme(
     protocol = "https" if use_https else "http"
 
     if endpoint:
-        domain = endpoint
+        domain = endpoint.replace("https://", "").replace("http://", "")
     elif region and parsed_url.scheme == "s3":
         domain = f"s3-{region}.amazonaws.com"
     else:
-        # TODO: Incorporate dlt.config endpoint.
         storage_domains = {
             "s3": "s3.amazonaws.com",
             "gs": "storage.googleapis.com",

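A small illustration of the endpoint handling changed above: an endpoint configured with an explicit scheme is reduced to its bare domain, so that building `f"{protocol}://{domain}"` never doubles the scheme. The helper below is a standalone sketch of the same transformation, not dlt's actual function:

```py
def normalize_endpoint(endpoint: str) -> str:
    # Strip an explicit scheme; the protocol is chosen separately
    # by the use_https flag.
    return endpoint.replace("https://", "").replace("http://", "")


assert normalize_endpoint("https://storage.googleapis.com") == "storage.googleapis.com"
assert normalize_endpoint("storage.googleapis.com") == "storage.googleapis.com"
```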
docs/website/docs/dlt-ecosystem/destinations/clickhouse.md

Lines changed: 26 additions & 33 deletions
@@ -115,12 +115,14 @@ destination.
 
 The `clickhouse` destination has a few specific deviations from the default sql destinations:
 
-1. `Clickhouse` has an experimental `object` datatype, but we have found it to be a bit unpredictable, so the dlt clickhouse destination will load the complex datatype to a `text` column. If you need this feature, get in touch with our Slack community, and we will consider adding it.
+1. `Clickhouse` has an experimental `object` datatype, but we have found it to be a bit unpredictable, so the dlt clickhouse destination will load the complex datatype to a `text` column. If you need
+   this feature, get in touch with our Slack community, and we will consider adding it.
 2. `Clickhouse` does not support the `time` datatype. Time will be loaded to a `text` column.
 3. `Clickhouse` does not support the `binary` datatype. Binary will be loaded to a `text` column. When loading from `jsonl`, this will be a base64 string, when loading from parquet this will be
    the `binary` object converted to `text`.
 4. `Clickhouse` accepts adding columns to a populated table that are not null.
-5. `Clickhouse` can produce rounding errors under certain conditions when using the float / double datatype. Make sure to use decimal if you cannot afford to have rounding errors. Loading the value 12.7001 to a double column with the loader file format jsonl set will predictbly produce a rounding error for example.
+5. `Clickhouse` can produce rounding errors under certain conditions when using the float / double datatype. Make sure to use decimal if you cannot afford to have rounding errors. Loading the value
+   12.7001 to a double column with the loader file format jsonl set will predictably produce a rounding error, for example.
 
 ## Supported column hints
 
@@ -173,51 +175,42 @@ pipeline = dlt.pipeline(
 )
 ```
 
-### Using Google Cloud Storage as a Staging Area
+### Using Google Cloud or S3-Compatible Storage as a Staging Area
 
-dlt supports using Google Cloud Storage (GCS) as a staging area when loading data into ClickHouse. This is handled automatically by
-ClickHouse's [GCS table function](https://clickhouse.com/docs/en/sql-reference/table-functions/gcs) which dlt uses under the hood.
+dlt supports using S3-compatible storage services, including Google Cloud Storage (GCS), as a staging area when loading data into ClickHouse.
+This is handled automatically by
+ClickHouse's [GCS table function](https://clickhouse.com/docs/en/sql-reference/table-functions/gcs), which dlt uses under the hood.
 
-The clickhouse GCS table function only supports authentication using Hash-based Message Authentication Code (HMAC) keys. To enable this, GCS provides an S3 compatibility mode that emulates
-the Amazon S3
-API. ClickHouse takes advantage of this to allow accessing GCS buckets via its S3 integration.
+The ClickHouse GCS table function only supports authentication using Hash-based Message Authentication Code (HMAC) keys, which are compatible with the Amazon S3 API.
+To enable this, GCS provides an S3
+compatibility mode that emulates the S3 API, allowing ClickHouse to access GCS buckets via its S3 integration.
+
+For detailed instructions on setting up S3-compatible storage with dlt, including AWS S3, MinIO, and Cloudflare R2, refer to
+the [dlt documentation on filesystem destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/filesystem#using-s3-compatible-storage).
 
 To set up GCS staging with HMAC authentication in dlt:
 
 1. Create HMAC keys for your GCS service account by following the [Google Cloud guide](https://cloud.google.com/storage/docs/authentication/managing-hmackeys#create).
 
-2. Configure the HMAC keys as well as the `client_email`, `project_id` and `private_key` for your service account in your dlt project's ClickHouse destination settings in `config.toml`:
+2. Configure the HMAC keys (`aws_access_key_id` and `aws_secret_access_key`) in your dlt project's ClickHouse destination settings in `config.toml`, similar to how you would configure AWS S3
+   credentials:
 
 ```toml
 [destination.filesystem]
-bucket_url = "gs://dlt-ci"
+bucket_url = "s3://my_awesome_bucket"
 
 [destination.filesystem.credentials]
-project_id = "a-cool-project"
-client_email = "[email protected]"
-private_key = "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkaslkdjflasjnkdcopauihj...wEiEx7y+mx\nNffxQBqVVej2n/D93xY99pM=\n-----END PRIVATE KEY-----\n"
-
-[destination.clickhouse.credentials]
-database = "dlt"
-username = "dlt"
-password = "Dlt*12345789234567"
-host = "localhost"
-port = 9440
-secure = 1
-gcp_access_key_id = "JFJ$$*f2058024835jFffsadf"
-gcp_secret_access_key = "DFJdwslf2hf57)%$02jaflsedjfasoi"
+aws_access_key_id = "JFJ$$*f2058024835jFffsadf"
+aws_secret_access_key = "DFJdwslf2hf57)%$02jaflsedjfasoi"
+project_id = "my-awesome-project"
+endpoint_url = "https://storage.googleapis.com"
 ```
 
-Note: In addition to the HMAC keys (`gcp_access_key_id` and `gcp_secret_access_key`), you now need to provide the `client_email`, `project_id` and `private_key` for your service account
-under `[destination.filesystem.credentials]`.
-This is because the GCS staging support is now implemented as a temporary workaround and is still unoptimized.
-
-dlt will pass these credentials to ClickHouse which will handle the authentication and GCS access.
-
-There is active work in progress to simplify and improve the GCS staging setup for the ClickHouse dlt destination in the future. Proper GCS staging support is being tracked in these GitHub issues:
-
-- [Make filesystem destination work with gcs in s3 compatibility mode](https://github.com/dlt-hub/dlt/issues/1272)
-- [GCS staging area support](https://github.com/dlt-hub/dlt/issues/1181)
+:::caution
+When configuring the `bucket_url` for S3-compatible storage services like Google Cloud Storage (GCS) with ClickHouse in dlt, ensure that the URL is prepended with `s3://` instead of `gs://`. This is
+because the ClickHouse GCS table function requires the use of HMAC credentials, which are compatible with the S3 API. Prepending with `s3://` allows the HMAC credentials to integrate properly with
+dlt's staging mechanisms for ClickHouse.
+:::
 
 ### dbt support
 
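A minimal pipeline sketch tying the documented configuration together; the bucket and pipeline names are placeholders, and the HMAC keys and `endpoint_url` are assumed to be resolved from `config.toml`/`secrets.toml` as shown in the diff above:

```py
import dlt
from dlt.destinations import filesystem

# The GCS bucket is addressed through the S3 API, hence the s3:// scheme.
staging_bucket = filesystem("s3://my_awesome_bucket")

pipeline = dlt.pipeline(
    pipeline_name="clickhouse_gcs_staging",
    destination="clickhouse",
    staging=staging_bucket,
)
```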
tests/load/clickhouse/test_clickhouse_gcs_s3_compatibility.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+from typing import Generator, Dict
+
+import pytest
+
+import dlt
+from dlt.destinations import filesystem
+from tests.load.utils import GCS_BUCKET
+from tests.pipeline.utils import assert_load_info
+
+
+@pytest.mark.essential
+def test_clickhouse_gcs_s3_compatibility() -> None:
+    @dlt.resource
+    def dummy_data() -> Generator[Dict[str, int], None, None]:
+        yield {"field1": 1, "field2": 2}
+
+    gcp_bucket = filesystem(
+        GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp"
+    )
+
+    pipe = dlt.pipeline(
+        pipeline_name="gcs_s3_compatibility",
+        destination="clickhouse",
+        staging=gcp_bucket,
+        full_refresh=True,
+    )
+    pack = pipe.run([dummy_data])
+    assert_load_info(pack)
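To run this test locally, the S3-compatible staging credentials would presumably be supplied through dlt's standard configuration resolution, for example as environment variables whose double-underscore paths mirror the `[destination.filesystem.credentials]` TOML keys (all values below are placeholders):

```py
import os

# dlt maps TOML sections to environment variables via double underscores.
os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID"] = "<hmac-access-key-id>"
os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = "<hmac-secret-access-key>"
os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__ENDPOINT_URL"] = "https://storage.googleapis.com"
```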

tests/load/utils.py

Lines changed: 0 additions & 14 deletions
@@ -345,13 +345,6 @@ def destinations_configs(
             extra_info="az-authorization",
             disable_compression=True,
         ),
-        DestinationTestConfiguration(
-            destination="clickhouse",
-            staging="filesystem",
-            file_format="parquet",
-            bucket_url=GCS_BUCKET,
-            extra_info="gcs-authorization",
-        ),
         DestinationTestConfiguration(
             destination="clickhouse",
             staging="filesystem",
@@ -373,13 +366,6 @@ def destinations_configs(
             bucket_url=AZ_BUCKET,
             extra_info="az-authorization",
         ),
-        DestinationTestConfiguration(
-            destination="clickhouse",
-            staging="filesystem",
-            file_format="jsonl",
-            bucket_url=GCS_BUCKET,
-            extra_info="gcs-authorization",
-        ),
         DestinationTestConfiguration(
             destination="clickhouse",
             staging="filesystem",
