Skip to content

Commit fc718d5

Browse files
committed
adds tests cases for windows extended paths + docs
1 parent 0b806f2 commit fc718d5

File tree

6 files changed

+55
-15
lines changed

6 files changed

+55
-15
lines changed

dlt/destinations/impl/filesystem/filesystem.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,6 @@ def __init__(self, schema: Schema, config: FilesystemDestinationClientConfigurat
124124
super().__init__(schema, config)
125125
self.fs_client, fs_path = fsspec_from_config(config)
126126
self.is_local_filesystem = config.protocol == "file"
127-
#
128127
self.bucket_path = (
129128
config.make_local_path(config.bucket_url) if self.is_local_filesystem else fs_path
130129
)

docs/website/docs/dlt-ecosystem/destinations/filesystem.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,20 @@ bucket_url="file://localhost/c$/a/b/c"
213213
bucket_url="file:////localhost/c$/a/b/c"
214214
```
215215

216+
:::caution
217+
Windows supports paths up to 255 characters. When you access a path longer than 255 characters, you'll see a `FileNotFound` exception.
218+
219+
To go over this limit, you can use [extended paths](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry). `dlt` recognizes both regular and UNC extended paths.
220+
221+
```toml
222+
[destination.regular_extended]
223+
bucket_url = '\\?\C:\a\b\c'
224+
225+
[destination.unc_extended]
226+
bucket_url='\\?\UNC\localhost\c$\a\b\c'
227+
```
228+
:::
229+
216230
## Write disposition
217231
The filesystem destination handles the write dispositions as follows:
218232
- `append` - files belonging to such tables are added to the dataset folder

docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,18 @@ You can use both native local file system paths and in form of `file:` uri. Abso
151151
You can find relevant examples in [filesystem destination documentation](../destinations/filesystem.md#local-file-system) which follows
152152
the same rules to specify the `bucket_url`.
153153

154+
:::caution
155+
Windows supports paths up to 255 characters. When you access a path longer than 255 characters, you'll see a `FileNotFound` exception.
156+
157+
To go over this limit, you can use [extended paths](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry).
158+
**Note that Python glob does not work with extended UNC paths**, so you will not be able to use them.
159+
160+
```toml
161+
[sources.filesystem]
162+
bucket_url = '\\?\C:\a\b\c'
163+
```
164+
:::
165+
154166
## Run the pipeline
155167

156168
1. Before running the pipeline, ensure that you have installed all the necessary dependencies by

tests/common/storages/test_local_filesystem.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from tests.utils import skipifnotwindows, skipifwindows
1313

1414
UNC_LOCAL_PATH = r"\\localhost\c$\tests\common\test.csv"
15+
UNC_LOCAL_EXT_PATH = r"\\?\UNC\localhost\c$\tests\common\test.csv"
1516
UNC_WSL_PATH = r"\\wsl.localhost\Ubuntu-18.04\home\rudolfix\ .dlt"
1617

1718

@@ -20,9 +21,10 @@
2021
"bucket_url,file_url",
2122
(
2223
(UNC_LOCAL_PATH, pathlib.PureWindowsPath(UNC_LOCAL_PATH).as_uri()),
24+
(UNC_LOCAL_EXT_PATH, pathlib.PureWindowsPath(UNC_LOCAL_EXT_PATH).as_uri()),
2325
(UNC_WSL_PATH, pathlib.PureWindowsPath(UNC_WSL_PATH).as_uri()),
2426
(r"C:\hello", "file:///C:/hello"),
25-
# (r"\\?\C:\hello", "file:///C:/hello"),
27+
(r"\\?\C:\hello", "file://%3F/C%3A/hello"),
2628
(r"a\b $\b", "file:///" + pathlib.Path(r"a\\" + quote("b $") + r"\b").resolve().as_posix()),
2729
# same paths but with POSIX separators
2830
(
@@ -232,7 +234,9 @@ def test_filesystem_decompress() -> None:
232234

233235
# create windows UNC paths, on POSIX systems they are not used
234236
WIN_ABS_PATH = os.path.abspath(TEST_SAMPLE_FILES)
237+
WIN_ABS_EXT_PATH = "\\\\?\\" + os.path.abspath(TEST_SAMPLE_FILES)
235238
WIN_UNC_PATH = "\\\\localhost\\" + WIN_ABS_PATH.replace(":", "$").lower()
239+
WIN_UNC_EXT_PATH = "\\\\?\\UNC\\localhost\\" + WIN_ABS_PATH.replace(":", "$").lower()
236240

237241

238242
@skipifnotwindows
@@ -242,8 +246,13 @@ def test_filesystem_decompress() -> None:
242246
WIN_UNC_PATH,
243247
"file:///" + pathlib.Path(WIN_UNC_PATH).as_posix(),
244248
"file://localhost/" + pathlib.Path(WIN_ABS_PATH).as_posix().replace(":", "$"),
249+
# WIN_UNC_EXT_PATH,
250+
# "file:///" + pathlib.Path(WIN_UNC_EXT_PATH).as_posix(),
251+
# "file://localhost/" + pathlib.Path(WIN_UNC_EXT_PATH).as_posix().replace(":", "$"),
245252
WIN_ABS_PATH,
246-
"file:///" + pathlib.Path(WIN_ABS_PATH).as_posix(),
253+
WIN_ABS_EXT_PATH,
254+
pathlib.Path(WIN_ABS_PATH).as_uri(),
255+
pathlib.Path(WIN_ABS_EXT_PATH).as_uri(),
247256
# r"\\wsl.localhost\Ubuntu-18.04\home\rudolfix\src\dlt\tests\common\storages\samples"
248257
),
249258
)

tests/load/pipeline/test_filesystem_pipeline.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def test_pipeline_csv_filesystem_destination(item_type: TestDataItemFormat) -> N
8383
os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True"
8484
os.environ["RESTORE_FROM_DESTINATION"] = "False"
8585
# store locally
86-
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "file://_storage"
86+
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "_storage"
8787

8888
pipeline = dlt.pipeline(
8989
pipeline_name="parquet_test_" + uniq_id(),
@@ -110,7 +110,7 @@ def test_csv_options(item_type: TestDataItemFormat) -> None:
110110
os.environ["NORMALIZE__DATA_WRITER__DELIMITER"] = "|"
111111
os.environ["NORMALIZE__DATA_WRITER__INCLUDE_HEADER"] = "False"
112112
# store locally
113-
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "file://_storage"
113+
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "_storage"
114114
pipeline = dlt.pipeline(
115115
pipeline_name="parquet_test_" + uniq_id(),
116116
destination="filesystem",
@@ -139,7 +139,7 @@ def test_csv_quoting_style(item_type: TestDataItemFormat) -> None:
139139
os.environ["NORMALIZE__DATA_WRITER__QUOTING"] = "quote_all"
140140
os.environ["NORMALIZE__DATA_WRITER__INCLUDE_HEADER"] = "False"
141141
# store locally
142-
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "file://_storage"
142+
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "_storage"
143143
pipeline = dlt.pipeline(
144144
pipeline_name="parquet_test_" + uniq_id(),
145145
destination="filesystem",
@@ -170,7 +170,7 @@ def test_pipeline_parquet_filesystem_destination() -> None:
170170
import pyarrow.parquet as pq # Module is evaluated by other tests
171171

172172
# store locally
173-
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "file://_storage"
173+
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "_storage"
174174
pipeline = dlt.pipeline(
175175
pipeline_name="parquet_test_" + uniq_id(),
176176
destination="filesystem",
@@ -264,7 +264,7 @@ def count(*args, **kwargs) -> Any:
264264
"hiphip": counter("Hurraaaa"),
265265
}
266266
now = pendulum.now()
267-
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "file://_storage"
267+
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "_storage"
268268
os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE"
269269

270270
# the reason why we are patching pendulum.from_timestamp is that

tests/pipeline/test_pipeline.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2165,12 +2165,18 @@ def test_yielding_empty_list_creates_table() -> None:
21652165
assert rows[0] == (1, None)
21662166

21672167

2168-
@skipifnotwindows
2169-
def test_local_filesystem_destination() -> None:
2170-
# make it unc path
2171-
unc_path = "\\\\localhost\\" + os.path.abspath("_storage").replace(":", "$")
2172-
print(unc_path)
2168+
local_paths = [os.path.abspath("_storage"), "_storage"]
2169+
if os.name == "nt":
2170+
local_paths += [
2171+
# UNC extended path
2172+
"\\\\?\\UNC\\localhost\\" + os.path.abspath("_storage").replace(":", "$"),
2173+
# UNC path
2174+
"\\\\localhost\\" + os.path.abspath("_storage").replace(":", "$"),
2175+
]
2176+
21732177

2178+
@pytest.mark.parametrize("local_path", local_paths)
2179+
def test_local_filesystem_destination(local_path: str) -> None:
21742180
dataset_name = "mydata_" + uniq_id()
21752181

21762182
@dlt.resource
@@ -2180,7 +2186,7 @@ def stateful_resource():
21802186

21812187
pipeline = dlt.pipeline(
21822188
pipeline_name="local_files",
2183-
destination=dlt.destinations.filesystem(unc_path),
2189+
destination=dlt.destinations.filesystem(local_path),
21842190
dataset_name=dataset_name,
21852191
)
21862192
info = pipeline.run(stateful_resource(), table_name="numbers", write_disposition="replace")
@@ -2217,7 +2223,7 @@ def stateful_resource():
22172223
# all path formats we use must lead to "_storage" relative to tests
22182224
assert (
22192225
pathlib.Path(fs_client.dataset_path).resolve()
2220-
== pathlib.Path(unc_path).joinpath(dataset_name).resolve()
2226+
== pathlib.Path(local_path).joinpath(dataset_name).resolve()
22212227
)
22222228
# same for client
22232229
assert len(fs_client.list_table_files("numbers")) == 1

0 commit comments

Comments
 (0)