
Commit dfd4d89

fixes filesystem dest on windows (#1335)

* uses local path and pathlib to handle local filesystem in filesystem destination
* tests windows paths for filesystem destination
* uses datetime as load_package_timestamp
* splits remote path and remote uri in filesystem load job
* adds test cases for windows extended paths + docs

1 parent e48da74 · commit dfd4d89
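The core idea of the fix: instead of joining every path with `posixpath`, the destination now picks the path module by protocol, so local filesystem paths use the OS-native separator while bucket paths stay POSIX. A minimal sketch of that selection (simplified from the diffs below):

```python
import os
import posixpath

def pick_path_module(protocol: str):
    # local filesystem: OS-native separators (backslash on Windows);
    # bucket protocols (s3, gs, az, ...): always POSIX "/"
    return os.path if protocol == "file" else posixpath

print(pick_path_module("file").join("dataset", "table"))  # dataset\table on Windows
print(pick_path_module("s3").join("dataset", "table"))    # always dataset/table
```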

File tree

13 files changed: +224 -73 lines changed


dlt/common/storages/configuration.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -168,5 +168,4 @@ def make_file_uri(local_path: str) -> str:
     """
     p_ = pathlib.Path(local_path)
     p_ = p_.expanduser().resolve()
-    # return "file:///" + p_.as_posix().lstrip("/")
    return p_.as_uri()
```
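With the commented-out fallback removed, `make_file_uri` relies entirely on `pathlib.Path.as_uri()`, which already handles drive letters correctly. A quick illustration of what it produces:

```python
import pathlib

# as_uri() requires an absolute path, hence expanduser().resolve() first
p = pathlib.Path("~/data/file.jsonl").expanduser().resolve()
print(p.as_uri())
# POSIX:   file:///home/user/data/file.jsonl
# Windows: file:///C:/Users/user/data/file.jsonl
```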

dlt/destinations/fs_client.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -6,6 +6,11 @@
 class FSClientBase(ABC):
     fs_client: AbstractFileSystem
 
+    @property
+    @abstractmethod
+    def dataset_path(self) -> str:
+        pass
+
     @abstractmethod
     def get_table_dir(self, table_name: str) -> str:
         """returns directory for given table"""
```

dlt/destinations/impl/filesystem/filesystem.py

Lines changed: 68 additions & 42 deletions

```diff
@@ -1,4 +1,5 @@
 import posixpath
+import pathlib
 import os
 import base64
 from types import TracebackType
@@ -8,8 +9,6 @@
 from dlt.common import json, pendulum
 from dlt.common.typing import DictStrAny
 
-import re
-
 import dlt
 from dlt.common import logger, time
 from dlt.common.schema import Schema, TSchemaTables, TTableSchema
@@ -52,42 +51,47 @@ def __init__(
         file_name = FileStorage.get_file_name_from_file_path(local_path)
         self.config = config
         self.dataset_path = dataset_path
+        self.is_local_filesystem = config.protocol == "file"
+        # pick local filesystem pathlib or posix for buckets
+        self.pathlib = os.path if self.is_local_filesystem else posixpath
         self.destination_file_name = path_utils.create_path(
             config.layout,
             file_name,
             schema_name,
             load_id,
             current_datetime=config.current_datetime,
-            load_package_timestamp=dlt.current.load_package()["state"]["created_at"],  # type: ignore
+            load_package_timestamp=dlt.current.load_package()["state"]["created_at"],
             extra_placeholders=config.extra_placeholders,
         )
 
         super().__init__(file_name)
         fs_client, _ = fsspec_from_config(config)
-        self.destination_file_name = path_utils.create_path(
-            config.layout,
-            file_name,
-            schema_name,
-            load_id,
-            current_datetime=config.current_datetime,
-            load_package_timestamp=dlt.current.load_package()["state"]["created_at"],  # type: ignore
-            extra_placeholders=config.extra_placeholders,
-        )
-
         # We would like to avoid failing for local filesystem where
         # deeply nested directory will not exist before writing a file.
         # As `auto_mkdir` is disabled by default in fsspec, we made some
         # trade-offs between different options and decided on this.
         item = self.make_remote_path()
-        if self.config.protocol == "file":
-            fs_client.makedirs(posixpath.dirname(item), exist_ok=True)
+        if self.is_local_filesystem:
+            fs_client.makedirs(self.pathlib.dirname(item), exist_ok=True)
         fs_client.put_file(local_path, item)
 
     def make_remote_path(self) -> str:
-        return (
-            f"{self.config.protocol}://{posixpath.join(self.dataset_path, self.destination_file_name)}"
+        """Returns the path on the remote filesystem to which the file is copied, without scheme. For the local filesystem a native path is used."""
+        # path.join does not normalize separators and the available
+        # normalization functions are very invasive and may strip the trailing separator
+        return self.pathlib.join(  # type: ignore[no-any-return]
+            self.dataset_path,
+            path_utils.normalize_path_sep(self.pathlib, self.destination_file_name),
         )
 
+    def make_remote_uri(self) -> str:
+        """Returns the uri of the remote filesystem to which the file is copied"""
+        remote_path = self.make_remote_path()
+        if self.is_local_filesystem:
+            return self.config.make_file_uri(remote_path)
+        else:
+            return f"{self.config.protocol}://{remote_path}"
+
     def state(self) -> TLoadJobState:
         return "completed"
 
@@ -100,7 +104,7 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[NewLoadJob]:
         jobs = super().create_followup_jobs(final_state)
         if final_state == "completed":
             ref_job = NewReferenceJob(
-                file_name=self.file_name(), status="running", remote_path=self.make_remote_path()
+                file_name=self.file_name(), status="running", remote_path=self.make_remote_uri()
             )
             jobs.append(ref_job)
         return jobs
@@ -111,36 +115,49 @@ class FilesystemClient(FSClientBase, JobClientBase, WithStagingDataset, WithStat
 
     capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities()
     fs_client: AbstractFileSystem
-    fs_path: str
+    # a path (without the scheme) to a location in the bucket where the dataset is present
+    bucket_path: str
+    # name of the dataset
+    dataset_name: str
 
     def __init__(self, schema: Schema, config: FilesystemDestinationClientConfiguration) -> None:
         super().__init__(schema, config)
-        self.fs_client, self.fs_path = fsspec_from_config(config)
+        self.fs_client, fs_path = fsspec_from_config(config)
+        self.is_local_filesystem = config.protocol == "file"
+        self.bucket_path = (
+            config.make_local_path(config.bucket_url) if self.is_local_filesystem else fs_path
+        )
+        # pick local filesystem pathlib or posix for buckets
+        self.pathlib = os.path if self.is_local_filesystem else posixpath
+
         self.config: FilesystemDestinationClientConfiguration = config
         # verify files layout. we need {table_name} and only allow {schema_name} before it, otherwise tables
         # cannot be replaced and we cannot initialize folders consistently
         self.table_prefix_layout = path_utils.get_table_prefix_layout(config.layout)
-        self._dataset_path = self.config.normalize_dataset_name(self.schema)
+        self.dataset_name = self.config.normalize_dataset_name(self.schema)
 
     def drop_storage(self) -> None:
         if self.is_storage_initialized():
             self.fs_client.rm(self.dataset_path, recursive=True)
 
     @property
     def dataset_path(self) -> str:
-        return posixpath.join(self.fs_path, self._dataset_path)
+        """A path within a bucket to tables in a dataset
+        NOTE: dataset_name changes if with_staging_dataset is active
+        """
+        return self.pathlib.join(self.bucket_path, self.dataset_name)  # type: ignore[no-any-return]
 
     @contextmanager
     def with_staging_dataset(self) -> Iterator["FilesystemClient"]:
-        current_dataset_path = self._dataset_path
+        current_dataset_name = self.dataset_name
        try:
-            self._dataset_path = self.schema.naming.normalize_table_identifier(
-                current_dataset_path + "_staging"
+            self.dataset_name = self.schema.naming.normalize_table_identifier(
+                current_dataset_name + "_staging"
             )
             yield self
         finally:
             # restore previous dataset name
-            self._dataset_path = current_dataset_path
+            self.dataset_name = current_dataset_name
 
     def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:
         # clean up existing files for tables selected for truncating
@@ -152,7 +169,7 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:
 
         # we mark the storage folder as initialized
         self.fs_client.makedirs(self.dataset_path, exist_ok=True)
-        self.fs_client.touch(posixpath.join(self.dataset_path, INIT_FILE_NAME))
+        self.fs_client.touch(self.pathlib.join(self.dataset_path, INIT_FILE_NAME))
 
     def truncate_tables(self, table_names: List[str]) -> None:
         """Truncate table with given name"""
@@ -161,7 +178,7 @@ def truncate_tables(self, table_names: List[str]) -> None:
         for table_dir in table_dirs:
             for table_file in self.list_files_with_prefixes(table_dir, table_prefixes):
                 # NOTE: deleting in chunks on s3 does not raise on access denied, file non existing and probably other errors
-                # print(f"DEL {item}")
+                # print(f"DEL {table_file}")
                 try:
                     # NOTE: must use rm_file to get errors on delete
                     self.fs_client.rm_file(table_file)
@@ -188,7 +205,7 @@ def update_stored_schema(
             self.fs_client.makedirs(directory, exist_ok=True)
             # we need to mark the folders of the data tables as initialized
             if tables_name in self.schema.dlt_table_names():
-                self.fs_client.touch(posixpath.join(directory, INIT_FILE_NAME))
+                self.fs_client.touch(self.pathlib.join(directory, INIT_FILE_NAME))
 
         # don't store schema when used as staging
         if not self.config.as_staging:
@@ -199,17 +216,21 @@
     def get_table_dir(self, table_name: str) -> str:
         # dlt tables do not respect layout (for now)
         table_prefix = self.get_table_prefix(table_name)
-        return posixpath.dirname(table_prefix)
+        return self.pathlib.dirname(table_prefix)  # type: ignore[no-any-return]
 
     def get_table_prefix(self, table_name: str) -> str:
         # dlt tables do not respect layout (for now)
         if table_name.startswith(self.schema._dlt_tables_prefix):
-            table_prefix = posixpath.join(table_name, "")
+            # dlt tables get a layout where each table is a folder
+            # it is crucial to append and keep "/" at the end
+            table_prefix = self.pathlib.join(table_name, "")
         else:
             table_prefix = self.table_prefix_layout.format(
                 schema_name=self.schema.name, table_name=table_name
             )
-        return posixpath.join(self.dataset_path, table_prefix)
+        return self.pathlib.join(  # type: ignore[no-any-return]
+            self.dataset_path, path_utils.normalize_path_sep(self.pathlib, table_prefix)
+        )
 
     def get_table_dirs(self, table_names: Iterable[str]) -> List[str]:
         """Gets directories where table data is stored."""
@@ -227,15 +248,20 @@ def list_files_with_prefixes(self, table_dir: str, prefixes: List[str]) -> List[
         result = []
         for current_dir, _dirs, files in self.fs_client.walk(table_dir, detail=False, refresh=True):
             for file in files:
-                filename = posixpath.join(current_dir, file)
+                # skip INIT files
+                if file == INIT_FILE_NAME:
+                    continue
+                filepath = self.pathlib.join(
+                    path_utils.normalize_path_sep(self.pathlib, current_dir), file
+                )
                 for p in prefixes:
-                    if filename.startswith(p):
-                        result.append(posixpath.join(current_dir, file))
-                        continue
+                    if filepath.startswith(p):
+                        result.append(filepath)
+                        break
         return result
 
     def is_storage_initialized(self) -> bool:
-        return self.fs_client.exists(posixpath.join(self.dataset_path, INIT_FILE_NAME))  # type: ignore[no-any-return]
+        return self.fs_client.exists(self.pathlib.join(self.dataset_path, INIT_FILE_NAME))  # type: ignore[no-any-return]
 
     def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob:
         # skip the state table, we create a jsonl file in the complete_load step
@@ -272,7 +298,7 @@ def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool:
     #
 
     def _write_to_json_file(self, filepath: str, data: DictStrAny) -> None:
-        dirname = posixpath.dirname(filepath)
+        dirname = self.pathlib.dirname(filepath)
         if not self.fs_client.isdir(dirname):
             return
         self.fs_client.write_text(filepath, json.dumps(data), "utf-8")
@@ -283,7 +309,7 @@ def _to_path_safe_string(self, s: str) -> str:
 
     def _list_dlt_table_files(self, table_name: str) -> Iterator[Tuple[str, List[str]]]:
         dirname = self.get_table_dir(table_name)
-        if not self.fs_client.exists(posixpath.join(dirname, INIT_FILE_NAME)):
+        if not self.fs_client.exists(self.pathlib.join(dirname, INIT_FILE_NAME)):
             raise DestinationUndefinedEntity({"dir": dirname})
         for filepath in self.list_table_files(table_name):
             filename = os.path.splitext(os.path.basename(filepath))[0]
@@ -302,7 +328,7 @@ def _store_load(self, load_id: str) -> None:
             "inserted_at": pendulum.now().isoformat(),
             "schema_version_hash": self.schema.version_hash,
         }
-        filepath = posixpath.join(
+        filepath = self.pathlib.join(
             self.dataset_path,
             self.schema.loads_table_name,
             f"{self.schema.name}{FILENAME_SEPARATOR}{load_id}.jsonl",
@@ -320,7 +346,7 @@ def complete_load(self, load_id: str) -> None:
 
     def _get_state_file_name(self, pipeline_name: str, version_hash: str, load_id: str) -> str:
         """gets full path for the state file for a given hash"""
-        return posixpath.join(
+        return self.pathlib.join(  # type: ignore[no-any-return]
             self.get_table_dir(self.schema.state_table_name),
             f"{pipeline_name}{FILENAME_SEPARATOR}{load_id}{FILENAME_SEPARATOR}{self._to_path_safe_string(version_hash)}.jsonl",
         )
@@ -370,7 +396,7 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]:
 
     def _get_schema_file_name(self, version_hash: str, load_id: str) -> str:
         """gets full path for schema file for a given hash"""
 
-        return posixpath.join(
+        return self.pathlib.join(  # type: ignore[no-any-return]
             self.get_table_dir(self.schema.version_table_name),
             f"{self.schema.name}{FILENAME_SEPARATOR}{load_id}{FILENAME_SEPARATOR}{self._to_path_safe_string(version_hash)}.jsonl",
         )
```
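The key behavioral split above: `make_remote_path` returns a scheme-less path (native separators locally, POSIX for buckets), while `make_remote_uri` wraps it into a scheme-qualified uri only where a uri is actually needed (reference jobs). A hedged sketch of the logic, with plain functions standing in for the job and config objects:

```python
import os
import pathlib
import posixpath

def make_remote_path(protocol: str, dataset_path: str, file_name: str) -> str:
    """Scheme-less path: native separators for "file", POSIX for buckets."""
    pl = os.path if protocol == "file" else posixpath
    # normalize separators of the layout-generated name to the chosen module
    file_name = file_name.replace("\\", "/") if pl.sep == "/" else file_name.replace("/", "\\")
    return pl.join(dataset_path, file_name)

def make_remote_uri(protocol: str, dataset_path: str, file_name: str) -> str:
    """Scheme-qualified uri; local paths get a proper file: uri."""
    remote_path = make_remote_path(protocol, dataset_path, file_name)
    if protocol == "file":
        return pathlib.Path(remote_path).as_uri()  # e.g. file:///C:/data/...
    return f"{protocol}://{remote_path}"

print(make_remote_uri("s3", "bucket/dataset", "table/1234.parquet"))
# -> s3://bucket/dataset/table/1234.parquet
```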

dlt/destinations/path_utils.py

Lines changed: 11 additions & 2 deletions

```diff
@@ -75,6 +75,15 @@
 SUPPORTED_TABLE_NAME_PREFIX_PLACEHOLDERS = ("schema_name",)
 
 
+def normalize_path_sep(pathlib: Any, path: str) -> str:
+    """Normalizes the separators in `path` to the one used by `pathlib`"""
+    if pathlib.sep == "/":
+        return path.replace("\\", "/")
+    if pathlib.sep == "\\":
+        return path.replace("/", "\\")
+    return path
+
+
 def get_placeholders(layout: str) -> List[str]:
     return re.findall(r"\{(.*?)\}", layout)
 
@@ -89,7 +98,7 @@ def get_unused_placeholders(
 
 def prepare_datetime_params(
     current_datetime: Optional[pendulum.DateTime] = None,
-    load_package_timestamp: Optional[str] = None,
+    load_package_timestamp: Optional[pendulum.DateTime] = None,
 ) -> Dict[str, str]:
     params: Dict[str, str] = {}
     current_timestamp: pendulum.DateTime = None
@@ -205,7 +214,7 @@ def create_path(
     file_name: str,
     schema_name: str,
     load_id: str,
-    load_package_timestamp: Optional[str] = None,
+    load_package_timestamp: Optional[pendulum.DateTime] = None,
     current_datetime: Optional[TCurrentDateTime] = None,
     extra_placeholders: Optional[Dict[str, Any]] = None,
 ) -> str:
```
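`normalize_path_sep` is simple enough to exercise directly; `ntpath` (the `os.path` implementation on Windows) lets the Windows branch run on any platform:

```python
import ntpath
import posixpath

# copy of the helper from the diff above
def normalize_path_sep(pathlib, path: str) -> str:
    if pathlib.sep == "/":
        return path.replace("\\", "/")
    if pathlib.sep == "\\":
        return path.replace("/", "\\")
    return path

print(normalize_path_sep(posixpath, r"tables\2024\file.jsonl"))  # tables/2024/file.jsonl
print(normalize_path_sep(ntpath, "tables/2024/file.jsonl"))      # tables\2024\file.jsonl
```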

docs/website/docs/dlt-ecosystem/destinations/filesystem.md

Lines changed: 14 additions & 0 deletions

````diff
@@ -213,6 +213,20 @@ bucket_url="file://localhost/c$/a/b/c"
 bucket_url="file:////localhost/c$/a/b/c"
 ```
 
+:::caution
+Windows supports paths up to 255 characters. When you access a path longer than 255 characters, you'll see a `FileNotFound` exception.
+
+To go over this limit, you can use [extended paths](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry). `dlt` recognizes both regular and UNC extended paths.
+
+```toml
+[destination.regular_extended]
+bucket_url = '\\?\C:\a\b\c'
+
+[destination.unc_extended]
+bucket_url='\\?\UNC\localhost\c$\a\b\c'
+```
+:::
+
 ## Write disposition
 The filesystem destination handles the write dispositions as follows:
 - `append` - files belonging to such tables are added to the dataset folder
````
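To make the caution concrete, here is a sketch of how the `\\?\` prefix lifts the classic path-length limit (assumes Windows with long paths not enabled in the registry; the directory names are made up):

```python
import os

# a deliberately deep, made-up path well past the classic MAX_PATH limit
deep = "C:\\" + "\\".join(f"very_long_directory_name_{i:02d}" for i in range(12))

try:
    os.makedirs(deep, exist_ok=True)  # can raise FileNotFoundError past the limit
except FileNotFoundError:
    # the extended-path form bypasses MAX_PATH checks
    os.makedirs("\\\\?\\" + deep, exist_ok=True)
```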

docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md

Lines changed: 12 additions & 0 deletions

````diff
@@ -151,6 +151,18 @@ You can use both native local file system paths and in form of `file:` uri. Abso
 You can find relevant examples in [filesystem destination documentation](../destinations/filesystem.md#local-file-system) which follows
 the same rules to specify the `bucket_url`.
 
+:::caution
+Windows supports paths up to 255 characters. When you access a path longer than 255 characters, you'll see a `FileNotFound` exception.
+
+To go over this limit, you can use [extended paths](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry).
+**Note that Python glob does not work with extended UNC paths**, so you will not be able to use them.
+
+```toml
+[sources.filesystem]
+bucket_url = '\\?\C:\a\b\c'
+```
+:::
+
 ## Run the pipeline
 
 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by
````
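A plausible reason for the glob limitation (our assumption, not stated in the docs): `?` is itself a glob wildcard, so the literal `\\?\` prefix of an extended path already counts as a pattern. The internal helper `glob.has_magic` shows the clash:

```python
import glob

# "?" matches a single character in glob patterns, so the extended-path
# prefix itself is treated as "magic" rather than as a literal path
print(glob.has_magic(r"\\?\UNC\localhost\c$"))  # True
print(glob.has_magic(r"\\localhost\c$"))        # False: plain UNC path is literal
```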
