 import posixpath
+import pathlib
 import os
 import base64
 from types import TracebackType
 from dlt.common import json, pendulum
 from dlt.common.typing import DictStrAny

-import re
-
 import dlt
 from dlt.common import logger, time
 from dlt.common.schema import Schema, TSchemaTables, TTableSchema
@@ -52,42 +51,47 @@ def __init__(
         file_name = FileStorage.get_file_name_from_file_path(local_path)
         self.config = config
         self.dataset_path = dataset_path
+        self.is_local_filesystem = config.protocol == "file"
+        # pick local filesystem pathlib or posix for buckets
+        self.pathlib = os.path if self.is_local_filesystem else posixpath
         self.destination_file_name = path_utils.create_path(
             config.layout,
             file_name,
             schema_name,
             load_id,
             current_datetime=config.current_datetime,
-            load_package_timestamp=dlt.current.load_package()["state"]["created_at"],  # type: ignore
+            load_package_timestamp=dlt.current.load_package()["state"]["created_at"],
             extra_placeholders=config.extra_placeholders,
         )

         super().__init__(file_name)
         fs_client, _ = fsspec_from_config(config)
-        self.destination_file_name = path_utils.create_path(
-            config.layout,
-            file_name,
-            schema_name,
-            load_id,
-            current_datetime=config.current_datetime,
-            load_package_timestamp=dlt.current.load_package()["state"]["created_at"],  # type: ignore
-            extra_placeholders=config.extra_placeholders,
-        )
-
         # We would like to avoid failing for local filesystem where
         # a deeply nested directory will not exist before writing a file.
         # As `auto_mkdir` is disabled by default in fsspec, we made some
         # trade-offs between different options and decided on this.
         item = self.make_remote_path()
-        if self.config.protocol == "file":
-            fs_client.makedirs(posixpath.dirname(item), exist_ok=True)
+        if self.is_local_filesystem:
+            fs_client.makedirs(self.pathlib.dirname(item), exist_ok=True)
         fs_client.put_file(local_path, item)

     def make_remote_path(self) -> str:
-        return (
-            f"{self.config.protocol}://{posixpath.join(self.dataset_path, self.destination_file_name)}"
+        """Returns the path on the remote filesystem to which the file is copied, without the scheme. For the local filesystem a native path is used."""
+        # path.join does not normalize separators, and the available
+        # normalization functions are very invasive and may strip the trailing separator
+        return self.pathlib.join(  # type: ignore[no-any-return]
+            self.dataset_path,
+            path_utils.normalize_path_sep(self.pathlib, self.destination_file_name),
         )

+    def make_remote_uri(self) -> str:
+        """Returns the URI on the remote filesystem to which the file is copied"""
+        remote_path = self.make_remote_path()
+        if self.is_local_filesystem:
+            return self.config.make_file_uri(remote_path)
+        else:
+            return f"{self.config.protocol}://{remote_path}"
+
     def state(self) -> TLoadJobState:
         return "completed"

@@ -100,7 +104,7 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[NewLoadJob]:
         jobs = super().create_followup_jobs(final_state)
         if final_state == "completed":
             ref_job = NewReferenceJob(
-                file_name=self.file_name(), status="running", remote_path=self.make_remote_path()
+                file_name=self.file_name(), status="running", remote_path=self.make_remote_uri()
             )
             jobs.append(ref_job)
         return jobs
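
The hunk above is the heart of the change: the load job now picks `os.path` for the local filesystem and `posixpath` for buckets, and splits the remote *path* (native separators, no scheme) from the remote *URI* (scheme-prefixed, handed to reference jobs). A minimal, self-contained sketch of the module-selection idea; `join_for` and `pathmod` are illustrative names, not dlt API:

```python
import os.path
import posixpath

def join_for(protocol: str, *parts: str) -> str:
    # local filesystem joins with the OS-native separator;
    # bucket protocols (s3, gs, az, ...) always join with "/"
    pathmod = os.path if protocol == "file" else posixpath
    return pathmod.join(*parts)

# On Windows the first call yields "datasets\\events\\file.jsonl",
# while the second always yields "datasets/events/file.jsonl".
print(join_for("file", "datasets", "events", "file.jsonl"))
print(join_for("s3", "datasets", "events", "file.jsonl"))
```
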
@@ -111,36 +115,49 @@ class FilesystemClient(FSClientBase, JobClientBase, WithStagingDataset, WithStat

     capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities()
     fs_client: AbstractFileSystem
-    fs_path: str
+    # a path (without the scheme) to a location in the bucket where the dataset is present
+    bucket_path: str
+    # name of the dataset
+    dataset_name: str

     def __init__(self, schema: Schema, config: FilesystemDestinationClientConfiguration) -> None:
         super().__init__(schema, config)
-        self.fs_client, self.fs_path = fsspec_from_config(config)
+        self.fs_client, fs_path = fsspec_from_config(config)
+        self.is_local_filesystem = config.protocol == "file"
+        self.bucket_path = (
+            config.make_local_path(config.bucket_url) if self.is_local_filesystem else fs_path
+        )
+        # pick local filesystem pathlib or posix for buckets
+        self.pathlib = os.path if self.is_local_filesystem else posixpath
+
         self.config: FilesystemDestinationClientConfiguration = config
         # verify files layout. we need {table_name} and only allow {schema_name} before it, otherwise tables
         # cannot be replaced and we cannot initialize folders consistently
         self.table_prefix_layout = path_utils.get_table_prefix_layout(config.layout)
-        self._dataset_path = self.config.normalize_dataset_name(self.schema)
+        self.dataset_name = self.config.normalize_dataset_name(self.schema)

     def drop_storage(self) -> None:
         if self.is_storage_initialized():
             self.fs_client.rm(self.dataset_path, recursive=True)

     @property
     def dataset_path(self) -> str:
-        return posixpath.join(self.fs_path, self._dataset_path)
+        """A path within the bucket to the tables in a dataset
+        NOTE: dataset_name changes if with_staging_dataset is active
+        """
+        return self.pathlib.join(self.bucket_path, self.dataset_name)  # type: ignore[no-any-return]

     @contextmanager
     def with_staging_dataset(self) -> Iterator["FilesystemClient"]:
-        current_dataset_path = self._dataset_path
+        current_dataset_name = self.dataset_name
         try:
-            self._dataset_path = self.schema.naming.normalize_table_identifier(
-                current_dataset_path + "_staging"
+            self.dataset_name = self.schema.naming.normalize_table_identifier(
+                current_dataset_name + "_staging"
             )
             yield self
         finally:
             # restore previous dataset name
-            self._dataset_path = current_dataset_path
+            self.dataset_name = current_dataset_name

     def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:
         # clean up existing files for tables selected for truncating
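
Note how `with_staging_dataset` above now swaps only `dataset_name` (not the full path) and restores it in `finally`, so the staging dataset resolves to a sibling directory of the real one. A hedged sketch of the same save/mutate/restore pattern, using a hypothetical `Client` class rather than dlt's:

```python
from contextlib import contextmanager
from typing import Iterator

class Client:
    def __init__(self, dataset_name: str) -> None:
        self.dataset_name = dataset_name

    @contextmanager
    def with_staging_dataset(self) -> Iterator["Client"]:
        current = self.dataset_name
        try:
            # temporarily point the client at the staging dataset
            self.dataset_name = current + "_staging"
            yield self
        finally:
            # always restore, even if the block raises
            self.dataset_name = current

client = Client("events")
with client.with_staging_dataset() as staging:
    assert staging.dataset_name == "events_staging"
assert client.dataset_name == "events"
```
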
@@ -152,7 +169,7 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:

         # we mark the storage folder as initialized
         self.fs_client.makedirs(self.dataset_path, exist_ok=True)
-        self.fs_client.touch(posixpath.join(self.dataset_path, INIT_FILE_NAME))
+        self.fs_client.touch(self.pathlib.join(self.dataset_path, INIT_FILE_NAME))

     def truncate_tables(self, table_names: List[str]) -> None:
         """Truncate tables with the given names"""
@@ -161,7 +178,7 @@ def truncate_tables(self, table_names: List[str]) -> None:
         for table_dir in table_dirs:
             for table_file in self.list_files_with_prefixes(table_dir, table_prefixes):
                 # NOTE: deleting in chunks on s3 does not raise on access denied, non-existing files and probably other errors
-                # print(f"DEL {item}")
+                # print(f"DEL {table_file}")
                 try:
                     # NOTE: must use rm_file to get errors on delete
                     self.fs_client.rm_file(table_file)
@@ -188,7 +205,7 @@ def update_stored_schema(
             self.fs_client.makedirs(directory, exist_ok=True)
             # we need to mark the folders of the data tables as initialized
             if tables_name in self.schema.dlt_table_names():
-                self.fs_client.touch(posixpath.join(directory, INIT_FILE_NAME))
+                self.fs_client.touch(self.pathlib.join(directory, INIT_FILE_NAME))

         # don't store schema when used as staging
         if not self.config.as_staging:
@@ -199,17 +216,21 @@ def update_stored_schema(
     def get_table_dir(self, table_name: str) -> str:
         # dlt tables do not respect layout (for now)
         table_prefix = self.get_table_prefix(table_name)
-        return posixpath.dirname(table_prefix)
+        return self.pathlib.dirname(table_prefix)  # type: ignore[no-any-return]

     def get_table_prefix(self, table_name: str) -> str:
         # dlt tables do not respect layout (for now)
         if table_name.startswith(self.schema._dlt_tables_prefix):
-            table_prefix = posixpath.join(table_name, "")
+            # dlt tables get a layout where each table is a folder
+            # it is crucial to append and keep "/" at the end
+            table_prefix = self.pathlib.join(table_name, "")
         else:
             table_prefix = self.table_prefix_layout.format(
                 schema_name=self.schema.name, table_name=table_name
             )
-        return posixpath.join(self.dataset_path, table_prefix)
+        return self.pathlib.join(  # type: ignore[no-any-return]
+            self.dataset_path, path_utils.normalize_path_sep(self.pathlib, table_prefix)
+        )

     def get_table_dirs(self, table_names: Iterable[str]) -> List[str]:
         """Gets directories where table data is stored."""
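
The comment added in `get_table_prefix` stresses keeping the trailing separator: `join(table_name, "")` appends exactly one separator and nothing else, which anchors later `startswith` checks at a folder boundary. An illustrative check with made-up table names:

```python
import posixpath

prefix = posixpath.join("_dlt_loads", "")  # "_dlt_loads/"
assert "_dlt_loads/1700000000.abc.jsonl".startswith(prefix)
# without the trailing "/", this similarly named table would match too
assert not "_dlt_loads_tmp/part-0.jsonl".startswith(prefix)
```
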
@@ -227,15 +248,20 @@ def list_files_with_prefixes(self, table_dir: str, prefixes: List[str]) -> List[
         result = []
         for current_dir, _dirs, files in self.fs_client.walk(table_dir, detail=False, refresh=True):
             for file in files:
-                filename = posixpath.join(current_dir, file)
+                # skip INIT files
+                if file == INIT_FILE_NAME:
+                    continue
+                filepath = self.pathlib.join(
+                    path_utils.normalize_path_sep(self.pathlib, current_dir), file
+                )
                 for p in prefixes:
-                    if filename.startswith(p):
-                        result.append(posixpath.join(current_dir, file))
-                        continue
+                    if filepath.startswith(p):
+                        result.append(filepath)
+                        break
         return result

     def is_storage_initialized(self) -> bool:
-        return self.fs_client.exists(posixpath.join(self.dataset_path, INIT_FILE_NAME))  # type: ignore[no-any-return]
+        return self.fs_client.exists(self.pathlib.join(self.dataset_path, INIT_FILE_NAME))  # type: ignore[no-any-return]

     def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob:
         # skip the state table, we create a jsonl file in the complete_load step
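
Besides skipping `INIT_FILE_NAME`, the `list_files_with_prefixes` hunk fixes a subtle bug in the prefix loop: the old `continue` kept scanning prefixes after a match, so a file matching several prefixes was appended once per match; `break` appends it exactly once. A standalone illustration with made-up values:

```python
prefixes = ["tbl/", "tbl/2024"]
filepath = "tbl/2024/part-0.jsonl"

result = []
for p in prefixes:
    if filepath.startswith(p):
        result.append(filepath)
        break  # the old `continue` would append again on the next matching prefix
assert result == [filepath]
```
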
@@ -272,7 +298,7 @@ def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool:
     #

     def _write_to_json_file(self, filepath: str, data: DictStrAny) -> None:
-        dirname = posixpath.dirname(filepath)
+        dirname = self.pathlib.dirname(filepath)
         if not self.fs_client.isdir(dirname):
             return
         self.fs_client.write_text(filepath, json.dumps(data), "utf-8")
@@ -283,7 +309,7 @@ def _to_path_safe_string(self, s: str) -> str:

     def _list_dlt_table_files(self, table_name: str) -> Iterator[Tuple[str, List[str]]]:
         dirname = self.get_table_dir(table_name)
-        if not self.fs_client.exists(posixpath.join(dirname, INIT_FILE_NAME)):
+        if not self.fs_client.exists(self.pathlib.join(dirname, INIT_FILE_NAME)):
             raise DestinationUndefinedEntity({"dir": dirname})
         for filepath in self.list_table_files(table_name):
             filename = os.path.splitext(os.path.basename(filepath))[0]
@@ -302,7 +328,7 @@ def _store_load(self, load_id: str) -> None:
             "inserted_at": pendulum.now().isoformat(),
             "schema_version_hash": self.schema.version_hash,
         }
-        filepath = posixpath.join(
+        filepath = self.pathlib.join(
             self.dataset_path,
             self.schema.loads_table_name,
             f"{self.schema.name}{FILENAME_SEPARATOR}{load_id}.jsonl",
@@ -320,7 +346,7 @@ def complete_load(self, load_id: str) -> None:

     def _get_state_file_name(self, pipeline_name: str, version_hash: str, load_id: str) -> str:
         """gets full path for state file for a given hash"""
-        return posixpath.join(
+        return self.pathlib.join(  # type: ignore[no-any-return]
             self.get_table_dir(self.schema.state_table_name),
             f"{pipeline_name}{FILENAME_SEPARATOR}{load_id}{FILENAME_SEPARATOR}{self._to_path_safe_string(version_hash)}.jsonl",
         )
@@ -370,7 +396,7 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]:
     def _get_schema_file_name(self, version_hash: str, load_id: str) -> str:
         """gets full path for schema file for a given hash"""

-        return posixpath.join(
+        return self.pathlib.join(  # type: ignore[no-any-return]
             self.get_table_dir(self.schema.version_table_name),
             f"{self.schema.name}{FILENAME_SEPARATOR}{load_id}{FILENAME_SEPARATOR}{self._to_path_safe_string(version_hash)}.jsonl",
         )
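
For local paths, building the URI in `make_remote_uri` is delegated to `config.make_file_uri`, since a correct `file://` URI on Windows is more than string concatenation. A rough stand-in using `pathlib.Path.as_uri`, which handles drive letters and UNC paths (dlt's actual helper may behave differently):

```python
import pathlib

def make_remote_uri(protocol: str, remote_path: str) -> str:
    if protocol == "file":
        # as_uri requires an absolute path, hence resolve()
        return pathlib.Path(remote_path).resolve().as_uri()
    return f"{protocol}://{remote_path}"

print(make_remote_uri("s3", "bucket/datasets/events"))      # s3://bucket/datasets/events
print(make_remote_uri("file", "/var/data/datasets/events")) # file:///var/data/datasets/events
```
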