Commit 1ca6664

gz extension now visible in staged files
1 parent 3a23b66 commit 1ca6664

5 files changed: 38 additions & 13 deletions


dlt/common/data_writers/buffered.py

Lines changed: 7 additions & 3 deletions
@@ -228,9 +228,13 @@ def _buffer_items_with_row_count(self, item: TDataItems) -> int:
 
     def _rotate_file(self, allow_empty_file: bool = False) -> DataWriterMetrics:
         metrics = self._flush_and_close_file(allow_empty_file)
-        self._file_name = (
-            self.file_name_template % new_file_id() + "." + self.writer_spec.file_extension
-        )
+        base_filename = self.file_name_template % new_file_id()
+        file_extension = self.writer_spec.file_extension
+        # Add .gz if compression is enabled
+        if self.writer_spec.supports_compression and self.open == gzip.open:
+            self._file_name = f"{base_filename}.{file_extension}.gz"
+        else:
+            self._file_name = f"{base_filename}.{file_extension}"
         self._created = time.time()
         return metrics
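With this change, a writer that compresses its output (one whose open callable is gzip.open and whose spec supports compression) rotates to names ending in .gz. A minimal sketch of the naming rule, with an illustrative template and file id standing in for the writer's real state:

import gzip

# Illustrative stand-ins for BufferedDataWriter state (not dlt's real values)
file_name_template = "events.%s"
file_id = "1685726086.123"
file_extension = "jsonl"
supports_compression = True
open_fn = gzip.open  # the writer opens files via gzip.open when compressing

base_filename = file_name_template % file_id
if supports_compression and open_fn == gzip.open:
    file_name = f"{base_filename}.{file_extension}.gz"
else:
    file_name = f"{base_filename}.{file_extension}"

print(file_name)  # events.1685726086.123.jsonl.gz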

dlt/common/storages/load_package.py

Lines changed: 16 additions & 6 deletions
@@ -156,14 +156,18 @@ class ParsedLoadJobFileName(NamedTuple):
     file_id: str
     retry_count: int
     file_format: TJobFileFormat
+    is_compressed: bool = False
 
     def job_id(self) -> str:
         """Unique identifier of the job"""
         return f"{self.table_name}.{self.file_id}.{self.file_format}"
 
     def file_name(self) -> str:
         """A name of the file with the data to be loaded"""
-        return f"{self.table_name}.{self.file_id}.{int(self.retry_count)}.{self.file_format}"
+        base_name = f"{self.table_name}.{self.file_id}.{int(self.retry_count)}.{self.file_format}"
+        if self.is_compressed:
+            return f"{base_name}.gz"
+        return base_name
 
     def with_retry(self) -> "ParsedLoadJobFileName":
         """Returns a job with increased retry count"""
@@ -173,12 +177,18 @@ def with_retry(self) -> "ParsedLoadJobFileName":
     def parse(file_name: str) -> "ParsedLoadJobFileName":
         p = PurePath(file_name)
         parts = p.name.split(".")
-        if len(parts) != 4:
-            raise TerminalValueError(parts)
 
-        return ParsedLoadJobFileName(
-            parts[0], parts[1], int(parts[2]), cast(TJobFileFormat, parts[3])
-        )
+        if len(parts) == 4:
+            # Uncompressed
+            return ParsedLoadJobFileName(
+                parts[0], parts[1], int(parts[2]), cast(TJobFileFormat, parts[3]), False
+            )
+        elif len(parts) == 5 and parts[4] == "gz":
+            return ParsedLoadJobFileName(
+                parts[0], parts[1], int(parts[2]), cast(TJobFileFormat, parts[3]), True
+            )
+        else:
+            raise TerminalValueError(parts)
 
     @staticmethod
     def new_file_id() -> str:
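The parser now accepts both four-part and five-part names. A quick illustration of the new behavior, using a simplified stand-in for the NamedTuple (TJobFileFormat and TerminalValueError are dlt internals, replaced here with a plain str and ValueError):

from pathlib import PurePath
from typing import NamedTuple

class JobFileName(NamedTuple):  # simplified stand-in for ParsedLoadJobFileName
    table_name: str
    file_id: str
    retry_count: int
    file_format: str
    is_compressed: bool = False

def parse(file_name: str) -> JobFileName:
    parts = PurePath(file_name).name.split(".")
    if len(parts) == 4:
        return JobFileName(parts[0], parts[1], int(parts[2]), parts[3], False)
    elif len(parts) == 5 and parts[4] == "gz":
        return JobFileName(parts[0], parts[1], int(parts[2]), parts[3], True)
    raise ValueError(parts)

print(parse("events.abc123.0.jsonl"))     # is_compressed=False
print(parse("events.abc123.0.jsonl.gz"))  # is_compressed=True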

dlt/destinations/job_impl.py

Lines changed: 4 additions & 1 deletion
@@ -79,7 +79,10 @@ def job_id(self) -> str:
 
 class ReferenceFollowupJobRequest(FollowupJobRequestImpl):
     def __init__(self, original_file_name: str, remote_paths: List[str]) -> None:
-        file_name = os.path.splitext(original_file_name)[0] + "." + "reference"
+        # Parse the original filename to handle compressed files correctly
+        job_info = ParsedLoadJobFileName.parse(original_file_name)
+        # Reference files are not compressed, so we build the filename without .gz
+        file_name = f"{job_info.table_name}.{job_info.file_id}.{job_info.retry_count}.reference"
         self._remote_paths = remote_paths
         super().__init__(file_name)
         self._save_text_file("\n".join(remote_paths))
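The old os.path.splitext call strips only the last suffix, so a compressed input would have left the data format embedded in the reference name. Rebuilding from the parsed parts avoids that; the file names below are illustrative:

import os

original = "events.abc123.0.jsonl.gz"  # illustrative compressed job file name

# Old approach: splitext removes only ".gz", so ".jsonl" leaks into the name
old_name = os.path.splitext(original)[0] + "." + "reference"
print(old_name)  # events.abc123.0.jsonl.reference

# New approach: rebuild from the structured parts, dropping both format and .gz
table_name, file_id, retry_count = "events", "abc123", 0
new_name = f"{table_name}.{file_id}.{retry_count}.reference"
print(new_name)  # events.abc123.0.reference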

dlt/destinations/path_utils.py

Lines changed: 9 additions & 2 deletions
@@ -139,7 +139,11 @@ def prepare_params(
     if job_info:
         table_name = job_info.table_name
         file_id = job_info.file_id
-        ext = job_info.file_format
+        # For compressed files, ext should include .gz
+        if job_info.is_compressed:
+            ext = f"{job_info.file_format}.gz"
+        else:
+            ext = job_info.file_format
     params.update(
         {
             "table_name": table_name,
@@ -241,7 +245,10 @@ def create_path(
 
     # if extension is not defined, we append it at the end
     if "ext" not in placeholders:
-        path += f".{job_info.file_format}"
+        if job_info.is_compressed:
+            path += f".{job_info.file_format}.gz"
+        else:
+            path += f".{job_info.file_format}"
 
     return path
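Filesystem layouts substitute placeholders such as {table_name}, {file_id}, and {ext} into destination paths; after this change, ext resolves to "jsonl.gz" rather than "jsonl" for compressed jobs. A sketch of the substitution under these assumptions (the layout string and render helper are hypothetical, not dlt's API):

layout = "{table_name}/{file_id}.{ext}"  # hypothetical layout string

def render(layout: str, table_name: str, file_id: str,
           file_format: str, is_compressed: bool) -> str:
    # Mirrors the prepare_params change: ext carries .gz for compressed jobs
    ext = f"{file_format}.gz" if is_compressed else file_format
    return layout.format(table_name=table_name, file_id=file_id, ext=ext)

print(render(layout, "events", "abc123", "jsonl", True))   # events/abc123.jsonl.gz
print(render(layout, "events", "abc123", "jsonl", False))  # events/abc123.jsonl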

dlt/load/load.py

Lines changed: 2 additions & 1 deletion
@@ -121,7 +121,8 @@ def get_staging_destination_client(self, schema: Schema) -> JobClientBase:
         return self.staging_destination.client(schema, self.initial_staging_client_config)
 
     def is_staging_destination_job(self, file_path: str) -> bool:
-        file_type = os.path.splitext(file_path)[1][1:]
+        job_info = ParsedLoadJobFileName.parse(file_path)
+        file_type = job_info.file_format
         # for now we know that reference and model jobs always go do the main destination
         if file_type in ["reference", "model"]:
             return False
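For compressed names, os.path.splitext reports "gz" as the extension rather than the job's actual format, so the format check above would see the wrong value. Parsing the structured name recovers the true format; a small demonstration with an illustrative file name:

import os

file_path = "events.abc123.0.jsonl.gz"  # illustrative staged file name

# Old approach: the last suffix is "gz", not the job's format
print(os.path.splitext(file_path)[1][1:])  # gz

# New approach: the format is the fourth field of the structured name
parts = os.path.basename(file_path).split(".")
file_format = parts[3]  # table_name.file_id.retry_count.format[.gz]
print(file_format)  # jsonl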
