Skip to content

Commit 40c665c

Browse files
xyuzh, bveeramani and cursoragent
authored and committed
[Data] Add optional filesystem parameter to download expression (ray-project#60677)
## Summary - Add optional `filesystem` parameter to the `download()` expression in Ray Data - Allows users to provide custom PyArrow filesystems with custom authentication credentials - If not specified, the filesystem is auto-detected from the path scheme (existing behavior) ## Test plan - [x] Verify existing download tests still pass - [x] Test with custom S3FileSystem with explicit credentials <!-- BUGBOT_STATUS --><sup><a href="https://cursor.com/dashboard?tab=bugbot">Cursor Bugbot</a> reviewed your changes and found no issues for commit <u>cfb1db1</u></sup><!-- /BUGBOT_STATUS --> --------- Signed-off-by: xyuzh <xinyzng@gmail.com> Signed-off-by: Xinyu Zhang <60529799+xyuzh@users.noreply.github.com> Co-authored-by: Balaji Veeramani <bveeramani@berkeley.edu> Co-authored-by: Cursor <cursoragent@cursor.com> Signed-off-by: tiennguyentony <46289799+tiennguyentony@users.noreply.github.com>
1 parent ac9a081 commit 40c665c

File tree

5 files changed

+99
-21
lines changed

5 files changed

+99
-21
lines changed

python/ray/data/_internal/logical/operators/one_to_one_operator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from ray.data.block import BlockMetadata
99

1010
if TYPE_CHECKING:
11+
import pyarrow
1112

1213
from ray.data.block import Schema
1314

@@ -115,6 +116,7 @@ def __init__(
115116
uri_column_names: List[str],
116117
output_bytes_column_names: List[str],
117118
ray_remote_args: Optional[Dict[str, Any]] = None,
119+
filesystem: Optional["pyarrow.fs.FileSystem"] = None,
118120
):
119121
super().__init__("Download", input_op, can_modify_num_rows=False)
120122
if len(uri_column_names) != len(output_bytes_column_names):
@@ -125,3 +127,4 @@ def __init__(
125127
self.uri_column_names = uri_column_names
126128
self.output_bytes_column_names = output_bytes_column_names
127129
self.ray_remote_args = ray_remote_args or {}
130+
self.filesystem = filesystem

python/ray/data/_internal/planner/plan_download_op.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
import math
33
from concurrent.futures import ThreadPoolExecutor, as_completed
4-
from typing import Iterator, List
4+
from typing import Iterator, List, Optional
55
from urllib.parse import urlparse
66

77
import pyarrow as pa
@@ -48,6 +48,7 @@ def plan_download_op(
4848
uri_column_names_str = ", ".join(uri_column_names)
4949
output_bytes_column_names = op.output_bytes_column_names
5050
ray_remote_args = op.ray_remote_args
51+
filesystem = op.filesystem
5152

5253
# Import _get_udf from the main planner file
5354
from ray.data._internal.planner.plan_udf_map_op import (
@@ -70,7 +71,7 @@ def plan_download_op(
7071
PartitionActor,
7172
(),
7273
{},
73-
(uri_column_names, data_context),
74+
(uri_column_names, data_context, filesystem),
7475
{},
7576
compute=partition_compute,
7677
)
@@ -108,7 +109,7 @@ def plan_download_op(
108109

109110
fn, init_fn = _get_udf(
110111
download_bytes_threaded,
111-
(uri_column_names, output_bytes_column_names, data_context),
112+
(uri_column_names, output_bytes_column_names, data_context, filesystem),
112113
{},
113114
None,
114115
None,
@@ -167,10 +168,22 @@ def download_bytes_threaded(
167168
uri_column_names: List[str],
168169
output_bytes_column_names: List[str],
169170
data_context: DataContext,
171+
filesystem: Optional["pa.fs.FileSystem"] = None,
170172
) -> Iterator[pa.Table]:
171173
"""Optimized version that uses make_async_gen for concurrent downloads.
172174
173175
Supports downloading from multiple URI columns in a single operation.
176+
177+
Args:
178+
block: Input PyArrow table containing URI columns.
179+
uri_column_names: Names of columns containing URIs to download.
180+
output_bytes_column_names: Names for the output columns containing downloaded bytes.
181+
data_context: Ray Data context for configuration.
182+
filesystem: PyArrow filesystem to use for reading remote files.
183+
If None, the filesystem is auto-detected from the path scheme.
184+
185+
Yields:
186+
pa.Table: PyArrow table with the downloaded bytes added as new columns.
174187
"""
175188
if not isinstance(block, pa.Table):
176189
block = BlockAccessor.for_block(block).to_arrow()
@@ -192,8 +205,9 @@ def load_uri_bytes(uri_iterator):
192205
193206
Takes an iterator of URIs and yields bytes for each.
194207
Uses lazy filesystem resolution - resolves once and reuses for subsequent URIs.
208+
If a filesystem was provided explicitly, it will be used for all URIs.
195209
"""
196-
cached_fs = None
210+
cached_fs = filesystem
197211
for uri in uri_iterator:
198212
read_bytes = None
199213
try:
@@ -267,9 +281,15 @@ class PartitionActor:
267281

268282
INIT_SAMPLE_BATCH_SIZE = 25
269283

270-
def __init__(self, uri_column_names: List[str], data_context: DataContext):
284+
def __init__(
285+
self,
286+
uri_column_names: List[str],
287+
data_context: DataContext,
288+
filesystem: Optional["pa.fs.FileSystem"] = None,
289+
):
271290
self._uri_column_names = uri_column_names
272291
self._data_context = data_context
292+
self._filesystem = filesystem
273293
self._batch_size_estimate = None
274294

275295
def __call__(self, block: pa.Table) -> Iterator[pa.Table]:
@@ -345,7 +365,7 @@ def get_file_size(uri_path, fs):
345365
# Get the filesystem from the URIs (assumes all URIs use same filesystem for sampling)
346366
# This is for sampling the file sizes which doesn't require a full resolution of the paths.
347367
try:
348-
paths, fs = _resolve_paths_and_filesystem(uris)
368+
paths, fs = _resolve_paths_and_filesystem(uris, filesystem=self._filesystem)
349369
fs = RetryingPyFileSystem.wrap(
350370
fs, retryable_errors=self._data_context.retried_io_errors
351371
)

python/ray/data/dataset.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,7 @@ def with_column(
896896
uri_column_names=[expr.uri_column_name],
897897
output_bytes_column_names=[column_name],
898898
ray_remote_args=ray_remote_args,
899+
filesystem=expr.filesystem,
899900
)
900901
logical_plan = LogicalPlan(download_op, self.context)
901902
else:

python/ray/data/expressions.py

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -821,14 +821,39 @@ class _CallableClassSpec:
821821
cls: The original callable class type
822822
args: Positional arguments for the constructor
823823
kwargs: Keyword arguments for the constructor
824+
_cached_key: Pre-computed key that survives serialization
824825
"""
825826

826827
cls: type
827828
args: Tuple[Any, ...] = ()
828829
kwargs: Dict[str, Any] = field(default_factory=dict)
830+
_cached_key: Optional[Tuple] = field(default=None, compare=False, repr=False)
831+
832+
def __post_init__(self):
833+
"""Pre-compute and cache the key at construction time.
834+
835+
This ensures the same key survives serialization, since the cached
836+
key tuple (containing the already-computed repr strings) gets pickled
837+
and unpickled as-is.
838+
"""
839+
if self._cached_key is None:
840+
class_id = f"{self.cls.__module__}.{self.cls.__qualname__}"
841+
try:
842+
key = (
843+
class_id,
844+
self.args,
845+
tuple(sorted(self.kwargs.items())),
846+
)
847+
# Verify the key is actually hashable (args may contain lists)
848+
hash(key)
849+
except TypeError:
850+
# Fallback for unhashable args/kwargs - use repr for comparison
851+
key = (class_id, repr(self.args), repr(self.kwargs))
852+
# Use object.__setattr__ since dataclass is frozen
853+
object.__setattr__(self, "_cached_key", key)
829854

830855
def make_key(self) -> Tuple:
831-
"""Create a hashable key for UDF instance lookup.
856+
"""Return the pre-computed hashable key for UDF instance lookup.
832857
833858
The key uniquely identifies a UDF by its class and constructor arguments.
834859
This ensures that the same class with different constructor args
@@ -837,18 +862,7 @@ def make_key(self) -> Tuple:
837862
Returns:
838863
A hashable tuple that uniquely identifies this UDF configuration.
839864
"""
840-
try:
841-
key = (
842-
id(self.cls),
843-
self.args,
844-
tuple(sorted(self.kwargs.items())),
845-
)
846-
# Verify the key is actually hashable (args may contain lists)
847-
hash(key)
848-
return key
849-
except TypeError:
850-
# Fallback for unhashable args/kwargs - use repr for comparison
851-
return (id(self.cls), repr(self.args), repr(self.kwargs))
865+
return self._cached_key
852866

853867

854868
class _CallableClassUDF:
@@ -1304,6 +1318,7 @@ class DownloadExpr(Expr):
13041318
"""Expression that represents a download operation."""
13051319

13061320
uri_column_name: str
1321+
filesystem: "pyarrow.fs.FileSystem" = None
13071322
data_type: DataType = field(default_factory=lambda: DataType.binary(), init=False)
13081323

13091324
def structurally_equals(self, other: Any) -> bool:
@@ -1448,7 +1463,11 @@ def star() -> StarExpr:
14481463

14491464

14501465
@PublicAPI(stability="alpha")
1451-
def download(uri_column_name: str) -> DownloadExpr:
1466+
def download(
1467+
uri_column_name: str,
1468+
*,
1469+
filesystem: Optional["pyarrow.fs.FileSystem"] = None,
1470+
) -> DownloadExpr:
14521471
"""
14531472
Create a download expression that downloads content from URIs.
14541473
@@ -1458,6 +1477,8 @@ def download(uri_column_name: str) -> DownloadExpr:
14581477
14591478
Args:
14601479
uri_column_name: The name of the column containing URIs to download from
1480+
filesystem: PyArrow filesystem to use for reading remote files.
1481+
If None, the filesystem is auto-detected from the path scheme.
14611482
Returns:
14621483
A DownloadExpr that will download content from the specified URI column
14631484
@@ -1472,7 +1493,7 @@ def download(uri_column_name: str) -> DownloadExpr:
14721493
>>> # Add downloaded bytes column
14731494
>>> ds_with_bytes = ds.with_column("bytes", download("uri"))
14741495
"""
1475-
return DownloadExpr(uri_column_name=uri_column_name)
1496+
return DownloadExpr(uri_column_name=uri_column_name, filesystem=filesystem)
14761497

14771498

14781499
# ──────────────────────────────────────

python/ray/data/tests/test_download_expression.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,39 @@ def test_download_expression_with_pandas_blocks(self, tmp_path):
250250
finally:
251251
ctx.enable_pandas_block = old_enable_pandas_block
252252

253+
def test_download_expression_with_custom_filesystem(self, tmp_path):
254+
import pyarrow.fs as pafs
255+
256+
# 1. Setup paths
257+
subdir = tmp_path / "data"
258+
subdir.mkdir()
259+
260+
file_name = "test_file.txt"
261+
file_path = subdir / file_name
262+
sample_content = b"File content with custom fs"
263+
file_path.write_bytes(sample_content)
264+
265+
# 2. Setup SubTreeFileSystem
266+
# This treats 'subdir' as the root '/'
267+
base_fs = pafs.LocalFileSystem()
268+
custom_fs = pafs.SubTreeFileSystem(str(subdir), base_fs)
269+
270+
# 3. Create Dataset
271+
# Note: We use the relative 'file_name' because the FS is rooted at 'subdir'
272+
ds = ray.data.from_items([{"file_uri": file_name, "file_id": 0}])
273+
274+
# 4. Execute Download
275+
ds_with_downloads = ds.with_column(
276+
"content", download("file_uri", filesystem=custom_fs)
277+
)
278+
279+
# 5. Assertions
280+
results = ds_with_downloads.take_all()
281+
282+
assert len(results) == 1
283+
assert results[0]["content"] == sample_content
284+
assert results[0]["file_id"] == 0
285+
253286

254287
class TestDownloadExpressionErrors:
255288
"""Test error conditions and edge cases for download expressions."""

0 commit comments

Comments (0)