[serve][llm] Disable model downloading for RunAI streamer, introduce optimized download function (ray-project#57854)

hao-aaron · YoussefEssDS · commit 7e48795ee596 · 2025-11-07T19:03:04.000-05:00
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
diff --git a/python/ray/llm/_internal/common/callbacks/cloud_downloader.py b/python/ray/llm/_internal/common/callbacks/cloud_downloader.py
@@ -1,4 +1,5 @@
 import logging
+import time
 from typing import Any, List, Tuple
 
 from pydantic import BaseModel, field_validator
@@ -76,20 +77,12 @@ def on_before_download_model_files_distributed(self) -> None:
         from ray.llm._internal.common.utils.cloud_utils import CloudFileSystem
 
         paths = self.kwargs["paths"]
+        start_time = time.monotonic()
+        for cloud_uri, local_path in paths:
+            CloudFileSystem.download_files_parallel(
+                path=local_path, bucket_uri=cloud_uri
+            )
+        end_time = time.monotonic()
         logger.info(
-            f"CloudDownloader: Starting download of {len(paths)} files from cloud storage"
+            f"CloudDownloader: Files downloaded in {end_time - start_time} seconds"
         )
-
-        for cloud_uri, local_path in paths:
-            try:
-                logger.info(f"CloudDownloader: Downloading {cloud_uri} to {local_path}")
-                CloudFileSystem.download_files(path=local_path, bucket_uri=cloud_uri)
-                logger.info(
-                    f"CloudDownloader: Successfully downloaded {cloud_uri} to {local_path}"
-                )
-            except Exception as e:
-                logger.error(
-                    f"CloudDownloader: Failed to download {cloud_uri} to {local_path}: {e}"
-                )
-                if self.raise_error_on_callback:
-                    raise
diff --git a/python/ray/llm/_internal/common/utils/cloud_utils.py b/python/ray/llm/_internal/common/utils/cloud_utils.py
@@ -2,6 +2,7 @@
 import inspect
 import os
 import time
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import (
     Any,
@@ -345,6 +346,53 @@ def list_subfolders(folder_uri: str) -> List[str]:
             logger.info(f"Error listing subfolders in {folder_uri}: {e}")
             return []
 
+    @staticmethod
+    def _filter_files(
+        fs: pa_fs.FileSystem,
+        source_path: str,
+        destination_path: str,
+        substrings_to_include: Optional[List[str]] = None,
+        suffixes_to_exclude: Optional[List[str]] = None,
+    ) -> List[Tuple[str, str]]:
+        """Filter files from cloud storage based on inclusion and exclusion criteria.
+
+        Args:
+            fs: PyArrow filesystem instance
+            source_path: Source path in cloud storage
+            destination_path: Local destination path
+            substrings_to_include: Only include files containing these substrings
+            suffixes_to_exclude: Exclude files ending with these suffixes
+
+        Returns:
+            List of tuples containing (source_file_path, destination_file_path)
+        """
+        file_selector = pa_fs.FileSelector(source_path, recursive=True)
+        file_infos = fs.get_file_info(file_selector)
+
+        path_pairs = []
+        for file_info in file_infos:
+            if file_info.type != pa_fs.FileType.File:
+                continue
+
+            rel_path = file_info.path[len(source_path) :].lstrip("/")
+
+            # Apply filters
+            if substrings_to_include:
+                if not any(
+                    substring in rel_path for substring in substrings_to_include
+                ):
+                    continue
+
+            if suffixes_to_exclude:
+                if any(rel_path.endswith(suffix) for suffix in suffixes_to_exclude):
+                    continue
+
+            path_pairs.append(
+                (file_info.path, os.path.join(destination_path, rel_path))
+            )
+
+        return path_pairs
+
     @staticmethod
     def download_files(
         path: str,
@@ -366,40 +414,104 @@ def download_files(
             # Ensure the destination directory exists
             os.makedirs(path, exist_ok=True)
 
-            # List all files in the bucket
-            file_selector = pa_fs.FileSelector(source_path, recursive=True)
-            file_infos = fs.get_file_info(file_selector)
+            # Get filtered files to download
+            files_to_download = CloudFileSystem._filter_files(
+                fs, source_path, path, substrings_to_include, suffixes_to_exclude
+            )
 
             # Download each file
-            for file_info in file_infos:
-                if file_info.type != pa_fs.FileType.File:
-                    continue
+            for source_file_path, dest_file_path in files_to_download:
+                # Create destination directory if needed
+                dest_dir = os.path.dirname(dest_file_path)
+                if dest_dir:
+                    os.makedirs(dest_dir, exist_ok=True)
+
+                # Download the file
+                with fs.open_input_file(source_file_path) as source_file:
+                    with open(dest_file_path, "wb") as dest_file:
+                        dest_file.write(source_file.read())
 
-                # Get relative path from source prefix
-                rel_path = file_info.path[len(source_path) :].lstrip("/")
+        except Exception as e:
+            logger.exception(f"Error downloading files from {bucket_uri}: {e}")
+            raise
 
-                # Check if file matches substring filters
-                if substrings_to_include:
-                    if not any(
-                        substring in rel_path for substring in substrings_to_include
-                    ):
-                        continue
+    @staticmethod
+    def download_files_parallel(
+        path: str,
+        bucket_uri: str,
+        substrings_to_include: Optional[List[str]] = None,
+        suffixes_to_exclude: Optional[List[str]] = None,
+        max_concurrency: int = 10,
+        chunk_size: int = 64 * 1024 * 1024,
+    ) -> None:
+        """Multi-threaded download of files from cloud storage.
 
-                # Check if file matches suffixes to exclude filter
-                if suffixes_to_exclude:
-                    if any(rel_path.endswith(suffix) for suffix in suffixes_to_exclude):
-                        continue
+        Args:
+            path: Local directory where files will be downloaded
+            bucket_uri: URI of cloud directory
+            substrings_to_include: Only include files containing these substrings
+            suffixes_to_exclude: Exclude certain files from download
+            max_concurrency: Maximum number of concurrent files to download (default: 10)
+            chunk_size: Size of transfer chunks (default: 64MB)
+        """
+        try:
+            fs, source_path = CloudFileSystem.get_fs_and_path(bucket_uri)
+
+            # Ensure destination exists
+            os.makedirs(path, exist_ok=True)
+
+            # If no filters, use direct copy_files
+            if not substrings_to_include and not suffixes_to_exclude:
+                pa_fs.copy_files(
+                    source=source_path,
+                    destination=path,
+                    source_filesystem=fs,
+                    destination_filesystem=pa_fs.LocalFileSystem(),
+                    use_threads=True,
+                    chunk_size=chunk_size,
+                )
+                return
 
+            # List and filter files
+            files_to_download = CloudFileSystem._filter_files(
+                fs, source_path, path, substrings_to_include, suffixes_to_exclude
+            )
+
+            if not files_to_download:
+                logger.info("Filters do not match any of the files, skipping download")
+                return
+
+            def download_single_file(file_paths):
+                source_file_path, dest_file_path = file_paths
                 # Create destination directory if needed
-                if "/" in rel_path:
-                    dest_dir = os.path.join(path, os.path.dirname(rel_path))
+                dest_dir = os.path.dirname(dest_file_path)
+                if dest_dir:
                     os.makedirs(dest_dir, exist_ok=True)
 
-                # Download the file
-                dest_path = os.path.join(path, rel_path)
-                with fs.open_input_file(file_info.path) as source_file:
-                    with open(dest_path, "wb") as dest_file:
-                        dest_file.write(source_file.read())
+                # Use PyArrow's copy_files to transfer this single file.
+                pa_fs.copy_files(
+                    source=source_file_path,
+                    destination=dest_file_path,
+                    source_filesystem=fs,
+                    destination_filesystem=pa_fs.LocalFileSystem(),
+                    use_threads=True,
+                    chunk_size=chunk_size,
+                )
+                return dest_file_path
+
+            max_workers = min(max_concurrency, len(files_to_download))
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                futures = [
+                    executor.submit(download_single_file, file_paths)
+                    for file_paths in files_to_download
+                ]
+
+                for future in futures:
+                    try:
+                        future.result()
+                    except Exception as e:
+                        logger.error(f"Failed to download file: {e}")
+                        raise
 
         except Exception as e:
             logger.exception(f"Error downloading files from {bucket_uri}: {e}")
@@ -464,11 +576,12 @@ def download_model(
 
             safetensors_to_exclude = [".safetensors"] if exclude_safetensors else None
 
-            CloudFileSystem.download_files(
+            CloudFileSystem.download_files_parallel(
                 path=destination_dir,
                 bucket_uri=bucket_uri,
                 substrings_to_include=tokenizer_file_substrings,
                 suffixes_to_exclude=safetensors_to_exclude,
+                chunk_size=64 * 1024 * 1024,  # 64MB chunks for large model files
             )
 
         except Exception as e:
diff --git a/python/ray/llm/_internal/common/utils/download_utils.py b/python/ray/llm/_internal/common/utils/download_utils.py
@@ -19,7 +19,7 @@
 
 logger = get_logger(__name__)
 
-STREAMING_LOAD_FORMATS = ["runai_streamer", "tensorizer"]
+STREAMING_LOAD_FORMATS = ["runai_streamer", "runai_streamer_sharded", "tensorizer"]
 
 
 class NodeModelDownloadable(enum.Enum):
@@ -267,7 +267,7 @@ def download_model_files(
     # cannot be created by torch if the parent directory doesn't exist.
     torch_cache_home = torch.hub._get_torch_home()
     os.makedirs(os.path.join(torch_cache_home, "kernels"), exist_ok=True)
-    model_path_or_id = None
+    model_path_or_id = model_id
 
     if callback is not None:
         callback.run_callback_sync("on_before_download_model_files_distributed")
diff --git a/python/ray/llm/_internal/serve/core/configs/llm_config.py b/python/ray/llm/_internal/serve/core/configs/llm_config.py
@@ -26,7 +26,10 @@
     CloudMirrorConfig,
     is_remote_path,
 )
-from ray.llm._internal.common.utils.download_utils import NodeModelDownloadable
+from ray.llm._internal.common.utils.download_utils import (
+    STREAMING_LOAD_FORMATS,
+    NodeModelDownloadable,
+)
 from ray.llm._internal.common.utils.import_utils import load_class, try_import
 from ray.llm._internal.serve.constants import (
     DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S,
@@ -297,6 +300,10 @@ def get_or_create_callback(self) -> Optional[CallbackBase]:
         assert engine_config is not None
         pg = engine_config.get_or_create_pg()
         runtime_env = engine_config.get_runtime_env_with_local_env_vars()
+        if self.engine_kwargs.get("load_format", None) in STREAMING_LOAD_FORMATS:
+            worker_node_download_model = NodeModelDownloadable.NONE
+        else:
+            worker_node_download_model = NodeModelDownloadable.MODEL_AND_TOKENIZER
 
         # Create new instance
         if isinstance(self.callback_config.callback_class, str):
@@ -308,7 +315,7 @@ def get_or_create_callback(self) -> Optional[CallbackBase]:
             raise_error_on_callback=self.callback_config.raise_error_on_callback,
             llm_config=self,
             ctx_kwargs={
-                "worker_node_download_model": NodeModelDownloadable.MODEL_AND_TOKENIZER,
+                "worker_node_download_model": worker_node_download_model,
                 "placement_group": pg,
                 "runtime_env": runtime_env,
             },
diff --git a/python/ray/llm/_internal/serve/utils/node_initialization_utils.py b/python/ray/llm/_internal/serve/utils/node_initialization_utils.py
@@ -1,13 +1,12 @@
 import asyncio
-import os
 from typing import Optional
 
 import ray
 from ray.llm._internal.common.utils.download_utils import (
     download_model_files,
 )
 from ray.llm._internal.common.utils.import_utils import try_import
-from ray.llm._internal.serve.core.configs.llm_config import LLMConfig, LLMEngine
+from ray.llm._internal.serve.core.configs.llm_config import LLMConfig
 from ray.llm._internal.serve.observability.logging import get_logger
 
 torch = try_import("torch")
@@ -33,24 +32,6 @@ def initialize_remote_node(llm_config: LLMConfig) -> Optional[str]:
     if local_path and local_path != engine_config.actual_hf_model_id:
         engine_config.hf_model_id = local_path
 
-    # Download the tokenizer if it isn't a local file path
-    if not isinstance(local_path, str) or not os.path.exists(local_path):
-        logger.info(f"Downloading the tokenizer for {engine_config.actual_hf_model_id}")
-
-    if llm_config.llm_engine == LLMEngine.vLLM:
-        from vllm.transformers_utils.tokenizer import get_tokenizer
-
-        _ = get_tokenizer(
-            engine_config.actual_hf_model_id,
-            tokenizer_mode=engine_config.engine_kwargs.get("tokenizer_mode", None),
-            trust_remote_code=engine_config.trust_remote_code,
-        )
-    else:
-        _ = transformers.AutoTokenizer.from_pretrained(
-            engine_config.actual_hf_model_id,
-            trust_remote_code=engine_config.trust_remote_code,
-        )
-
     return local_path
 
 
diff --git a/python/ray/llm/tests/common/utils/test_cloud_utils.py b/python/ray/llm/tests/common/utils/test_cloud_utils.py
diff --git a/python/ray/llm/tests/serve/cpu/configs/test_models.py b/python/ray/llm/tests/serve/cpu/configs/test_models.py