Skip to content

Commit ccd66dd

Browse files
committed
feat: prefetch offloader weights using batched memcpy async
1 parent a3ec4a3 commit ccd66dd

1 file changed

Lines changed: 84 additions & 17 deletions

File tree

vllm/model_executor/offloader/prefetch.py

Lines changed: 84 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import torch
1818
import torch.nn as nn
19+
import vllm._custom_ops as ops
1920

2021
# Import prefetch_ops to register custom ops at module load time
2122
import vllm.model_executor.offloader.prefetch_ops # noqa: F401
@@ -390,6 +391,10 @@ def __init__(
390391
# Used for per-layer synchronization (both eager and capture modes).
391392
self._copy_done_event = torch.cuda.Event()
392393

394+
# Fork: record event on compute stream, copy_stream waits on it.
395+
# This joins copy_stream to any active CUDA graph capture.
396+
self._fork_event = torch.cuda.Event()
397+
393398
# Track whether _copy_done_event is valid for eager-mode wait_event.
394399
# False when: (1) never recorded, or (2) last recorded during a
395400
# cudagraph capture (events become invalid after capture ends).
@@ -409,6 +414,12 @@ def __init__(
409414
self._buffer_pool: StaticBufferPool | None = None
410415
self._buffer_slot_idx: int = 0
411416

417+
# Buffer pointers
418+
# Grouped pointers enable batch copy from cuMemcpyBatchAsync.
419+
self._buffer_src_ptrs: torch.Tensor | None = None
420+
self._buffer_dst_ptrs: torch.Tensor | None = None
421+
self._buffer_sizes: torch.Tensor | None = None
422+
412423
param_dict = dict(self.module.named_parameters())
413424
assert all(name in param_dict for name in whitelist_param_names), (
414425
f"Whitelist params {whitelist_param_names} not found in module params "
@@ -485,6 +496,12 @@ def assign_buffer_slot(self, pool: StaticBufferPool, slot_idx: int):
485496
"""
486497
self._buffer_pool = pool
487498
self._buffer_slot_idx = slot_idx
499+
500+
pin_memory = should_pin_memory()
501+
502+
src_ptrs: list[int] = []
503+
dst_ptrs: list[int] = []
504+
sizes: list[int] = []
488505

489506
# Assign static buffers to parameters
490507
# Use CPU storage shape/stride/dtype since param.data is now empty
@@ -500,6 +517,37 @@ def assign_buffer_slot(self, pool: StaticBufferPool, slot_idx: int):
500517
)
501518
offloader.assign_static_buffer(buffer)
502519

520+
# IMPORTANT: record this param's src/dst pointers for the batched copy.
521+
cpu_storage = offloader._cpu_storage
522+
assert cpu_storage is not None, "CPU storage not initialized"
523+
assert not pin_memory or cpu_storage.is_pinned(), (
524+
f"CPU storage for {name} is not pinned, but pin_memory is "
525+
"enabled. The batched H2D prefetch path requires pinned "
526+
"source memory; otherwise cuMemcpyBatchAsync degrades to a "
527+
"synchronous copy and breaks event-based fork "
528+
"synchronization with the compute stream."
529+
)
530+
531+
src_ptrs.append(cpu_storage.data_ptr())
532+
dst_ptrs.append(buffer.data_ptr())
533+
sizes.append(cpu_storage.numel() * cpu_storage.element_size())
534+
535+
# Group the collected buffer pointers into tensors.
536+
if not src_ptrs:
537+
self._buffer_src_ptrs = None
538+
self._buffer_dst_ptrs = None
539+
self._buffer_sizes = None
540+
else:
541+
self._buffer_src_ptrs = torch.tensor(
542+
src_ptrs, dtype=torch.int64, pin_memory=pin_memory
543+
)
544+
self._buffer_dst_ptrs = torch.tensor(
545+
dst_ptrs, dtype=torch.int64, pin_memory=pin_memory
546+
)
547+
self._buffer_sizes = torch.tensor(
548+
sizes, dtype=torch.int64, pin_memory=pin_memory
549+
)
550+
503551
def start_onload_to_static(self):
504552
"""Start async copy from CPU storage to GPU buffer.
505553
@@ -514,33 +562,52 @@ def start_onload_to_static(self):
514562
assert self._buffer_pool is not None, "Buffer pool not assigned"
515563

516564
# Track if this prefetch is being captured (for _wait_for_layer logic)
517-
self._prefetch_in_capture = torch.cuda.is_current_stream_capturing()
565+
in_capture = torch.cuda.is_current_stream_capturing()
566+
self._prefetch_in_capture = in_capture
518567

519568
# Fork: record event on compute stream, copy_stream waits on it
520-
# This joins copy_stream to any active CUDA graph capture
521-
fork_event = torch.cuda.Event()
522-
torch.cuda.current_stream().record_event(fork_event)
523-
self.copy_stream.wait_event(fork_event)
569+
# This joins copy_stream to any active CUDA graph capture.
570+
torch.cuda.current_stream().record_event(self._fork_event)
571+
self.copy_stream.wait_event(self._fork_event)
524572

525573
with torch.cuda.stream(self.copy_stream):
526-
for name, offloader in self._param_offloaders.items():
527-
cpu_storage = offloader._cpu_storage
528-
gpu_buffer = offloader._gpu_buffer
529-
assert cpu_storage is not None, "CPU storage not initialized"
530-
assert gpu_buffer is not None, "GPU buffer not assigned"
531-
assert not should_pin_memory() or cpu_storage.is_pinned(), (
532-
f"CPU storage for {name} is not pinned! "
533-
"non_blocking=True H2D copy from non-pinned memory "
534-
"causes stream synchronization that breaks "
535-
"event-based fork synchronization."
574+
if in_capture:
575+
# cuMemcpyBatchAsync is not capture-safe.
576+
# Slow path: fall back to per-param copy_() so the copies are recorded into the graph.
577+
for name, offloader in self._param_offloaders.items():
578+
cpu_storage = offloader._cpu_storage
579+
gpu_buffer = offloader._gpu_buffer
580+
assert cpu_storage is not None, "CPU storage not initialized"
581+
assert gpu_buffer is not None, "GPU buffer not assigned"
582+
assert not should_pin_memory() or cpu_storage.is_pinned(), (
583+
f"CPU storage for {name} is not pinned! "
584+
"non_blocking=True H2D copy from non-pinned memory "
585+
"causes stream synchronization that breaks "
586+
"event-based fork synchronization."
587+
)
588+
gpu_buffer.copy_(cpu_storage, non_blocking=True)
589+
elif (
590+
self._buffer_src_ptrs is not None
591+
and self._buffer_dst_ptrs is not None
592+
and self._buffer_sizes is not None
593+
):
594+
# Fast path: batched copy using custom op (single cuMemcpyBatchAsync call on CUDA 12.8+)
595+
# cuMemcpyBatchAsync can have less driver-call overhead and better performance.
596+
# swap_blocks_batch() will fallback to per-param copy_() if cuMemcpyBatchAsync is not available.
597+
ops.swap_blocks_batch(
598+
src_ptrs=self._buffer_src_ptrs,
599+
dst_ptrs=self._buffer_dst_ptrs,
600+
sizes=self._buffer_sizes
536601
)
537-
gpu_buffer.copy_(cpu_storage, non_blocking=True)
602+
else:
603+
# No params to copy (shouldn't normally happen).
604+
pass
538605

539606
# Record completion event for _wait_for_layer to use
540607
self._copy_done_event.record(self.copy_stream)
541608
# Event is only valid for eager wait_event if recorded outside capture.
542609
# Events recorded during capture become invalid after capture ends.
543-
self._event_valid_for_eager = not torch.cuda.is_current_stream_capturing()
610+
self._event_valid_for_eager = not in_capture
544611

545612

546613
class _BaseParamOffloader(ABC):

0 commit comments

Comments
 (0)