Use CU_MEMCPY_SRC_ACCESS_ORDER_ANY for batch KV cache swaps #39306
base: main
@@ -2782,6 +2782,7 @@ def swap_blocks_batch(
     src_ptrs: torch.Tensor,
     dst_ptrs: torch.Tensor,
     sizes: torch.Tensor,
+    src_access_order_any: bool = False,
 ) -> None:
     """
     Batch version of swap_blocks: submit all copies in a single driver call.
@@ -2790,8 +2791,17 @@ def swap_blocks_batch(
     of sizes[i] bytes. All three tensors must be int64 CPU tensors.
     On CUDA 12.8+ this uses cuMemcpyBatchAsync for minimal submission
     overhead; on older CUDA it falls back to a loop of cudaMemcpyAsync.
+
+    src_access_order_any: if True, pass CU_MEMCPY_SRC_ACCESS_ORDER_ANY to
+        cuMemcpyBatchAsync, letting the DMA engine prefetch source bytes
+        out of stream order. Only safe when no GPU stream is concurrently
+        writing to the source (e.g. CPU->GPU, where the source is host
+        pinned memory). Defaults to False (STREAM ordering), which is
+        always safe.
     """
-    torch.ops._C_cache_ops.swap_blocks_batch(src_ptrs, dst_ptrs, sizes)
+    torch.ops._C_cache_ops.swap_blocks_batch(
+        src_ptrs, dst_ptrs, sizes, src_access_order_any
+    )


 def convert_fp8(
Review comment: The rest is specific to the offloading connector implementation.
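For orientation, here is a minimal sketch of how the swap-in (CPU->GPU) path could call the extended wrapper. Only the `swap_blocks_batch(src_ptrs, dst_ptrs, sizes, src_access_order_any)` signature comes from the diff above; the `vllm._custom_ops` import path, the `swap_in_blocks` helper, and the way the pointer/size tensors are built are illustrative assumptions, not code from this PR.

```python
# Hypothetical caller sketch (not part of this PR).
import torch

from vllm import _custom_ops as ops  # assumed import path for the wrapper above


def swap_in_blocks(host_blocks: list[torch.Tensor],
                   device_blocks: list[torch.Tensor]) -> None:
    """Copy pinned-host KV blocks to their GPU destinations in one driver call."""
    # Per the docstring above, all three tensors must be int64 CPU tensors.
    src_ptrs = torch.tensor([b.data_ptr() for b in host_blocks], dtype=torch.int64)
    dst_ptrs = torch.tensor([b.data_ptr() for b in device_blocks], dtype=torch.int64)
    sizes = torch.tensor(
        [b.numel() * b.element_size() for b in host_blocks], dtype=torch.int64
    )
    # The sources are pinned host buffers that no GPU stream writes to, so ANY
    # source-access ordering is safe and lets the DMA engine prefetch them.
    ops.swap_blocks_batch(src_ptrs, dst_ptrs, sizes, src_access_order_any=True)
```

For the GPU->CPU swap-out direction the source blocks may still be written by kernels in flight on another stream, so callers there should keep the default `src_access_order_any=False` (stream-ordered) behavior.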