
Commit 775df35

Add swap_blocks_batch op with batched async memcpy
Signed-off-by: chaojun-zhang <chaojun.zhang@intel.com>
1 parent 6792890 commit 775df35

7 files changed

Lines changed: 312 additions & 3 deletions


csrc/cache.cpp

Lines changed: 34 additions & 0 deletions
@@ -1175,6 +1175,40 @@ void swap_blocks(
  return;
}

+/**
+ * @brief Batch version of swap_blocks: copies N independent (src, dst, size)
+ * triples in a single call, amortising per-copy overhead.
+ *
+ * Thin wrapper that validates tensor inputs and delegates to
+ * vllm::xpu::xpuAsyncMemcpyBatch for the actual copy logic.
+ *
+ * @param src_ptrs CPU uint64 tensor of N raw source addresses
+ * @param dst_ptrs CPU uint64 tensor of N raw destination addresses
+ * @param sizes    CPU uint64 tensor of N byte counts
+ */
+void swap_blocks_batch(
+    const torch::Tensor& src_ptrs,
+    const torch::Tensor& dst_ptrs,
+    const torch::Tensor& sizes) {
+  TORCH_CHECK(src_ptrs.device().is_cpu(), "src_ptrs must be on CPU");
+  TORCH_CHECK(dst_ptrs.device().is_cpu(), "dst_ptrs must be on CPU");
+  TORCH_CHECK(sizes.device().is_cpu(), "sizes must be on CPU");
+  TORCH_CHECK(src_ptrs.dtype() == torch::kUInt64, "src_ptrs must be uint64");
+  TORCH_CHECK(dst_ptrs.dtype() == torch::kUInt64, "dst_ptrs must be uint64");
+  TORCH_CHECK(sizes.dtype() == torch::kUInt64, "sizes must be uint64");
+
+  const int64_t n = src_ptrs.size(0);
+  TORCH_CHECK(dst_ptrs.size(0) == n, "dst_ptrs length must match src_ptrs");
+  TORCH_CHECK(sizes.size(0) == n, "sizes length must match src_ptrs");
+
+  if (n == 0) return;
+
+  vllm::xpu::xpuAsyncMemcpyBatch(
+      src_ptrs.data_ptr<uint64_t>(),
+      dst_ptrs.data_ptr<uint64_t>(),
+      sizes.data_ptr<uint64_t>(),
+      n);
+}
+
namespace vllm {

// Kernel for FP8 conversion (matches CUDA convert_fp8_kernel pattern).
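
For intuition, each batch entry behaves like one independent raw copy of sizes[i] bytes from address src_ptrs[i] to address dst_ptrs[i]; the wrapper only validates the three uint64 CPU tensors and hands the raw arrays to the XPU copy routine. A host-only Python analogue of those semantics, purely illustrative and not part of this commit (it is only valid when both addresses point at host memory):

import ctypes

import torch


def swap_blocks_batch_reference(src_ptrs: torch.Tensor,
                                dst_ptrs: torch.Tensor,
                                sizes: torch.Tensor) -> None:
    # Each entry is an independent (src, dst, size) copy; ctypes.memmove
    # stands in here for the asynchronous queue.memcpy used on the XPU path.
    for s, d, n in zip(src_ptrs.tolist(), dst_ptrs.tolist(), sizes.tolist()):
        ctypes.memmove(d, s, n)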

csrc/ops.h

Lines changed: 5 additions & 0 deletions
@@ -158,6 +158,11 @@ void swap_blocks(
    int64_t block_size_in_bytes,
    const torch::Tensor& block_mapping);

+void swap_blocks_batch(
+    const torch::Tensor& src_ptrs,
+    const torch::Tensor& dst_ptrs,
+    const torch::Tensor& sizes);
+
void top_k_per_row_decode(
    const torch::Tensor& logits,
    int64_t next_n,

csrc/torch_bindings.cpp

Lines changed: 5 additions & 0 deletions
@@ -206,6 +206,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "swap_blocks(Tensor src, Tensor! dst,"
      " int block_size_in_bytes, Tensor block_mapping) -> ()");
  cache_ops.impl("swap_blocks", torch::kXPU, &swap_blocks);
+  // Batch swap: copies N (src_ptr, dst_ptr, size) triples in one call.
+  cache_ops.def(
+      "swap_blocks_batch(Tensor src_ptrs, Tensor dst_ptrs, Tensor sizes) -> "
+      "()");
+  cache_ops.impl("swap_blocks_batch", torch::kCPU, &swap_blocks_batch);
  cache_ops.def(
      "indexer_k_quant_and_cache(Tensor k, Tensor! kv_cache,"
      "Tensor slot_mapping, int quant_block_size, str scale_fmt) -> ()");

csrc/utils/mem_cpy.cpp

Lines changed: 81 additions & 3 deletions
@@ -47,9 +47,7 @@ inline void async_h2d_with_staging(

  memcpy_async(queue, dst_device, staging_ptr, n_bytes);

-  // The staging buffer is managed by the allocator,
-  // so record the event on it to ensure the staging buffer remains alive
-  // until the DMA transfer completes.
+  // Keep staging buffer alive until the DMA transfer completes.
  record_host_alloc_event_if_possible(staging_ptr, staging.get_context());
}

@@ -151,6 +149,86 @@ void xpuAsyncMemcpy(
  }
}

+void xpuAsyncMemcpyBatch(
+    const uint64_t* src_ptrs,
+    const uint64_t* dst_ptrs,
+    const uint64_t* sizes,
+    int64_t n) {
+  if (n == 0) return;
+
+  auto& queue = vllm::xpu::vllmGetQueue();
+  auto sycl_ctx = queue.get_context();
+
+  // Determine copy direction from the first non-zero entry's pointer types.
+  // All entries in a batch are expected to share the same direction.
+  // Staging is only needed for H2D: the source is host memory that the caller
+  // may mutate after this function returns, while the DMA is still in flight.
+  bool needs_staging = false;
+  for (int64_t i = 0; i < n; i++) {
+    if (sizes[i] > 0) {
+      const void* first_src = reinterpret_cast<const void*>(src_ptrs[i]);
+      const void* first_dst = reinterpret_cast<const void*>(dst_ptrs[i]);
+      auto src_type = sycl::get_pointer_type(first_src, sycl_ctx);
+      auto dst_type = sycl::get_pointer_type(first_dst, sycl_ctx);
+      bool src_is_host =
+          (src_type == sycl::usm::alloc::host ||
+           src_type == sycl::usm::alloc::unknown);
+      bool dst_is_device = (dst_type == sycl::usm::alloc::device);
+      needs_staging = src_is_host && dst_is_device;
+      break;
+    }
+  }
+
+  // Compute total bytes for staging allocation.
+  uint64_t total_bytes = 0;
+  for (int64_t i = 0; i < n; i++) {
+    total_bytes += sizes[i];
+  }
+  if (total_bytes == 0) return;
+
+  if (needs_staging) {
+    // H2D: allocate one contiguous pinned staging buffer, snapshot all source
+    // blocks, then submit all async DMAs. This avoids N separate allocator
+    // round-trips and protects against caller mutation after return.
+    auto staging = at::getHostAllocator(at::kXPU)->allocate(
+        static_cast<size_t>(total_bytes));
+    char* staging_ptr = static_cast<char*>(staging.get());
+    TORCH_CHECK(staging_ptr, "Failed to allocate pinned staging buffer");
+
+    size_t staging_offset = 0;
+    for (int64_t i = 0; i < n; i++) {
+      size_t sz = static_cast<size_t>(sizes[i]);
+      if (sz == 0) continue;
+
+      const void* src = reinterpret_cast<const void*>(src_ptrs[i]);
+      void* dst = reinterpret_cast<void*>(dst_ptrs[i]);
+
+      std::memcpy(staging_ptr + staging_offset, src, sz);
+      queue.memcpy(dst, staging_ptr + staging_offset, sz);
+      staging_offset += sz;
+    }
+
+    // Keep the staging buffer alive until all submitted DMAs complete.
+    if (staging.get_context() != nullptr) {
+      at::getHostAllocator(at::kXPU)->record_event(
+          staging_ptr,
+          const_cast<void*>(staging.get_context()),
+          at::xpu::getCurrentXPUStream());
+    }
+  } else {
+    // D2H / D2D: direct async DMA, no staging needed.
+    for (int64_t i = 0; i < n; i++) {
+      size_t sz = static_cast<size_t>(sizes[i]);
+      if (sz == 0) continue;
+
+      const void* src = reinterpret_cast<const void*>(src_ptrs[i]);
+      void* dst = reinterpret_cast<void*>(dst_ptrs[i]);
+
+      queue.memcpy(dst, src, sz);
+    }
+  }
+}
+
} // namespace xpu
} // namespace vllm

csrc/utils/mem_cpy.h

Lines changed: 23 additions & 0 deletions
@@ -1,5 +1,6 @@
#pragma once
#include <cstddef>
+#include <cstdint>

namespace vllm {
namespace xpu {
@@ -32,5 +33,27 @@
    const void* hctx,
    bool is_pinned);

+/**
+ * @brief Batch async memcpy: copies N independent (src, dst, size) triples
+ * in a single call, amortising per-copy overhead.
+ *
+ * The copy direction is auto-detected from the first non-zero entry's USM
+ * pointer types. All entries must share the same direction.
+ *
+ * For H2D: snapshots all source blocks through a single contiguous pinned
+ * staging buffer so the caller may safely mutate host memory immediately.
+ * For D2H / D2D: direct async DMA without staging.
+ *
+ * @param src_ptrs Array of N raw source addresses
+ * @param dst_ptrs Array of N raw destination addresses
+ * @param sizes    Array of N byte counts
+ * @param n        Number of entries
+ */
+void xpuAsyncMemcpyBatch(
+    const uint64_t* src_ptrs,
+    const uint64_t* dst_ptrs,
+    const uint64_t* sizes,
+    int64_t n);
+
} // namespace xpu
} // namespace vllm
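
One consequence of the contract documented above: only the H2D path snapshots the source, so for D2H (and D2D) the destination is not guaranteed to be populated until the queue has been synchronized. A hedged Python-level sketch of the D2H pattern (the shapes, the 4-row batch, and the plain pageable destination are illustrative choices, not part of this commit):

import torch

src = torch.randn(16, 256, device="xpu")    # device-side source
dst = torch.empty_like(src, device="cpu")   # host-side destination

row_bytes = src.stride(0) * src.element_size()
src_ptrs = torch.tensor([src.data_ptr() + i * row_bytes for i in range(4)],
                        dtype=torch.uint64)
dst_ptrs = torch.tensor([dst.data_ptr() + i * row_bytes for i in range(4)],
                        dtype=torch.uint64)
sizes = torch.full((4,), row_bytes, dtype=torch.uint64)

torch.ops._C_cache_ops.swap_blocks_batch(src_ptrs, dst_ptrs, sizes)
torch.xpu.synchronize()   # D2H is async: synchronize before reading dst
torch.testing.assert_close(dst[:4], src[:4].cpu())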

tests/register_ops.py

Lines changed: 10 additions & 0 deletions
@@ -487,6 +487,16 @@ def swap_blocks(
                                       block_mapping)


+def swap_blocks_batch(
+    src_ptrs: torch.Tensor,
+    dst_ptrs: torch.Tensor,
+    sizes: torch.Tensor,
+) -> None:
+    """Batch version of swap_blocks: copies N independent (src, dst, size)
+    triples in a single call."""
+    torch.ops._C_cache_ops.swap_blocks_batch(src_ptrs, dst_ptrs, sizes)
+
+
def topk_sigmoid(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
                 token_expert_indices: torch.Tensor,
                 gating_output: torch.Tensor, renormalize: bool,
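
With the binding and the swap_blocks_batch wrapper above in place, swapping many KV-cache blocks becomes a single call. A usage sketch of the H2D path, mirroring the tests that follow (the cache shape, the mapping, and the way ops refers to this module are illustrative assumptions, not part of the commit):

import torch
import register_ops as ops  # this module; adjust the import to your layout

# Hypothetical host- and device-side caches: 64 blocks of 8 x 8 x 64 bf16.
cpu_cache = torch.randn(64, 8, 8, 64, dtype=torch.bfloat16)
xpu_cache = torch.empty_like(cpu_cache, device="xpu")

block_bytes = cpu_cache.stride(0) * cpu_cache.element_size()
mapping = [(0, 3), (1, 7), (5, 2)]  # (src_block, dst_block) pairs

src_ptrs = torch.tensor(
    [cpu_cache.data_ptr() + s * block_bytes for s, _ in mapping],
    dtype=torch.uint64)
dst_ptrs = torch.tensor(
    [xpu_cache.data_ptr() + d * block_bytes for _, d in mapping],
    dtype=torch.uint64)
sizes = torch.full((len(mapping),), block_bytes, dtype=torch.uint64)

ops.swap_blocks_batch(src_ptrs, dst_ptrs, sizes)
# The H2D path snapshots the source into a pinned staging buffer, so the host
# cache may be reused immediately; synchronize before reading the XPU side.
cpu_cache.zero_()
torch.xpu.synchronize()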

tests/test_cache.py

Lines changed: 154 additions & 0 deletions
@@ -2,6 +2,7 @@

import random

+import numpy as np
import pytest
import torch

@@ -101,6 +102,13 @@
        "device": ["xpu:0"],
        "kv_cache_dtype": KV_CACHE_DTYPE,
    },
+    "test_swap_blocks_batch": {
+        "direction": [("cpu", "xpu")],
+        "device": ["xpu:0"],
+    },
+    "test_swap_blocks_batch_h2d_mutation_race": {
+        "device": ["xpu:0"],
+    },
}

@@ -948,3 +956,149 @@ def test_swap_blocks_mla(
            msg=f"Block {src} from src should have been swapped to block "
            f"{dst} in dst_cache.",
        )
+
+
+# ---------------------------------------------------------------------------
+# swap_blocks_batch tests
+# ---------------------------------------------------------------------------
+
+
+def _build_batch_args(
+    src_cache: torch.Tensor,
+    dst_cache: torch.Tensor,
+    block_mapping: list[tuple[int, int]],
+    block_size_in_bytes: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Build (src_ptrs, dst_ptrs, sizes) tensors for swap_blocks_batch."""
+    n = len(block_mapping)
+    src_arr = np.empty(n, dtype=np.uint64)
+    dst_arr = np.empty(n, dtype=np.uint64)
+    sz_arr = np.full(n, block_size_in_bytes, dtype=np.uint64)
+
+    src_base = src_cache.data_ptr()
+    dst_base = dst_cache.data_ptr()
+    stride = src_cache.stride(0) * src_cache.element_size()
+
+    for i, (sb, db) in enumerate(block_mapping):
+        src_arr[i] = src_base + sb * stride
+        dst_arr[i] = dst_base + db * stride
+
+    return (torch.from_numpy(src_arr), torch.from_numpy(dst_arr),
+            torch.from_numpy(sz_arr))
+
+
+@pytest.mark.parametrize("direction", COPYING_DIRECTION)
+@pytest.mark.parametrize("device", DEVICES)
+@torch.inference_mode()
+def test_swap_blocks_batch(
+    direction: tuple[str, str],
+    device: str,
+) -> None:
+    """Test swap_blocks_batch for H2D, D2H and D2D directions."""
+    num_mappings = 64
+    num_heads = 8
+    head_size = 64
+    block_size = 8
+    num_blocks = 256
+    dtype = torch.bfloat16
+    seed = 0
+
+    seed_everything(seed)
+
+    src_device = device if direction[0] == "xpu" else "cpu"
+    dst_device = device if direction[1] == "xpu" else "cpu"
+
+    src_blocks = random.sample(range(num_blocks), num_mappings)
+    if src_device == dst_device:
+        remaining = list(set(range(num_blocks)) - set(src_blocks))
+        dst_blocks = random.sample(remaining, num_mappings)
+    else:
+        dst_blocks = random.sample(range(num_blocks), num_mappings)
+    block_mapping = list(zip(src_blocks, dst_blocks))
+
+    src_key, src_val = create_kv_caches_with_random(num_blocks, block_size, 1,
+                                                    num_heads, head_size,
+                                                    "auto", dtype, seed,
+                                                    src_device)
+    dst_key, dst_val = create_kv_caches_with_random(num_blocks, block_size, 1,
+                                                    num_heads, head_size,
+                                                    "auto", dtype, seed,
+                                                    dst_device)
+
+    src_key_clone = src_key[0].clone()
+    src_val_clone = src_val[0].clone()
+
+    block_size_in_bytes = src_key[0].element_size() * src_key[0].stride(0)
+
+    # Build batch args and call
+    for src_cache, dst_cache in [(src_key[0], dst_key[0]),
+                                 (src_val[0], dst_val[0])]:
+        sp, dp, sz = _build_batch_args(src_cache, dst_cache, block_mapping,
+                                       block_size_in_bytes)
+        ops.swap_blocks_batch(sp, dp, sz)
+
+    torch.xpu.synchronize()
+
+    for sb, db in block_mapping:
+        torch.testing.assert_close(src_key_clone[sb].cpu(),
+                                   dst_key[0][db].cpu())
+        torch.testing.assert_close(src_val_clone[sb].cpu(),
+                                   dst_val[0][db].cpu())
+
+
+@torch.inference_mode()
+def test_swap_blocks_batch_h2d_mutation_race() -> None:
+    """Verify staging buffer protects against caller mutation for H2D batch."""
+    num_mappings = 256
+    num_heads = 8
+    head_size = 128
+    block_size = 32
+    num_blocks = 512
+    dtype = torch.bfloat16
+    seed = 0
+
+    seed_everything(seed)
+
+    src_blocks = random.sample(range(num_blocks), num_mappings)
+    dst_blocks = random.sample(range(num_blocks), num_mappings)
+    block_mapping = list(zip(src_blocks, dst_blocks))
+
+    # Source: pinned CPU memory
+    src_key, src_val = create_kv_caches_with_pinned(num_blocks, block_size, 1,
+                                                    num_heads, head_size,
+                                                    "auto", dtype, seed, "cpu")
+    assert src_key[0].is_pinned()
+
+    # Destination: XPU
+    dst_key, dst_val = create_kv_caches_with_random(num_blocks, block_size, 1,
+                                                    num_heads, head_size,
+                                                    "auto", dtype, seed)
+
+    src_key_clone = src_key[0].clone()
+    src_val_clone = src_val[0].clone()
+
+    block_size_in_bytes = src_key[0].element_size() * src_key[0].stride(0)
+
+    for src_cache, dst_cache in [(src_key[0], dst_key[0]),
+                                 (src_val[0], dst_val[0])]:
+        sp, dp, sz = _build_batch_args(src_cache, dst_cache, block_mapping,
+                                       block_size_in_bytes)
+        ops.swap_blocks_batch(sp, dp, sz)
+
+    # Immediately mutate source — should not affect destination.
+    src_key[0].fill_(0)
+    src_val[0].fill_(0)
+
+    torch.xpu.synchronize()
+
+    for sb, db in block_mapping:
+        torch.testing.assert_close(
+            src_key_clone[sb].cpu(),
+            dst_key[0][db].cpu(),
+            msg=f"Key block {sb}->{db} corrupted by post-call mutation",
+        )
+        torch.testing.assert_close(
+            src_val_clone[sb].cpu(),
+            dst_val[0][db].cpu(),
+            msg=f"Value block {sb}->{db} corrupted by post-call mutation",
+        )
