[Performance]Batch kvcache offloading via aclrtMemcpyBatchAsync (vllm-project#7819)

HF-001 · 01267596 · wangxiyuan · web-flow · commit b2a02684ac12 · 2026-04-21T16:58:03.000+08:00
### What this PR does / why we need it? Refer to vllm-project/vllm#38460 and vllm-project/vllm#38915. CANN 8.5.0+ uses aclrtMemcpyBatchAsync; older CANN versions use aclrtMemcpyAsync for KV cache offloading. The build automatically detects and compiles the appropriate copy function based on the CANN environment, and the choice can also be forced manually through an environment variable. Manual override: 1. batch memcpy (requires CANN ≥ 8.5): export VLLM_ASCEND_ENABLE_BATCH_MEMCPY=1 pip install -e . 2. normal memcpy: export VLLM_ASCEND_ENABLE_BATCH_MEMCPY=0 pip install -e . ### How was this patch tested? Test results: main: TTFT 307 ms, TPOT 49.96 ms; this PR: TTFT 272.82 ms, TPOT 41.04 ms. Model script: export TP=1 export MODEL_PATH=/nas/disk1/Qwen3-14B export MODEL_NAME=Qwen3-14B export PORT=10113 export CUDA_VISIBLE_DEVICES=3 export ASCEND_RT_VISIBLE_DEVICES=3 python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port ${PORT} --dtype bfloat16 --model ${MODEL_PATH} --served-model-name ${MODEL_NAME} --tensor-parallel-size ${TP} --gpu-memory-utilization 0.7 --no-enable-prefix-caching --max-model-len 32768 --trust-remote-code \ --block-size 128 \ --kv-transfer-config '{"kv_connector":"OffloadingConnector","kv_role":"kv_both","kv_connector_extra_config":{"block_size": 128, "num_cpu_blocks": 1000, "spec_name":"NPUOffloadingSpec", "spec_module_path": "vllm_ascend.kv_offload.npu"}}' Test script: export MODEL_NAME=/nas/disk1/Qwen3-14B python /model/xk/vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py --url http://127.0.0.1:10113 --model $MODEL_NAME --served-model-name Qwen3-14B --seed 1234 --input-file /model/xk/vllm/benchmarks/multi_turn/generate_multi_turn.json \ --num-clients 8 --max-active-conversations 24 - vLLM version: v0.18.0 - vLLM main: vllm-project/vllm@35141a7 --------- Signed-off-by: 01267596 <xiongkai123@cmbchina.com> Signed-off-by: HF-001 <1670186653@qq.com> Signed-off-by: kx <1670186653@qq.com> Co-authored-by: 01267596 <xiongkai123@cmbchina.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -111,6 +111,47 @@ set(
 
 pybind11_add_module(vllm_ascend_C ${VLLM_ASCEND_SRC})
 
+# Detect aclrtMemcpyBatchAsync availability (CANN 8.5+)
+# Can be overridden via VLLM_ASCEND_ENABLE_BATCH_MEMCPY env var (registered
+# in vllm_ascend/envs.py, forwarded by setup.py as a CMake variable):
+#   VLLM_ASCEND_ENABLE_BATCH_MEMCPY=1  -> force enable
+#   VLLM_ASCEND_ENABLE_BATCH_MEMCPY=0  -> force disable
+#   unset                               -> auto-detect from CANN headers
+# Any defined value other than "1" takes the force-disable branch.
+include(CheckCXXSourceCompiles)
+include(CMakePushCheckState)
+
+if(DEFINED VLLM_ASCEND_ENABLE_BATCH_MEMCPY)
+  if("${VLLM_ASCEND_ENABLE_BATCH_MEMCPY}" STREQUAL "1")
+    message(STATUS "aclrtMemcpyBatchAsync: force enabled via VLLM_ASCEND_ENABLE_BATCH_MEMCPY=1")
+    target_compile_definitions(vllm_ascend_C PRIVATE CANN_MEMCPY_BATCH_ASYNC)
+  else()
+    # Echo the value that was actually received so an unexpected setting
+    # (e.g. "true") is visible in the configure log instead of being shown as 0.
+    message(STATUS "aclrtMemcpyBatchAsync: force disabled via VLLM_ASCEND_ENABLE_BATCH_MEMCPY=${VLLM_ASCEND_ENABLE_BATCH_MEMCPY}")
+  endif()
+else()
+  # Scope the CMAKE_REQUIRED_* settings to this probe only: they are saved
+  # here and restored by cmake_pop_check_state() so that they do not leak
+  # into any other check_* call made later in the configure step.
+  cmake_push_check_state()
+  set(CMAKE_REQUIRED_INCLUDES ${ASCEND_HOME_PATH}/include)
+  set(CMAKE_REQUIRED_LIBRARIES ascendcl)
+  set(CMAKE_REQUIRED_LINK_OPTIONS "-L${ASCEND_HOME_PATH}/lib64")
+
+  # Test the full code pattern we actually use, including struct member access.
+  # This ensures the macro is only defined when the API is fully compatible.
+  check_cxx_source_compiles("
+    #include <acl/acl_rt.h>
+    int main() {
+      aclrtMemLocation loc = {};
+      loc.type = ACL_MEM_LOCATION_TYPE_HOST;
+      loc.id = 0;
+      aclrtMemcpyBatchAttr attr = {};
+      attr.srcLoc = loc;
+      attr.dstLoc = loc;
+      (void)aclrtMemcpyBatchAsync;
+      return 0;
+    }
+  " HAVE_ACLRT_MEMCPY_BATCH_ASYNC)
+  cmake_pop_check_state()
+
+  if(HAVE_ACLRT_MEMCPY_BATCH_ASYNC)
+    message(STATUS "aclrtMemcpyBatchAsync: detected in CANN headers, enabling batch memcpy path")
+    target_compile_definitions(vllm_ascend_C PRIVATE CANN_MEMCPY_BATCH_ASYNC)
+  else()
+    message(STATUS "aclrtMemcpyBatchAsync: not found in CANN headers, using fallback aclrtMemcpyAsync loop")
+  endif()
+endif()
 # Prefer the CANN ACL headers over torch_npu's bundled third_party ACL copy.
 # torch_npu 2.9.0 ships an older acl_rt.h that does not declare
 # aclrtLaunchHostFunc, which breaks host-print compilation.
diff --git a/csrc/torch_binding.cpp b/csrc/torch_binding.cpp
@@ -168,6 +168,128 @@ void swap_blocks(torch::Tensor &x, torch::Tensor &y, const torch::Tensor &z)
     return;
 }
 
+// Enqueues a batch of independent memory copies on the current NPU stream.
+//
+// src_ptrs, dst_ptrs and sizes must be 1-D, contiguous, int64 CPU tensors of
+// equal length. Element i holds the raw source address, the raw destination
+// address and the size handed to the ACL copy call for copy i.
+// direction selects the copy kind: 0 = host-to-device, 1 = device-to-host,
+// 2 = device-to-device. Any other value raises an error.
+//
+// When built with CANN_MEMCPY_BATCH_ASYNC, H2D and D2H batches are submitted
+// with a single aclrtMemcpyBatchAsync call; D2D batches and builds without the
+// macro fall back to one aclrtMemcpyAsync call per entry. This function only
+// submits the copies; it does not synchronize the stream itself.
+void swap_blocks_batch(const torch::Tensor& src_ptrs,
+                       const torch::Tensor& dst_ptrs,
+                       const torch::Tensor& sizes,
+                       int64_t direction) {
+
+    TORCH_CHECK(src_ptrs.device().is_cpu(), "src_ptrs must be on CPU");
+    TORCH_CHECK(dst_ptrs.device().is_cpu(), "dst_ptrs must be on CPU");
+    TORCH_CHECK(sizes.device().is_cpu(), "sizes must be on CPU");
+    TORCH_CHECK(src_ptrs.dtype() == torch::kInt64, "src_ptrs must be int64");
+    TORCH_CHECK(dst_ptrs.dtype() == torch::kInt64, "dst_ptrs must be int64");
+    TORCH_CHECK(sizes.dtype() == torch::kInt64, "sizes must be int64");
+
+    // The raw data pointers below are indexed as dense arrays (and, on the
+    // batch path, handed to ACL as-is), so a strided or multi-dimensional view
+    // would silently yield wrong addresses/sizes. Reject it up front.
+    TORCH_CHECK(src_ptrs.dim() == 1 && src_ptrs.is_contiguous(),
+                "src_ptrs must be a contiguous 1-D tensor");
+    TORCH_CHECK(dst_ptrs.dim() == 1 && dst_ptrs.is_contiguous(),
+                "dst_ptrs must be a contiguous 1-D tensor");
+    TORCH_CHECK(sizes.dim() == 1 && sizes.is_contiguous(),
+                "sizes must be a contiguous 1-D tensor");
+
+    const int64_t n = src_ptrs.size(0);
+    TORCH_CHECK(dst_ptrs.size(0) == n, "dst_ptrs length must match src_ptrs");
+    TORCH_CHECK(sizes.size(0) == n, "sizes length must match src_ptrs");
+
+    if (n == 0) return;
+
+    const int64_t* src_data = src_ptrs.data_ptr<int64_t>();
+    const int64_t* dst_data = dst_ptrs.data_ptr<int64_t>();
+    const int64_t* size_data = sizes.data_ptr<int64_t>();
+
+    aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
+
+    // Initialized only so the variable is never read indeterminate; the
+    // default branch below always throws for an unknown direction.
+    aclrtMemcpyKind memcpy_kind = ACL_MEMCPY_HOST_TO_DEVICE;
+    switch (direction) {
+        case 0:
+            memcpy_kind = ACL_MEMCPY_HOST_TO_DEVICE;
+            break;
+        case 1:
+            memcpy_kind = ACL_MEMCPY_DEVICE_TO_HOST;
+            break;
+        case 2:
+            memcpy_kind = ACL_MEMCPY_DEVICE_TO_DEVICE;
+            break;
+        default:
+            TORCH_CHECK(false,
+                        "swap_blocks_batch: invalid direction ", direction,
+                        " (expected 0=H2D, 1=D2H, 2=D2D)");
+    }
+
+    // =========================================================================
+    // path 1: aclrtMemcpyBatchAsync (CANN 8.5+)
+    // =========================================================================
+#if defined(CANN_MEMCPY_BATCH_ASYNC)
+    if (memcpy_kind != ACL_MEMCPY_DEVICE_TO_DEVICE) {
+        // The int64 tensors are reinterpreted in place as pointer/size arrays,
+        // which is only valid when the element widths match.
+        static_assert(sizeof(void*) == sizeof(int64_t),
+                      "void* and int64_t must be the same size");
+        static_assert(sizeof(size_t) == sizeof(int64_t),
+                      "size_t and int64_t must be the same size");
+
+        void** dst_arr = reinterpret_cast<void**>(
+            const_cast<int64_t*>(dst_data));
+        void** src_arr = reinterpret_cast<void**>(
+            const_cast<int64_t*>(src_data));
+        size_t* size_arr = reinterpret_cast<size_t*>(
+            const_cast<int64_t*>(size_data));
+        // Destination capacity equals the copy size for every entry.
+        size_t* dest_maxs = size_arr;
+
+        // aclrtMemcpyBatchAttr uses srcLoc/dstLoc (aclrtMemLocation)
+        // to specify memory locations, not aclrtMemcpyKind.
+        int32_t device_id = 0;
+        const aclError dev_ret = aclrtGetDevice(&device_id);
+        // Do not fall back to device 0 silently if the query fails.
+        TORCH_CHECK(dev_ret == ACL_SUCCESS,
+                    "aclrtGetDevice failed with error code ", dev_ret);
+
+        aclrtMemLocation host_loc = {};
+        host_loc.type = ACL_MEM_LOCATION_TYPE_HOST;
+        host_loc.id = 0;
+
+        aclrtMemLocation device_loc = {};
+        device_loc.type = ACL_MEM_LOCATION_TYPE_DEVICE;
+        device_loc.id = device_id;
+
+        aclrtMemcpyBatchAttr attr = {};
+        if (memcpy_kind == ACL_MEMCPY_HOST_TO_DEVICE) {
+            attr.srcLoc = host_loc;
+            attr.dstLoc = device_loc;
+        } else {  // ACL_MEMCPY_DEVICE_TO_HOST
+            attr.srcLoc = device_loc;
+            attr.dstLoc = host_loc;
+        }
+
+        // A single attribute entry, starting at copy index 0, is supplied for
+        // the whole batch.
+        size_t attrs_index = 0;
+        size_t fail_index = 0;
+
+        aclError result = aclrtMemcpyBatchAsync(
+            dst_arr, dest_maxs, src_arr, size_arr,
+            static_cast<size_t>(n),
+            &attr, &attrs_index, 1,
+            &fail_index, stream);
+
+        TORCH_CHECK(result == ACL_SUCCESS,
+                    "aclrtMemcpyBatchAsync failed at index ", fail_index,
+                    " with error code ", result);
+        return;
+    }
+#endif
+
+    // =========================================================================
+    // path 2: aclrtMemcpyAsync
+    // =========================================================================
+    for (int64_t i = 0; i < n; i++) {
+        void* dst = reinterpret_cast<void*>(dst_data[i]);
+        const void* src = reinterpret_cast<const void*>(src_data[i]);
+        size_t copy_size = static_cast<size_t>(size_data[i]);
+
+        // Destination capacity and copy count are both copy_size.
+        aclError ret = aclrtMemcpyAsync(
+            dst,
+            copy_size,
+            src,
+            copy_size,
+            memcpy_kind,
+            stream);
+
+        TORCH_CHECK(ret == ACL_SUCCESS,
+                    "aclrtMemcpyAsync failed at index ", i,
+                    " with error code ", ret,
+                    ", src=", src_data[i],
+                    ", dst=", dst_data[i],
+                    ", size=", size_data[i]);
+    }
+}
+
 AscendType get_dtype_from_torch(at::ScalarType scalarType)
 {
     if (scalarType == at::ScalarType::Float) {
@@ -962,6 +1084,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
     ops.def("swap_blocks(Tensor! x, Tensor! y, Tensor z) -> ()");    
     ops.impl("swap_blocks", torch::kPrivateUse1, &vllm_ascend::swap_blocks);
 
+    // swap_blocks_batch takes CPU tensors (int64 pointer/size arrays), not NPU
+    // tensors, so dispatch must be registered on the CPU backend. The function
+    // internally submits async memcpy on the current NPU stream.
+    ops.def("swap_blocks_batch(Tensor x, Tensor y, Tensor z, int direction) -> ()");
+    ops.impl("swap_blocks_batch", torch::kCPU, &vllm_ascend::swap_blocks_batch);
     ops.def("device_print(str msg) -> ()");
     ops.impl("device_print", c10::DispatchKey::CompositeExplicitAutograd,
              static_cast<void (*)(c10::string_view)>(&vllm_ascend::device_print));
diff --git a/setup.py b/setup.py
@@ -338,6 +338,11 @@ def configure(self, ext: CMakeExtension) -> None:
         # add TORCH_NPU_PATH
         cmake_args += [f"-DTORCH_NPU_PATH={torch_npu_path}"]
 
+        # Pass VLLM_ASCEND_ENABLE_BATCH_MEMCPY to CMake if explicitly set.
+        # When unset (None), CMake will auto-detect from CANN headers.
+        if envs.VLLM_ASCEND_ENABLE_BATCH_MEMCPY is not None:
+            cmake_args += [f"-DVLLM_ASCEND_ENABLE_BATCH_MEMCPY={envs.VLLM_ASCEND_ENABLE_BATCH_MEMCPY}"]
+
         build_tool = []
         # TODO(ganyi): ninja and ccache support for ascend c auto codegen. now we can only use make build
         # if which('ninja') is not None:
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
@@ -111,6 +111,9 @@
     "VLLM_ASCEND_FUSION_OP_TRANSPOSE_KV_CACHE_BY_BLOCK": lambda: bool(
         int(os.getenv("VLLM_ASCEND_FUSION_OP_TRANSPOSE_KV_CACHE_BY_BLOCK", "1"))
     ),
+    # Control the aclrtMemcpyBatchAsync compile path for KV cache offloading.
+    # "1": force enable, "0": force disable, None: auto-detect from CANN headers.
+    "VLLM_ASCEND_ENABLE_BATCH_MEMCPY": lambda: os.getenv("VLLM_ASCEND_ENABLE_BATCH_MEMCPY", None),
 }
 
 # end-env-vars-definition
diff --git a/vllm_ascend/kv_offload/cpu_npu.py b/vllm_ascend/kv_offload/cpu_npu.py

Original file line number	Diff line number	Diff line change
`@@ -111,6 +111,9 @@`
`111`	`111`	`"VLLM_ASCEND_FUSION_OP_TRANSPOSE_KV_CACHE_BY_BLOCK": lambda: bool(`
`112`	`112`	`int(os.getenv("VLLM_ASCEND_FUSION_OP_TRANSPOSE_KV_CACHE_BY_BLOCK", "1"))`
`113`	`113`	`),`
	`114`	`+ # Control the aclrtMemcpyBatchAsync compile path for KV cache offloading.`
	`115`	`+ # "1": force enable, "0": force disable, None: auto-detect from CANN headers.`
	`116`	`+ "VLLM_ASCEND_ENABLE_BATCH_MEMCPY": lambda: os.getenv("VLLM_ASCEND_ENABLE_BATCH_MEMCPY", None),`
`114`	`117`	`}`
`115`	`118`
`116`	`119`	`# end-env-vars-definition`