From c79a4de9b3aeee50fc9e95d71e9659f9df031fa4 Mon Sep 17 00:00:00 2001 From: sunwenhan Date: Fri, 13 Mar 2026 15:27:57 +0800 Subject: [PATCH 1/6] fix: fix heterogeneous rdma transport bug and add test case --- build_gpu.sh | 17 +++++++++++++++++ build_npu.sh | 16 ++++++++++++++++ ...gine_heterogeneous_ascend_perf_initiator.cpp | 5 +++-- .../transport/rdma_transport/rdma_transport.h | 3 ++- .../heterogeneous_rdma_transport.cpp | 12 +++++++++--- run_gpu.sh | 10 ++++++++++ run_npu.sh | 11 +++++++++++ 7 files changed, 68 insertions(+), 6 deletions(-) create mode 100644 build_gpu.sh create mode 100644 build_npu.sh create mode 100644 run_gpu.sh create mode 100644 run_npu.sh diff --git a/build_gpu.sh b/build_gpu.sh new file mode 100644 index 0000000000..4980a14349 --- /dev/null +++ b/build_gpu.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +VLLM_PYTHON="${VLLM_PYTHON:-$(python3 -c 'import sys; print(sys.executable)')}" +echo "[INFO] 使用 Python 解释器: ${VLLM_PYTHON}" +"${VLLM_PYTHON}" -c 'import sys; print(f"[INFO] Python 版本: {sys.version.split()[0]}")' + +rm -rf build +mkdir build +cd build +cmake .. \ + -DUSE_CUDA=ON \ + -DUSE_MNNVL=ON \ + -DPython3_EXECUTABLE="${VLLM_PYTHON}" \ + -DPython_EXECUTABLE="${VLLM_PYTHON}" \ + -DPYTHON_EXECUTABLE="${VLLM_PYTHON}" +make -j"$(nproc)" \ No newline at end of file diff --git a/build_npu.sh b/build_npu.sh new file mode 100644 index 0000000000..79b8389e1c --- /dev/null +++ b/build_npu.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -euo pipefail + +VLLM_PYTHON="${VLLM_PYTHON:-$(python3 -c 'import sys; print(sys.executable)')}" +echo "[INFO] 使用 Python 解释器: ${VLLM_PYTHON}" +"${VLLM_PYTHON}" -c 'import sys; print(f"[INFO] Python 版本: {sys.version.split()[0]}")' + +rm -rf build +mkdir build +cd build +cmake .. \ + -DUSE_ASCEND_HETEROGENEOUS=ON \ + -DPython3_EXECUTABLE="${VLLM_PYTHON}" \ + -DPython_EXECUTABLE="${VLLM_PYTHON}" \ + -DPYTHON_EXECUTABLE="${VLLM_PYTHON}" +make -j"$(nproc)" \ No newline at end of file diff --git a/mooncake-transfer-engine/example/transfer_engine_heterogeneous_ascend_perf_initiator.cpp b/mooncake-transfer-engine/example/transfer_engine_heterogeneous_ascend_perf_initiator.cpp index efbd30ea72..60235c1107 100644 --- a/mooncake-transfer-engine/example/transfer_engine_heterogeneous_ascend_perf_initiator.cpp +++ b/mooncake-transfer-engine/example/transfer_engine_heterogeneous_ascend_perf_initiator.cpp @@ -159,8 +159,9 @@ std::string loadNicPriorityMatrix() { "], []], " " \"cpu:1\": [[" + device_names + - "], []], " - " \"cuda:0\": [[" + + "], []], " " \"npu:0\": [[" + + device_names + + "], []], " " \"cuda:0\": [[" + device_names + "], []], " " \"musa:0\": [[" + diff --git a/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h b/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h index e39c31a530..bcd6b4774b 100644 --- a/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h +++ b/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h @@ -37,12 +37,13 @@ class RdmaContext; class RdmaEndPoint; class TransferMetadata; class WorkerPool; +class HeterogeneousRdmaTransport; class RdmaTransport : public Transport { friend class RdmaContext; friend class RdmaEndPoint; friend class WorkerPool; - + friend class HeterogeneousRdmaTransport; public: using BufferDesc = TransferMetadata::BufferDesc; using SegmentDesc = TransferMetadata::SegmentDesc; diff --git a/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp b/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp index 2ba7455249..9abc396421 100644 --- a/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp +++ b/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp @@ -7,10 +7,16 @@ namespace { bool isCpuMemory(void *addr) { aclrtPtrAttributes attributes{}; if (int ret = aclrtPointerGetAttributes(addr, &attributes)) { - LOG(ERROR) << "aclrtPointrtGetAttributes error, ret: " << ret; - return false; + // If ACL cannot identify the pointer, it is not ACL-managed device + // memory, so treat it as regular CPU memory. + LOG(WARNING) << "aclrtPointerGetAttributes failed for addr " << addr + << ", ret: " << ret + << ". Treating as CPU memory."; + return true; } - return (attributes.location.type == 0); + // location.type: 0 = ACL host memory, 1 = device memory, 2 = regular + // CPU memory (malloc). Only type 1 is device memory; all others are CPU. + return (attributes.location.type != 1); } } // namespace diff --git a/run_gpu.sh b/run_gpu.sh new file mode 100644 index 0000000000..1ccb73c2ff --- /dev/null +++ b/run_gpu.sh @@ -0,0 +1,10 @@ +cd build/mooncake-transfer-engine/tests +./rdma_transport_test \ + --mode=target \ + --local_server_name=7.6.16.150 \ + --metadata_server=P2PHANDSHAKE \ + --operation=write \ + --protocol=rdma \ + --device_name=ibp22s0 \ + --use_vram=true \ + --gpu_id=2 \ No newline at end of file diff --git a/run_npu.sh b/run_npu.sh new file mode 100644 index 0000000000..ff37d163df --- /dev/null +++ b/run_npu.sh @@ -0,0 +1,11 @@ +cd build/mooncake-transfer-engine/example +./transfer_engine_heterogeneous_ascend_perf_initiator \ + --mode=initiator \ + --local_server_name=7.6.16.155 \ + --metadata_server=P2PHANDSHAKE \ + --operation=write \ + --npu_id=0 \ + --segment_id=7.6.16.150:16901 \ + --device_name=ibp56s0 \ + --block_size=1024 \ + --batch_size=5242880 \ No newline at end of file From 7041ac939d79de7d7f8d4202f02072ccf261e502 Mon Sep 17 00:00:00 2001 From: sunwenhan Date: Sat, 14 Mar 2026 16:04:01 +0800 Subject: [PATCH 2/6] delete test code --- build_gpu.sh | 17 ----------------- build_npu.sh | 16 ---------------- run_gpu.sh | 10 ---------- run_npu.sh | 11 ----------- 4 files changed, 54 deletions(-) delete mode 100644 build_gpu.sh delete mode 100644 build_npu.sh delete mode 100644 run_gpu.sh delete mode 100644 run_npu.sh diff --git a/build_gpu.sh b/build_gpu.sh deleted file mode 100644 index 4980a14349..0000000000 --- a/build_gpu.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -VLLM_PYTHON="${VLLM_PYTHON:-$(python3 -c 'import sys; print(sys.executable)')}" -echo "[INFO] 使用 Python 解释器: ${VLLM_PYTHON}" -"${VLLM_PYTHON}" -c 'import sys; print(f"[INFO] Python 版本: {sys.version.split()[0]}")' - -rm -rf build -mkdir build -cd build -cmake .. \ - -DUSE_CUDA=ON \ - -DUSE_MNNVL=ON \ - -DPython3_EXECUTABLE="${VLLM_PYTHON}" \ - -DPython_EXECUTABLE="${VLLM_PYTHON}" \ - -DPYTHON_EXECUTABLE="${VLLM_PYTHON}" -make -j"$(nproc)" \ No newline at end of file diff --git a/build_npu.sh b/build_npu.sh deleted file mode 100644 index 79b8389e1c..0000000000 --- a/build_npu.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -VLLM_PYTHON="${VLLM_PYTHON:-$(python3 -c 'import sys; print(sys.executable)')}" -echo "[INFO] 使用 Python 解释器: ${VLLM_PYTHON}" -"${VLLM_PYTHON}" -c 'import sys; print(f"[INFO] Python 版本: {sys.version.split()[0]}")' - -rm -rf build -mkdir build -cd build -cmake .. \ - -DUSE_ASCEND_HETEROGENEOUS=ON \ - -DPython3_EXECUTABLE="${VLLM_PYTHON}" \ - -DPython_EXECUTABLE="${VLLM_PYTHON}" \ - -DPYTHON_EXECUTABLE="${VLLM_PYTHON}" -make -j"$(nproc)" \ No newline at end of file diff --git a/run_gpu.sh b/run_gpu.sh deleted file mode 100644 index 1ccb73c2ff..0000000000 --- a/run_gpu.sh +++ /dev/null @@ -1,10 +0,0 @@ -cd build/mooncake-transfer-engine/tests -./rdma_transport_test \ - --mode=target \ - --local_server_name=7.6.16.150 \ - --metadata_server=P2PHANDSHAKE \ - --operation=write \ - --protocol=rdma \ - --device_name=ibp22s0 \ - --use_vram=true \ - --gpu_id=2 \ No newline at end of file diff --git a/run_npu.sh b/run_npu.sh deleted file mode 100644 index ff37d163df..0000000000 --- a/run_npu.sh +++ /dev/null @@ -1,11 +0,0 @@ -cd build/mooncake-transfer-engine/example -./transfer_engine_heterogeneous_ascend_perf_initiator \ - --mode=initiator \ - --local_server_name=7.6.16.155 \ - --metadata_server=P2PHANDSHAKE \ - --operation=write \ - --npu_id=0 \ - --segment_id=7.6.16.150:16901 \ - --device_name=ibp56s0 \ - --block_size=1024 \ - --batch_size=5242880 \ No newline at end of file From 7a68c8133c9ae9e933aa78bf9539933dbf4f9896 Mon Sep 17 00:00:00 2001 From: sunwenhan Date: Mon, 16 Mar 2026 09:56:12 +0800 Subject: [PATCH 3/6] Use constant when judging memory locations --- .../heterogeneous_rdma_transport.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp b/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp index 9abc396421..cbf53b155d 100644 --- a/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp +++ b/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp @@ -4,6 +4,10 @@ namespace mooncake { namespace { +// ACL memory location types for aclrtPtrAttributes::location.type: +// 0 = ACL host memory, 1 = device memory, 2 = regular CPU memory (malloc) +static constexpr int kDeviceMemoryLocationType = 1; + bool isCpuMemory(void *addr) { aclrtPtrAttributes attributes{}; if (int ret = aclrtPointerGetAttributes(addr, &attributes)) { @@ -14,9 +18,7 @@ bool isCpuMemory(void *addr) { << ". Treating as CPU memory."; return true; } - // location.type: 0 = ACL host memory, 1 = device memory, 2 = regular - // CPU memory (malloc). Only type 1 is device memory; all others are CPU. - return (attributes.location.type != 1); + return (attributes.location.type != kDeviceMemoryLocationType); } } // namespace From 7e3f8e5aabb18ec10946ddedab3469f350cd38c3 Mon Sep 17 00:00:00 2001 From: sunwenhan Date: Tue, 17 Mar 2026 09:43:53 +0800 Subject: [PATCH 4/6] fix: format string concatenation --- .../heterogeneous_rdma_transport.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp b/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp index cbf53b155d..d1338b4b37 100644 --- a/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp +++ b/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp @@ -11,8 +11,9 @@ static constexpr int kDeviceMemoryLocationType = 1; bool isCpuMemory(void *addr) { aclrtPtrAttributes attributes{}; if (int ret = aclrtPointerGetAttributes(addr, &attributes)) { - // If ACL cannot identify the pointer, it is not ACL-managed device - // memory, so treat it as regular CPU memory. + // If ACL cannot identify the pointer, + // it is not ACL-managed device memory, + // so treat it as regular CPU memory. LOG(WARNING) << "aclrtPointerGetAttributes failed for addr " << addr << ", ret: " << ret << ". Treating as CPU memory."; From eb740f83a8545c105c49440d52d0f399a7d87366 Mon Sep 17 00:00:00 2001 From: sunwenhan Date: Tue, 17 Mar 2026 10:00:39 +0800 Subject: [PATCH 5/6] fix: format the code issues by the code check --- ...ransfer_engine_heterogeneous_ascend_perf_initiator.cpp | 8 +++++--- .../include/transport/rdma_transport/rdma_transport.h | 1 + .../heterogeneous_rdma_transport.cpp | 7 ++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mooncake-transfer-engine/example/transfer_engine_heterogeneous_ascend_perf_initiator.cpp b/mooncake-transfer-engine/example/transfer_engine_heterogeneous_ascend_perf_initiator.cpp index 60235c1107..d2a9ec52d1 100644 --- a/mooncake-transfer-engine/example/transfer_engine_heterogeneous_ascend_perf_initiator.cpp +++ b/mooncake-transfer-engine/example/transfer_engine_heterogeneous_ascend_perf_initiator.cpp @@ -159,9 +159,11 @@ std::string loadNicPriorityMatrix() { "], []], " " \"cpu:1\": [[" + device_names + - "], []], " " \"npu:0\": [[" - + device_names + - "], []], " " \"cuda:0\": [[" + + "], []], " + " \"npu:0\": [[" + + device_names + + "], []], " + " \"cuda:0\": [[" + device_names + "], []], " " \"musa:0\": [[" + diff --git a/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h b/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h index bcd6b4774b..4352e17514 100644 --- a/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h +++ b/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h @@ -44,6 +44,7 @@ class RdmaTransport : public Transport { friend class RdmaEndPoint; friend class WorkerPool; friend class HeterogeneousRdmaTransport; + public: using BufferDesc = TransferMetadata::BufferDesc; using SegmentDesc = TransferMetadata::SegmentDesc; diff --git a/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp b/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp index d1338b4b37..943332a919 100644 --- a/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp +++ b/mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp @@ -11,12 +11,9 @@ static constexpr int kDeviceMemoryLocationType = 1; bool isCpuMemory(void *addr) { aclrtPtrAttributes attributes{}; if (int ret = aclrtPointerGetAttributes(addr, &attributes)) { - // If ACL cannot identify the pointer, - // it is not ACL-managed device memory, - // so treat it as regular CPU memory. + // If ACL cannot identify the pointer, treat it as regular CPU memory. LOG(WARNING) << "aclrtPointerGetAttributes failed for addr " << addr - << ", ret: " << ret - << ". Treating as CPU memory."; + << ", ret: " << ret << ". Treating as CPU memory."; return true; } return (attributes.location.type != kDeviceMemoryLocationType); From 1c2c9451027c9c71f1fef338cc8da0bb8cf7dc28 Mon Sep 17 00:00:00 2001 From: sunwenhan Date: Tue, 17 Mar 2026 11:09:02 +0800 Subject: [PATCH 6/6] fix: update main branch and delete friend for HeterogeneousRdmaTransport --- .../include/transport/rdma_transport/rdma_transport.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h b/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h index c76f8c47a6..d292a27e93 100644 --- a/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h +++ b/mooncake-transfer-engine/include/transport/rdma_transport/rdma_transport.h @@ -37,13 +37,11 @@ class RdmaContext; class RdmaEndPoint; class TransferMetadata; class WorkerPool; -class HeterogeneousRdmaTransport; class RdmaTransport : public Transport { friend class RdmaContext; friend class RdmaEndPoint; friend class WorkerPool; - friend class HeterogeneousRdmaTransport; public: using BufferDesc = TransferMetadata::BufferDesc;