Skip to content

Commit fa94caa

Browse files
XingSong-Sun and sunwenhan
authored and committed
[TE] fix: fix heterogeneous rdma transport error (kvcache-ai#1657)
--------- Co-authored-by: sunwenhan <sunwenhan@xfusion.com>
1 parent bafdce1 commit fa94caa

2 files changed

Lines changed: 12 additions & 3 deletions

File tree

mooncake-transfer-engine/example/transfer_engine_heterogeneous_ascend_perf_initiator.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -160,6 +160,9 @@ std::string loadNicPriorityMatrix() {
160160
" \"cpu:1\": [[" +
161161
device_names +
162162
"], []], "
163+
" \"npu:0\": [[" +
164+
device_names +
165+
"], []], "
163166
" \"cuda:0\": [[" +
164167
device_names +
165168
"], []], "

mooncake-transfer-engine/src/transport/ascend_transport/heterogeneous_rdma_transport/heterogeneous_rdma_transport.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,19 @@
44
namespace mooncake {
55

66
namespace {
7+
// ACL memory location types for aclrtPtrAttributes::location.type:
8+
// 0 = ACL host memory, 1 = device memory, 2 = regular CPU memory (malloc)
9+
static constexpr int kDeviceMemoryLocationType = 1;
10+
711
bool isCpuMemory(void *addr) {
812
aclrtPtrAttributes attributes{};
913
if (int ret = aclrtPointerGetAttributes(addr, &attributes)) {
10-
LOG(ERROR) << "aclrtPointrtGetAttributes error, ret: " << ret;
11-
return false;
14+
// If ACL cannot identify the pointer, treat it as regular CPU memory.
15+
LOG(WARNING) << "aclrtPointerGetAttributes failed for addr " << addr
16+
<< ", ret: " << ret << ". Treating as CPU memory.";
17+
return true;
1218
}
13-
return (attributes.location.type == 0);
19+
return (attributes.location.type != kDeviceMemoryLocationType);
1420
}
1521
} // namespace
1622

0 commit comments

Comments
 (0)