Skip to content

Commit f4225cb

Browse files
UNIDY2002EC2 Default User
authored andcommitted
[PG] force register local memory for P2P memory regions (kvcache-ai#1690)
Similar to kvcache-ai#1656.
1 parent fa94caa commit f4225cb

3 files changed

Lines changed: 15 additions & 11 deletions

File tree

mooncake-pg/include/p2p_proxy.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class P2PProxy {
6666
int rank = 0;
6767
int size = 0;
6868
int cuda_device_index = -1;
69+
std::string location;
6970
};
7071

7172
struct SendOp {
@@ -231,6 +232,7 @@ class P2PProxy {
231232
int rank_ = 0;
232233
int size_ = 0;
233234
int cuda_device_index_ = -1;
235+
std::string location_;
234236
P2PResources resources_;
235237

236238
std::queue<SendOpContext> send_queue_;

mooncake-pg/src/mooncake_backend.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ MooncakeBackend::MooncakeBackend(
185185
.rank = rank_,
186186
.size = size_,
187187
.cuda_device_index = cuda_device_index,
188+
.location = location,
188189
});
189190
p2p_device_worker_->registerProxy(p2p_proxy_);
190191

mooncake-pg/src/p2p_proxy.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ P2PProxy::P2PProxy(TransferEngine* engine, const Options& options)
4040
is_cpu_(options.is_cpu),
4141
rank_(options.rank),
4242
size_(options.size),
43-
cuda_device_index_(options.cuda_device_index) {
43+
cuda_device_index_(options.cuda_device_index),
44+
location_(options.location) {
4445
if (!is_cpu_ && cuda_device_index_ < 0) {
4546
int current_device = -1;
4647
const cudaError_t get_device_error = cudaGetDevice(&current_device);
@@ -71,15 +72,15 @@ void P2PProxy::AllocateResources() {
7172
resources_.send_buffer_ = std::malloc(kP2PTotalBufferSize);
7273
TORCH_CHECK(resources_.send_buffer_ != nullptr,
7374
"Failed to allocate CPU P2P send buffer");
74-
int rc = engine_->registerLocalMemory(
75-
resources_.send_buffer_, kP2PTotalBufferSize, kWildcardLocation);
75+
int rc = engine_->registerLocalMemory(resources_.send_buffer_,
76+
kP2PTotalBufferSize, location_);
7677
TORCH_CHECK(rc == 0, "Failed to register CPU P2P send buffer");
7778

7879
resources_.recv_buffer_ = std::malloc(kP2PTotalBufferSize);
7980
TORCH_CHECK(resources_.recv_buffer_ != nullptr,
8081
"Failed to allocate CPU P2P recv buffer");
81-
rc = engine_->registerLocalMemory(
82-
resources_.recv_buffer_, kP2PTotalBufferSize, kWildcardLocation);
82+
rc = engine_->registerLocalMemory(resources_.recv_buffer_,
83+
kP2PTotalBufferSize, location_);
8384
TORCH_CHECK(rc == 0, "Failed to register CPU P2P recv buffer");
8485
} else {
8586
SetCudaDeviceIfNeeded(
@@ -89,15 +90,15 @@ void P2PProxy::AllocateResources() {
8990
cudaMalloc(&resources_.send_buffer_, kP2PTotalBufferSize);
9091
TORCH_CHECK(err == cudaSuccess,
9192
"Failed to allocate CUDA P2P send buffer");
92-
int rc = engine_->registerLocalMemory(
93-
resources_.send_buffer_, kP2PTotalBufferSize, kWildcardLocation);
93+
int rc = engine_->registerLocalMemory(resources_.send_buffer_,
94+
kP2PTotalBufferSize, location_);
9495
TORCH_CHECK(rc == 0, "Failed to register CUDA P2P send buffer");
9596

9697
err = cudaMalloc(&resources_.recv_buffer_, kP2PTotalBufferSize);
9798
TORCH_CHECK(err == cudaSuccess,
9899
"Failed to allocate CUDA P2P recv buffer");
99-
rc = engine_->registerLocalMemory(
100-
resources_.recv_buffer_, kP2PTotalBufferSize, kWildcardLocation);
100+
rc = engine_->registerLocalMemory(resources_.recv_buffer_,
101+
kP2PTotalBufferSize, location_);
101102
TORCH_CHECK(rc == 0, "Failed to register CUDA P2P recv buffer");
102103
}
103104

@@ -152,12 +153,12 @@ void P2PProxy::AllocateResources() {
152153

153154
int rc = engine_->registerLocalMemory(resources_.ctrl_send_region_,
154155
kMaxNumRanks * sizeof(P2PControlSlot),
155-
kWildcardLocation);
156+
location_);
156157
TORCH_CHECK(rc == 0, "Failed to register P2P ctrl send region");
157158

158159
rc = engine_->registerLocalMemory(resources_.ctrl_recv_region_,
159160
kMaxNumRanks * sizeof(P2PControlSlot),
160-
kWildcardLocation);
161+
location_);
161162
TORCH_CHECK(rc == 0, "Failed to register P2P ctrl recv region");
162163
}
163164

0 commit comments

Comments
 (0)