Skip to content

Commit f99c2cd

Browse files
committed
use cudaMemcpy3DPeerAsync
1 parent 1659cd1 commit f99c2cd

File tree

1 file changed

+18
-11
lines changed

1 file changed

+18
-11
lines changed

ggml-cuda.cu

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,9 @@
6868
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
6969
#endif
7070
#define cudaMemcpy hipMemcpy
71-
#define cudaMemcpy2DAsync hipMemcpy2DAsync
7271
#define cudaMemcpyAsync hipMemcpyAsync
7372
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
73+
#define cudaMemcpy2DAsync hipMemcpy2DAsync
7474
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
7575
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
7676
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
@@ -8259,17 +8259,24 @@ static void ggml_cuda_op_mul_mat(
82598259
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
82608260
dhf_dst_i += src1_col_0*ne0 + row_low[id];
82618261

8262-
if (kind == cudaMemcpyDeviceToDevice && id != g_main_device) {
8263-
// there is no cudaMemcpy2DPeerAsync so we need to copy each row separately
8264-
for (int64_t i = 0; i < src1_ncols; ++i) {
8265-
CUDA_CHECK(cudaMemcpyPeerAsync(dhf_dst_i + i*ne0, g_main_device,
8266-
dst_dd_i + i*row_diff, id,
8267-
row_diff*sizeof(float), stream));
8262+
#if !defined(GGML_USE_HIPBLAS)
8263+
if (kind == cudaMemcpyDeviceToDevice && id != g_main_device) {
8264+
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
8265+
cudaMemcpy3DPeerParms p = {};
8266+
p.dstDevice = g_main_device;
8267+
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), ne0, src1_ncols);
8268+
p.srcDevice = id;
8269+
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
8270+
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
8271+
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
8272+
} else
8273+
#endif
8274+
{
8275+
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
8276+
dst_dd_i, row_diff*sizeof(float),
8277+
row_diff*sizeof(float), src1_ncols,
8278+
kind, stream));
82688279
}
8269-
} else {
8270-
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
8271-
row_diff*sizeof(float), src1_ncols, kind, stream));
8272-
}
82738280
} else {
82748281
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
82758282
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));

0 commit comments

Comments
 (0)