|
@@ -1,4 +1,5 @@
 #include "binbcast.cuh"
+#include <driver_types.h>
 #include <cstdint>

 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
@@ -56,6 +57,37 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
     }
 }

+// sum n_srcs same-layout sources element-wise into dst (fuses a chain of consecutive adds into one launch)
+template<typename src_t, typename dst_t>
+static __global__ void k_fused_add(const src_t ** src, int n_srcs, dst_t * dst,
+        const int ne0, const int ne1, const int ne2, const int ne3,
+        const int s1, const int s2, const int s3) {
+
+    // x covers ne0 with a grid-stride loop, y covers ne1, z is unraveled into (i2, i3)
+    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i1  =  blockDim.y*blockIdx.y + threadIdx.y;
+    const int i2  = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+    const int i3  = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    // all tensors share the same layout, so one row offset serves every src and dst
+    const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
+
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+        float sum = 0.0f;
+        for (int i = 0; i < n_srcs; ++i) {
+            const src_t * src_row = src[i] + i_dst;
+            sum += (float) src_row[i0];
+        }
+        dst_row[i0] = (dst_t) sum;
+    }
+}
+
 template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
 static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
         int ne0, int ne1, int ne2, int ne3,

@@ -331,6 +363,51 @@ void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }

+
+void ggml_cuda_op_fused_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst, int n_fuse) {
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(1 <= n_fuse && n_fuse <= 8);
+
+    // collect the n_fuse + 1 source pointers on the host (n_fuse <= 8, so at most 9 entries)
+    const float * h_src[9];
+    for (int i = 0; i < n_fuse + 1; ++i) {
+        h_src[i] = (const float *) dst->src[i]->data;
+    }
+
+    // upload the pointer table; cudaMalloc/cudaMemcpy block the host, and the cudaFree
+    // below relies on its implicit synchronization to keep d_src alive for the kernel
+    const float ** d_src;
+    CUDA_CHECK(cudaMalloc((void **) &d_src, (n_fuse + 1) * sizeof(float *)));
+    CUDA_CHECK(cudaMemcpy(d_src, h_src, (n_fuse + 1) * sizeof(float *), cudaMemcpyHostToDevice));
+
+    // all tensors in the fused op share the same shape and layout
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+    const int ne2 = dst->ne[2];
+    const int ne3 = dst->ne[3];
+    const int s1 = dst->nb[1] / sizeof(float);
+    const int s2 = dst->nb[2] / sizeof(float);
+    const int s3 = dst->nb[3] / sizeof(float);
+
+    // same launch configuration as the generic bin_bcast path:
+    // x covers ne0, y covers ne1, z covers the flattened (ne2, ne3) plane
+    const int block_size = 128;
+    dim3 block_dims;
+    block_dims.x = std::min<unsigned int>(ne0, block_size);
+    block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+    block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+    dim3 block_nums((ne0 + block_dims.x - 1) / block_dims.x,
+                    (ne1 + block_dims.y - 1) / block_dims.y,
+                    (ne2*ne3 + block_dims.z - 1) / block_dims.z);
+
+    k_fused_add<float, float>
+        <<<block_nums, block_dims, 0, stream>>>(d_src, n_fuse + 1, (float *) dst->data, ne0, ne1, ne2, ne3, s1, s2, s3);
+
+    CUDA_CHECK(cudaFree(d_src));
+}
+
 void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];

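
Editor's note (not part of the diff): the pointer-table upload in ggml_cuda_op_fused_add uses blocking cudaMalloc/cudaMemcpy/cudaFree, so every fused add pays a host stall and an implicit device sync. Below is a minimal sketch of a stream-ordered alternative, assuming CUDA 11.2+ with memory-pool support; the helper name upload_src_table and the CUDA_TRY macro are illustrative only, not ggml or PR API (in-tree, ggml's own pool allocator would be the natural fit).

// Sketch only: stage the source-pointer table with the stream-ordered allocator so that
// allocation, copy, kernel launch and free are all enqueued on the same stream.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_TRY(call) do {                                                  \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "%s: %s\n", #call, cudaGetErrorString(err_));    \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Copies n_srcs device pointers into a device-side array without blocking the host.
// The returned pointer must be released with cudaFreeAsync on the same stream.
static const float ** upload_src_table(const float * const * h_src, int n_srcs, cudaStream_t stream) {
    const float ** d_src = nullptr;
    CUDA_TRY(cudaMallocAsync((void **) &d_src, n_srcs*sizeof(*d_src), stream));
    // h_src is pageable, so the runtime stages the copy; a pinned buffer would make it fully asynchronous
    CUDA_TRY(cudaMemcpyAsync(d_src, h_src, n_srcs*sizeof(*d_src), cudaMemcpyHostToDevice, stream));
    return d_src;
}

// Inside ggml_cuda_op_fused_add the sequence would then look roughly like:
//     const float ** d_src = upload_src_table(h_src, n_fuse + 1, stream);
//     k_fused_add<float, float><<<block_nums, block_dims, 0, stream>>>(d_src, n_fuse + 1, ...);
//     CUDA_TRY(cudaFreeAsync(d_src, stream));  // ordered after the kernel on the same stream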
|
|