Skip to content

Commit 15bbce9

Browse files
committed
opencl: fix couple crashes
* fix kernel launches failed on devices which do not support non-uniform work-groups. When non-uniform work-groups are not supported, set `local_work_size` to NULL (= let driver choose the work-group sizes). This patch does not cover everything - just the cases tested by test-backend-ops. * fix sub-buffer creation failed due to `cl_buffer_region::origin` not being aligned to `CL_DEVICE_MEM_BASE_ADDR_ALIGN`.
1 parent 80982e8 commit 15bbce9

File tree

1 file changed

+73
-23
lines changed

1 file changed

+73
-23
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 73 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ struct ggml_cl_version {
7474
cl_uint minor = 0;
7575
};
7676

77+
7778
struct ggml_cl_compiler_version {
7879
ADRENO_CL_COMPILER_TYPE type;
7980
int major = -1;
@@ -91,6 +92,14 @@ struct ggml_cl_compiler_version {
9192
}
9293
};
9394

95+
static size_t align_to(size_t value, size_t to_alignment) {
96+
GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
97+
GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
98+
99+
return ((value + to_alignment - 1) / to_alignment) * to_alignment;
100+
}
101+
102+
94103
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
95104
static ggml_cl_version parse_cl_version(std::string_view str) {
96105
size_t major_str_begin = 0;
@@ -248,6 +257,8 @@ struct ggml_backend_opencl_context {
248257

249258
int adreno_wave_size;
250259

260+
cl_bool non_uniform_workgroups;
261+
251262
cl_context context;
252263
cl_command_queue queue;
253264

@@ -1397,6 +1408,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
13971408
GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
13981409
svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
13991410

1411+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
1412+
&backend_ctx->non_uniform_workgroups, 0));
1413+
14001414
// Print out configurations
14011415
#ifdef GGML_OPENCL_SOA_Q
14021416
GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
@@ -2058,15 +2072,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
20582072
// The original tensor memory is divided into scales and quants, i.e.,
20592073
// we first store scales, then quants.
20602074
// Create subbuffer for scales.
2061-
region.origin = extra_orig->offset + tensor->view_offs + offset;
2075+
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
20622076
region.size = size_d;
20632077
extra->d = clCreateSubBuffer(
20642078
extra_orig->data_device, CL_MEM_READ_WRITE,
20652079
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
20662080
CL_CHECK(err);
2081+
auto previous_origin = region.origin;
20672082

20682083
// Create subbuffer for quants.
2069-
region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
2084+
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
20702085
region.size = size_q;
20712086
extra->q = clCreateSubBuffer(
20722087
extra_orig->data_device, CL_MEM_READ_WRITE,
@@ -2942,14 +2957,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
29422957
size_t global_work_size[] = {(size_t)n, 1, 1};
29432958
size_t local_work_size[] = {64, 1, 1};
29442959

2960+
size_t * local_work_size_ptr = local_work_size;
2961+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
2962+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
2963+
}
2964+
29452965
#ifdef GGML_OPENCL_PROFILING
29462966
cl_event evt;
2947-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2967+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
29482968

29492969
g_profiling_info.emplace_back();
2950-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2970+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
29512971
#else
2952-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2972+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
29532973
#endif
29542974
} else {
29552975
unsigned int nth = MIN(64, ne0);
@@ -3077,14 +3097,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
30773097
size_t global_work_size[] = {(size_t)n, 1, 1};
30783098
size_t local_work_size[] = {64, 1, 1};
30793099

3100+
size_t * local_work_size_ptr = local_work_size;
3101+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3102+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3103+
}
3104+
30803105
#ifdef GGML_OPENCL_PROFILING
30813106
cl_event evt;
3082-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3107+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
30833108

30843109
g_profiling_info.emplace_back();
3085-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3110+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
30863111
#else
3087-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3112+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
30883113
#endif
30893114
} else {
30903115
unsigned int nth = MIN(64, ne0);
@@ -3233,14 +3258,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
32333258
size_t global_work_size[] = {(size_t)n, 1, 1};
32343259
size_t local_work_size[] = {64, 1, 1};
32353260

3261+
size_t * local_work_size_ptr = local_work_size;
3262+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3263+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3264+
}
3265+
32363266
#ifdef GGML_OPENCL_PROFILING
32373267
cl_event evt;
3238-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3268+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
32393269

32403270
g_profiling_info.emplace_back();
3241-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3271+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
32423272
#else
3243-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3273+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
32443274
#endif
32453275
}
32463276

@@ -3273,14 +3303,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
32733303
size_t global_work_size[] = {(size_t)n, 1, 1};
32743304
size_t local_work_size[] = {64, 1, 1};
32753305

3306+
size_t * local_work_size_ptr = local_work_size;
3307+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3308+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3309+
}
3310+
32763311
#ifdef GGML_OPENCL_PROFILING
32773312
cl_event evt;
3278-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3313+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
32793314

32803315
g_profiling_info.emplace_back();
3281-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3316+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
32823317
#else
3283-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3318+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
32843319
#endif
32853320
}
32863321

@@ -3320,14 +3355,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
33203355
size_t global_work_size[] = {(size_t)n, 1, 1};
33213356
size_t local_work_size[] = {64, 1, 1};
33223357

3358+
size_t * local_work_size_ptr = local_work_size;
3359+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3360+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3361+
}
3362+
33233363
#ifdef GGML_OPENCL_PROFILING
33243364
cl_event evt;
3325-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3365+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
33263366

33273367
g_profiling_info.emplace_back();
3328-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3368+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
33293369
#else
3330-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3370+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
33313371
#endif
33323372
}
33333373

@@ -4230,14 +4270,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
42304270
size_t global_work_size[] = {(size_t)n, 1, 1};
42314271
size_t local_work_size[] = {64, 1, 1};
42324272

4273+
size_t * local_work_size_ptr = local_work_size;
4274+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
4275+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4276+
}
4277+
42334278
#ifdef GGML_OPENCL_PROFILING
42344279
cl_event evt;
4235-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4280+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
42364281

42374282
g_profiling_info.emplace_back();
4238-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4283+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
42394284
#else
4240-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4285+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
42414286
#endif
42424287
}
42434288

@@ -4418,14 +4463,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
44184463
size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
44194464
size_t local_work_size[] = {64, 1, 1};
44204465

4466+
size_t * local_work_size_ptr = local_work_size;
4467+
if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
4468+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4469+
}
4470+
44214471
#ifdef GGML_OPENCL_PROFILING
44224472
cl_event evt;
4423-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4473+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
44244474

44254475
g_profiling_info.emplace_back();
4426-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4476+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
44274477
#else
4428-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4478+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
44294479
#endif
44304480
}
44314481
}

0 commit comments

Comments
 (0)