@@ -74,6 +74,7 @@ struct ggml_cl_version {
     cl_uint minor = 0;
 };

+
 struct ggml_cl_compiler_version {
     ADRENO_CL_COMPILER_TYPE type;
     int major = -1;
@@ -91,6 +92,14 @@ struct ggml_cl_compiler_version {
     }
 };

+static size_t align_to(size_t value, size_t to_alignment) {
+    GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
+    GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
+
+    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
+}
+
+
 // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
 static ggml_cl_version parse_cl_version(std::string_view str) {
     size_t major_str_begin = 0;
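
Note on the align_to helper added above: it rounds a byte offset up to the next multiple of to_alignment, leaving already-aligned values unchanged, and the asserts reject a zero or non-power-of-two alignment up front. A quick illustration (example values only, not taken from the patch):

    // align_to(4096, 128) == 4096   -- already a multiple of 128, unchanged
    // align_to(4100, 128) == 4224   -- rounded up to the next 128-byte boundary
    // align_to(1, 64)     == 64
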
@@ -248,6 +257,8 @@ struct ggml_backend_opencl_context {

     int adreno_wave_size;

+    cl_bool non_uniform_workgroups;
+
     cl_context context;
     cl_command_queue queue;

@@ -1397,6 +1408,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
         svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");

+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
+        &backend_ctx->non_uniform_workgroups, 0));
+
     // Print out configurations
 #ifdef GGML_OPENCL_SOA_Q
     GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
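
CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT is the OpenCL 3.0 device query for whether the final work-group in each dimension may be smaller than the requested local size. A defensive variant of the query above, sketched here as a hypothetical helper (not part of the patch), treats a failed query as "unsupported" instead of aborting on runtimes that reject the parameter:

    // Hypothetical helper: any clGetDeviceInfo failure counts as "unsupported".
    static cl_bool query_non_uniform_wg_support(cl_device_id device) {
        cl_bool supported = CL_FALSE;
        cl_int err = clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT,
                                     sizeof(supported), &supported, NULL);
        return err == CL_SUCCESS ? supported : CL_FALSE;
    }
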
@@ -2058,15 +2072,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         // The original tensor memory is divided into scales and quants, i.e.,
         // we first store scales, then quants.
         // Create subbuffer for scales.
-        region.origin = extra_orig->offset + tensor->view_offs + offset;
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
         region.size = size_d;
         extra->d = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
         CL_CHECK(err);
+        auto previous_origin = region.origin;

         // Create subbuffer for quants.
-        region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
         region.size = size_q;
         extra->q = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
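
Background on the alignment change above: clCreateSubBuffer fails with CL_MISALIGNED_SUB_BUFFER_OFFSET when the region origin is not a multiple of the device's base address alignment. A minimal sketch of how that value can be obtained, assuming backend_ctx->alignment is stored in bytes elsewhere in this file:

    // CL_DEVICE_MEM_BASE_ADDR_ALIGN reports the required alignment in bits.
    cl_uint base_align_bits = 0;
    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
                             sizeof(base_align_bits), &base_align_bits, NULL));
    size_t alignment_bytes = base_align_bits / 8;   // e.g. 1024 bits -> 128 bytes
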
@@ -2942,14 +2957,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};

+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     } else {
         unsigned int nth = MIN(64, ne0);
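
The same guard recurs in the mul, silu, relu, clamp, scale and diag_mask_inf hunks below: the fixed {64, 1, 1} local size is passed only when it divides the global size or the device supports non-uniform work-groups; otherwise a NULL local size lets the driver pick one that is legal. A hypothetical helper capturing that rule (not in the patch) could read:

    // Hypothetical helper: return the preferred local size when it is legal for this
    // device/global size combination, otherwise nullptr so the OpenCL driver chooses.
    static size_t * pick_local_work_size(size_t global0, size_t * preferred,
                                         cl_bool non_uniform_workgroups) {
        return (global0 % preferred[0] == 0 || non_uniform_workgroups) ? preferred : nullptr;
    }
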
@@ -3077,14 +3097,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};

+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     } else {
         unsigned int nth = MIN(64, ne0);
@@ -3233,14 +3258,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -3273,14 +3303,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -3320,14 +3355,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -4230,14 +4270,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -4418,14 +4463,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
         size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
         size_t local_work_size[] = {64, 1, 1};

+        size_t * local_work_size_ptr = local_work_size;
+        if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     }
 }