sycl : Implemented reorder Q4_K mmvq #13109

Changes from all commits
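This PR extends the SYCL backend's weight-reorder optimization from GGML_TYPE_Q4_0 to GGML_TYPE_Q4_K. Q4_K tensors placed in a SYCL buffer now get a ggml_tensor_extra_gpu so they can be repacked; a new reorder_qw_q4_k() kernel rewrites the array of block_q4_K structs into a struct-of-arrays layout (all qs bytes, then all scales, then the dm super-block constants); and reorder_qw() becomes a dispatcher over tensor type. The mmvq/mul_mat support checks are taught about Q4_K (the mul_mat reorder is skipped when dmmv is prioritized), the reorder decision goes through should_reorder_tensor(), and the dmmv/mmvq eligibility conditions in ggml_sycl_mul_mat() are factored into can_use_dequantize_mul_mat_vec() and can_use_mul_mat_vec_q().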
```diff
@@ -341,7 +341,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
         assert(tensor->view_src->buffer->buft == buffer->buft);
         return GGML_STATUS_SUCCESS;
     }
-    if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
+    if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
         ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
         tensor->extra = extra;
         ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
```
```diff
@@ -2841,6 +2841,8 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
             return true;
+        case GGML_TYPE_Q4_K:
+            return !g_ggml_sycl_prioritize_dmmv;
         default:
             return false;
     }
```
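For Q4_K, reordering on the mul_mat path is additionally gated on g_ggml_sycl_prioritize_dmmv, so it can be bypassed when the user asks to prioritize the dequantize-mul-mat-vec kernels. A minimal sketch of how such a global toggle is typically initialized; only the variable g_ggml_sycl_prioritize_dmmv appears in this diff, and the environment-variable name below is an assumption:

```cpp
#include <cstdlib>

// Hedged sketch: read a boolean toggle once at startup. The env-var name
// "GGML_SYCL_PRIORITIZE_DMMV" is assumed, not confirmed by this diff.
static bool sycl_env_flag(const char * name) {
    const char * v = std::getenv(name);
    return v != nullptr && *v != '\0' && *v != '0';  // unset or "0" means false
}

static const bool g_ggml_sycl_prioritize_dmmv = sycl_env_flag("GGML_SYCL_PRIORITIZE_DMMV");
```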
```diff
@@ -2858,6 +2860,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
 inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_K:
             return true;
         default:
             return false;
```
```diff
@@ -2883,16 +2886,16 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }

-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-        size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
+                            dpct::queue_ptr stream) {
+    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
     SYCL_CHECK(
         CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
             .wait()));
     GGML_ASSERT((size % sizeof(block_q4_0) == 0));
     GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
     int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
+    auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
     auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;

     stream->parallel_for(
```
```diff
@@ -2906,18 +2909,59 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
                 *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
             }
             *(d_ptr + ib) = x[ib].d;
-        });
+        }).wait_and_throw();

     sycl::free(tmp_buf, *stream);
 }

+static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
+    GGML_ASSERT(size % sizeof(block_q4_K) == 0);
+    GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
+
+    const int nblocks = size / sizeof(block_q4_K);
+
+    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+
+    auto * qs_ptr = data_device;
+    auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
+    auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
+
+    stream->parallel_for(nblocks, [=](auto i) {
+        const block_q4_K * x = (const block_q4_K *) tmp_buf;
+        const int ib = i;
+
+        for (int j = 0; j < QK_K / 2; ++j) {
+            qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
+        }
+
+        for (int j = 0; j < K_SCALE_SIZE; ++j) {
+            scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
+        }
+
+        dm_ptr[ib] = x[ib].dm;
+    }).wait_and_throw();
+
+    sycl::free(tmp_buf, *stream);
+}
+
 static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
-    char*data_device = (char*)src0->data;
+    uint8_t * data_device = (uint8_t *) src0->data;
     size_t ncols = src0->ne[0];
     size_t nrows = src0->ne[1];
     size_t size = ggml_nbytes(src0);

-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            reorder_qw_q4_k(data_device, size, 0, stream);
+            break;
+        default:
+            GGML_ABORT("reorder_qw() called with unsupported type");
+            break;
+    }
 }
```
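To make the new layout concrete: after reorder_qw_q4_k() runs, the tensor's bytes are no longer an array of block_q4_K structs but three contiguous sections — quants, packed scales, and per-block dm constants. Below is a host-side sketch of that view; it is not code from the PR. QK_K = 256 and K_SCALE_SIZE = 12 match ggml-common.h, and half2 here is a plain stand-in for sycl::half2.

```cpp
#include <cstddef>
#include <cstdint>

constexpr int QK_K         = 256;  // quants per super-block (ggml-common.h)
constexpr int K_SCALE_SIZE = 12;   // packed 6-bit scales/mins per super-block

struct half2 { uint16_t x, y; };   // stand-in for sycl::half2 on the host

// Reordered Q4_K tensor, as laid out by reorder_qw_q4_k():
//   [ qs of all blocks | scales of all blocks | dm of all blocks ]
struct reordered_q4_K_view {
    uint8_t * qs;       // nblocks * QK_K/2 bytes of 4-bit quants
    uint8_t * scales;   // nblocks * K_SCALE_SIZE bytes of packed scales/mins
    half2   * dm;       // nblocks {d, dmin} super-block constants
};

static reordered_q4_K_view make_view(uint8_t * base, size_t nblocks) {
    reordered_q4_K_view v;
    v.qs     = base;                                              // section 1
    v.scales = base + (QK_K / 2) * nblocks;                       // section 2
    v.dm     = reinterpret_cast<half2 *>(v.scales + K_SCALE_SIZE * nblocks);
    return v;
}
```

Laying all qs bytes out contiguously lets the reordered mmvq kernel read the quants of neighbouring blocks with coalesced loads — the same rationale as the existing Q4_0 reorder.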
```diff
@@ -2960,8 +3004,18 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
     extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
 }

+static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+           src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+}
+
+static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    return ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+           src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+}
+
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
     int64_t min_compute_capability = INT_MAX;
```

Review discussion on the new helper predicates (resolved):

- "We should use one function to detect if need call deq_mul_mat_vec(), instead of several."
- "What I mean is that the final choice depends on the outputs from […]"
- "Yes, I agree with @Rbiessy and that's why it is currently implemented this way."
```diff
@@ -2984,13 +3038,9 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     }

     // check data types and tensor shapes for custom matrix multiplication kernels:
-    bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+    bool use_dequantize_mul_mat_vec = can_use_dequantize_mul_mat_vec(src0, src1, dst);

-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+    bool use_mul_mat_vec_q = can_use_mul_mat_vec_q(src0, src1, dst);

     bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
```
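As a compile-checkable illustration of what the refactor extracts, here is a toy version of the two predicates over a pared-down tensor type. This is a sketch, not backend code: the constant values are assumptions, and the real dmmv helper consults ggml_sycl_supports_dmmv() rather than a plain quantized-type test.

```cpp
#include <cstdint>

enum toy_type { TOY_F32, TOY_Q4_0, TOY_Q4_K };

struct toy_tensor {
    toy_type type;
    int64_t  ne[2];  // ne[0] = row length (columns), ne[1] = number of src1 columns (batch)
};

constexpr int64_t TOY_DMMV_X         = 32;  // assumed value of GGML_SYCL_DMMV_X
constexpr int64_t TOY_MMVQ_MAX_BATCH = 8;   // assumed value of MMVQ_MAX_BATCH_SIZE

static bool toy_is_quantized(toy_type t) { return t == TOY_Q4_0 || t == TOY_Q4_K; }

// Mirrors can_use_dequantize_mul_mat_vec(): f32 activations/output, row length
// divisible by the dmmv tile width, and exactly one src1 column.
static bool toy_can_use_dmmv(const toy_tensor & s0, const toy_tensor & s1, const toy_tensor & d) {
    return toy_is_quantized(s0.type) && s1.type == TOY_F32 && d.type == TOY_F32 &&
           s0.ne[0] % TOY_DMMV_X == 0 && s1.ne[1] == 1;
}

// Mirrors can_use_mul_mat_vec_q(): same type requirements, but mmvq also
// accepts small batches of src1 columns.
static bool toy_can_use_mmvq(const toy_tensor & s0, const toy_tensor & s1, const toy_tensor & d) {
    return toy_is_quantized(s0.type) && s1.type == TOY_F32 && d.type == TOY_F32 &&
           s1.ne[1] <= TOY_MMVQ_MAX_BATCH;
}
```

Note that for a single f32 column both predicates can be true at once, which is why the review discussion above points out that the final kernel choice still depends on combining these outputs with further conditions downstream.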