-
Notifications
You must be signed in to change notification settings - Fork 12k
[SYCL] Optimize mul_mat for Q4_0 on Intel GPU #12035
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
78e232a
7a6b48d
63e5285
5cfde90
4eaab12
0e91e0e
b3570b9
f111721
c541d6a
30ddc90
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,9 @@ | |
#include "dpct/helper.hpp" | ||
#include "ggml-sycl.h" | ||
#include "presets.hpp" | ||
#include "sycl_hw.hpp" | ||
|
||
|
||
#if GGML_SYCL_DNNL | ||
#include "dnnl.hpp" | ||
#include "dnnl_sycl.hpp" | ||
|
@@ -35,7 +38,10 @@ | |
void* ggml_sycl_host_malloc(size_t size); | ||
void ggml_sycl_host_free(void* ptr); | ||
|
||
|
||
// Debug switch: the GGML_SYCL_DEBUG macro defined right below only acts when this is non-zero.
static int g_ggml_sycl_debug = 0; | ||
// NOTE(review): presumably a switch that turns the optimization path off; nothing in this
// hunk reads or writes it — confirm where it is set and consumed.
static int g_ggml_sycl_disable_optimize = 0; | ||
|
||
#define GGML_SYCL_DEBUG(...) \ | ||
do { \ | ||
if (g_ggml_sycl_debug) \ | ||
|
@@ -182,18 +188,24 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try { | |
} | ||
|
||
////////////////////// | ||
// Bundle of optional optimization switches. Every switch defaults to "off";
// check_gpu_optimize_feature() decides per GPU architecture which ones to turn on.
struct optimize_feature {
    // True when the "reorder" optimization is enabled.
    bool reorder{ false };
};
|
||
// Per-device record; ggml_sycl_device_info keeps one of these per slot in its devices[] array.
// Besides compute capability, virtual-memory support and total VRAM it carries the
// sycl_hw_info for the device and the optimize_feature flags; the latter are copied into
// ggml_backend_sycl_context when a context is constructed for that device.
// NOTE(review): total_vram has no unit stated here — presumably bytes; confirm against the code that fills it.
struct sycl_device_info { | ||
int cc; // compute capability | ||
// int nsm; // number of streaming multiprocessors | ||
// size_t smpb; // max. shared memory per block | ||
Comment on lines
+197
to
+198
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since there is no use for this right now, it could be good to start removing these as well. |
||
bool vmm; // virtual memory support | ||
size_t total_vram; | ||
sycl_hw_info hw_info; | ||
optimize_feature opt_feature; | ||
}; | ||
|
||
|
||
struct ggml_sycl_device_info { | ||
int device_count; | ||
|
||
struct sycl_device_info { | ||
int cc; // compute capability | ||
// int nsm; // number of streaming multiprocessors | ||
// size_t smpb; // max. shared memory per block | ||
bool vmm; // virtual memory support | ||
size_t total_vram; | ||
}; | ||
|
||
sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {}; | ||
|
||
std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {}; | ||
|
@@ -260,17 +272,46 @@ struct ggml_tensor_extra_gpu { | |
// tensors | ||
dpct::event_ptr events[GGML_SYCL_MAX_DEVICES] | ||
[GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs | ||
optimize_feature optimized_feature; | ||
}; | ||
|
||
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={}); | ||
|
||
// Builds the optimize_feature set for a GPU architecture.
// `reorder` is switched on only when `arch` equals one of the Intel GPU architectures
// enumerated in the comparison chain below; any other value leaves it at its default (false).
// Returns the resulting flags by value.
// NOTE(review): `arch` is only compared, never written — a const reference would suffice
// and would also accept temporaries; confirm no caller relies on the non-const signature.
inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) { | ||
optimize_feature opt; | ||
|
||
opt.reorder = | ||
(arch == syclex::architecture::intel_gpu_dg1 || | ||
arch == syclex::architecture::intel_gpu_acm_g10 || | ||
arch == syclex::architecture::intel_gpu_acm_g11 || | ||
arch == syclex::architecture::intel_gpu_acm_g12 || | ||
arch == syclex::architecture::intel_gpu_pvc || | ||
arch == syclex::architecture::intel_gpu_pvc_vg || | ||
arch == syclex::architecture::intel_gpu_mtl_u || | ||
arch == syclex::architecture::intel_gpu_mtl_s || | ||
arch == syclex::architecture::intel_gpu_mtl_h || | ||
arch == syclex::architecture::intel_gpu_arl_u || | ||
arch == syclex::architecture::intel_gpu_arl_s || | ||
arch == syclex::architecture::intel_gpu_arl_h || | ||
arch == syclex::architecture::intel_gpu_bmg_g21 || | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have access to a BMG gpu, I'll reply later with perf numbers, since I guess you'd want to add them to the README. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Alcpz - did you manage to get BMG GPU perf numbers? |
||
arch == syclex::architecture::intel_gpu_lnl_m | ||
); | ||
|
||
return opt; | ||
} | ||
|
||
struct ggml_backend_sycl_context { | ||
int device; | ||
std::string name; | ||
optimize_feature opt_feature; | ||
bool optimized_graph=false; | ||
|
||
queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } }; | ||
|
||
explicit ggml_backend_sycl_context(int device) : | ||
device(device), | ||
name(GGML_SYCL_NAME + std::to_string(device)) { | ||
opt_feature = ggml_sycl_info().devices[device].opt_feature; | ||
} | ||
|
||
queue_ptr stream(int device, int stream) { | ||
|
@@ -680,5 +721,4 @@ bool gpu_has_xmx(sycl::device &dev); | |
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, | ||
const ggml_tensor *src1, ggml_tensor *dst, | ||
const ggml_sycl_op_flatten_t op); | ||
|
||
#endif // GGML_SYCL_COMMON_HPP |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,8 @@ | |
#include "common.hpp" | ||
|
||
// Pointer to a dequantizer that takes a single data pointer `vx` plus a block index `ib`
// and a quant index `iqs`, and delivers its result through `v`.
using dequantize_kernel_t = void (*)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

// Pointer to a dequantizer for the reordered layout: the scale pointer `d` and the quant
// pointer `qs` are passed separately (this is the signature of dequantize_q4_0_reorder).
using dequantize_kernel_t_reorder = void (*)(const void * d, const int64_t ib, const void * qs,
                                             const int iqs, dfloat2 & v);
|
||
static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib, | ||
const int iqs, dfloat2 &v) { | ||
|
@@ -40,6 +42,29 @@ static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib, | |
#endif // GGML_SYCL_F16 | ||
} | ||
|
||
// Dequantizes one packed byte of Q4_0 data whose scales and quants are passed as separate arrays.
//   d_ptr : read as an array of sycl::half; element `ib` is the scale `d` used here
//   ib    : index of the scale inside d_ptr
//   qs    : read as an array of bytes; element `iqs` packs two 4-bit values
//   iqs   : index of the byte inside qs
//   v     : output pair — x() is derived from the low nibble, y() from the high nibble,
//           each ending up as (nibble - 8) * d
// Both preprocessor branches apply the same (value - 8.0f) * d formula; they differ only in
// the accessor spelling (s0()/s1() versus x()/y()).
// NOTE(review): the commented-out block_q4_0 cast below refers to `vx`, which is not a
// parameter of this function — it looks like a leftover from dequantize_q4_0.
static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int64_t ib, const void *qs, | ||
const int iqs, dfloat2 &v) { | ||
// const block_q4_0 * x = (const block_q4_0 *) vx; | ||
|
||
const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib); | ||
|
||
const int vui = *((const uint8_t *)qs+iqs); | ||
Comment on lines
+49
to
+51
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The overall contribution is great. I was doing similar work for the Q4_K quantization, and this is quite helpful. |
||
|
||
v.x() = vui & 0xF; | ||
v.y() = vui >> 4; | ||
|
||
#ifdef GGML_SYCL_F16 | ||
// v = v - {8.0f, 8.0f}; | ||
// v = v * {d, d}; | ||
v.s0() = (v.s0() - 8.0f) * d; | ||
v.s1() = (v.s1() - 8.0f) * d; | ||
|
||
#else | ||
v.x() = (v.x() - 8.0f) * d; | ||
v.y() = (v.y() - 8.0f) * d; | ||
#endif // GGML_SYCL_F16 | ||
Comment on lines
+56
to
+65
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A lot of the new code has the same functionality, with very minor differences in how dawta is accessed. I worry about the combinatorial explosion of having duplicated code in order to maintain support for all non-reordered and reordered quants. Long term, I think it's best for the backend to avoid going in this direction. |
||
} | ||
|
||
static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib, | ||
const int iqs, dfloat2 &v) { | ||
const block_q4_1 * x = (const block_q4_1 *) vx; | ||
|
@@ -167,6 +192,36 @@ static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restri | |
} | ||
} | ||
|
||
template<typename dst_t> | ||
static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32, | ||
const sycl::nd_item<3> &item_ct1) { | ||
|
||
const int64_t i = item_ct1.get_group(2); | ||
auto k=nb32; | ||
// assume 32 threads | ||
const int64_t tid = item_ct1.get_local_id(2); | ||
const int lane_ib = i * WARP_SIZE + tid; | ||
|
||
if (lane_ib >= k / QK4_0) { | ||
return; | ||
} | ||
|
||
dst_t * y_ptr = yy + lane_ib * QK4_0; | ||
|
||
auto qs = (const uint8_t*)vx + lane_ib * QK4_0 / 2; | ||
auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k / 2) + lane_ib; | ||
|
||
const float d = float(*s_ptr); | ||
|
||
#pragma unroll | ||
for (int l = 0; l < QK4_0 / 2; ++l) { | ||
int vq = qs[l]; | ||
y_ptr[l + 0] = d * ((vq & 0xF) - 8); | ||
y_ptr[l + 16] = d * ((vq >> 4) - 8); | ||
} | ||
|
||
} | ||
|
||
template<typename dst_t> | ||
static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32, | ||
const sycl::nd_item<3> &item_ct1) { | ||
|
Uh oh!
There was an error while loading. Please reload this page.