
Commit d62a361

ggml-dsp: make GGML_OP_ADD faster on the cDSP side

1 parent 24d0ef2 commit d62a361
File tree

5 files changed: +87 -37 lines changed


ggml/src/ggml-hexagon/CMakeLists.txt (+3)
@@ -22,6 +22,9 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
     message("Debug mode:${DEBUG_FLAG}")
 else()
     set(DEBUG_FLAG "-DNDEBUG -Wall")
+    #manually disable all foreground logs in ggml-hexagon/CMakeLists.txt to
+    #make comparing NPU performance through llama-bench clearer
+    #set(DEBUG_FLAG "-DNDEBUG -Wall -DDISABLE_ALL_LOG")
     message("Release mode:${DEBUG_FLAG}")
 endif()

ggml/src/ggml-hexagon/ggml-hexagon.cpp (+29 -1)
@@ -141,8 +141,16 @@ struct ggml_backend_hexagon_context;
 
 #define GGMLHEXAGON_LOG_ERROR(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
 #define GGMLHEXAGON_LOG_WARN(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+
+#if !defined (DISABLE_ALL_LOG)
 #define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
 #define GGMLHEXAGON_LOG_VERBOSE(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_CONT , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+#else
+//manually disable all foreground logs via ggml-hexagon/CMakeLists.txt to
+//make comparing NPU performance through llama-bench clearer
+#define GGMLHEXAGON_LOG_INFO(...)
+#define GGMLHEXAGON_LOG_VERBOSE(...)
+#endif
 
 #if GGMLHEXAGON_DEBUG
 #define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
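The empty-macro pattern above is what strips the logging at compile time: when GGMLHEXAGON_LOG_INFO expands to nothing, the call site and the evaluation of its arguments disappear during preprocessing, so they cost nothing during a bench run. A minimal standalone sketch of the same pattern (illustrative names, not the actual ggml-hexagon symbols):

    #include <stdio.h>

    #if !defined(DISABLE_ALL_LOG)
    #define LOG_INFO(...) printf(__VA_ARGS__)
    #else
    #define LOG_INFO(...)   //expands to nothing: the call and its arguments vanish
    #endif

    int main(void) {
        //built with -DDISABLE_ALL_LOG, this line compiles away entirely
        LOG_INFO("add: n=%d\n", 42);
        return 0;
    }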
@@ -6365,7 +6373,27 @@ struct ggml_backend_hexagon_reg_context {
 
 static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) {
     GGML_UNUSED(reg);
-    return "ggml-hexagon";
+    //return "ggml-hexagon";
+
+    //return the accurate backend name rather than the generic "ggml-hexagon" to
+    //make comparing NPU performance through llama-bench clearer
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend);
+        return "Hexagon-cDSP";
+    }
+
+    if (HWACCEL_QNN == g_hexagon_appcfg.hwaccel_approach) {
+        if (HEXAGON_BACKEND_QNNNPU == g_hexagon_appcfg.hexagon_backend)
+            return "QNN-NPU";
+
+        if (HEXAGON_BACKEND_QNNGPU == g_hexagon_appcfg.hexagon_backend)
+            return "QNN-GPU";
+
+        if (HEXAGON_BACKEND_QNNCPU == g_hexagon_appcfg.hexagon_backend)
+            return "QNN-CPU";
+    }
+
+    return "unknown";
 }
 
 static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) {

ggml/src/ggml-hexagon/kernels/add.c (+29 -24)
@@ -1,14 +1,21 @@
 #include "ggml-dsp.h"
 
-inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) {
+static inline void l2fetch(const void * p, uint32_t stride,
+                           uint32_t width, uint32_t height,
+                           uint32_t dir) {
+    uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height);
+    __asm__ __volatile__ (" l2fetch(%0,%1) " : :"r"(p),"r"(control));
+}
+
+static inline void ggmlhexagon_dsp_add_f32(const int n, float * GGML_RESTRICT z, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) {
     HVX_Vector * va;
     HVX_Vector * vb;
     HVX_Vector * vc;
     HVX_Vector qf32;
-    const int FLOATS_PER_VECTOR = 128 / sizeof(float);
-    const int block  = n / FLOATS_PER_VECTOR;
-    const int left   = n % FLOATS_PER_VECTOR;
-    const int blocks = block * FLOATS_PER_VECTOR;
+    const size_t FLOATS_PER_VECTOR = 128 / sizeof(float);
+    const size_t block  = n / FLOATS_PER_VECTOR;
+    const size_t left   = n % FLOATS_PER_VECTOR;
+    const size_t blocks = block * FLOATS_PER_VECTOR;
 
     if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) {
         GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y);
@@ -21,11 +28,13 @@ inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float
     va = (HVX_Vector *)x;
     vb = (HVX_Vector *)y;
     vc = (HVX_Vector *)z;
+    //unrolling would be better, but it needs more careful checks for the various cases, and I think the DSP also doesn't like branchy code
     for (size_t i = 0; i < block; ++i) {
+        l2fetch(va + VLEN, VLEN, VLEN, 1, 0);
+        l2fetch(vb + VLEN, VLEN, VLEN, 1, 0);
         //*vc++ = Q6_Vsf_vadd_VsfVsf(*va++, *vb++);
         qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++);
-        *vc = Q6_Vsf_equals_Vqf32(qf32);
-        vc++;
+        *vc++ = Q6_Vsf_equals_Vqf32(qf32);
     }
 
     if (left > 0) {
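The loop body adds in the qf32 intermediate format and converts back to IEEE single precision afterward. If the unrolling suggested in the comment were tried, one step might look like the sketch below (hypothetical code, not part of this commit; it assumes the same HVX intrinsics and headers as add.c, 128-byte alignment, and at least two full vectors remaining):

    //hypothetical unroll-by-two body for the main loop above
    static inline void add_two_hvx_vectors(HVX_Vector * GGML_RESTRICT vc,
                                           const HVX_Vector * GGML_RESTRICT va,
                                           const HVX_Vector * GGML_RESTRICT vb) {
        //add in qf32 (a wider intermediate float format), then convert back to sf
        HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(va[0], vb[0]);
        HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(va[1], vb[1]);
        vc[0] = Q6_Vsf_equals_Vqf32(q0);
        vc[1] = Q6_Vsf_equals_Vqf32(q1);
    }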
@@ -49,6 +58,17 @@ static void ggml_compute_forward_add_f32(
 
     GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
+    const int rank = ggml_n_dims(src0);
+    if (1 == rank) {
+        //element-wise addition of two vectors
+        const size_t len = src0->ne[0];
+        float * dst_ptr  = (float *) (dst->data);
+        float * src0_ptr = (float *) (src0->data);
+        float * src1_ptr = (float *) (src1->data);
+        ggmlhexagon_dsp_add_f32(len, dst_ptr, src0_ptr, src1_ptr);
+        return;
+    }
+
     const int ith = 0;
     const int nth = 1;
 
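A scalar reference for what this rank-1 fast path computes, useful for spot-checking the HVX kernel against the host (a sketch, not code from this commit):

    //reference implementation: z[i] = x[i] + y[i] for i in [0, n)
    static void add_f32_ref(const int n, float * z, const float * x, const float * y) {
        for (int i = 0; i < n; ++i) {
            z[i] = x[i] + y[i];
        }
    }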
@@ -115,24 +135,9 @@ static void ggml_compute_forward_add_f32(
     }
 
 //FIXME: why does this fail with test-backend-ops when the ion rpc mempool is disabled?
-int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst)
-{
+int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__);
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_f32(src0, src1, dst);
-                } else {
-                    GGML_ABORT("fatal error");
-                }
-                break;
-            }
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
+    ggml_compute_forward_add_f32(src0, src1, dst);
     GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__);
     return 0;
 }

ggml/src/ggml-hexagon/kernels/ggml-dsp.c (+9)
@@ -178,6 +178,15 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     return ggml_is_contiguous_0(tensor);
 }
 
+int ggml_n_dims(const struct ggml_tensor * tensor) {
+    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+        if (tensor->ne[i] > 1) {
+            return i + 1;
+        }
+    }
+    return 1;
+}
+
 void ggml_abort(const char * file, int line, const char * fmt, ...) {
     GGMLHEXAGON_LOG_DEBUG("enter ggml_abort");
     abort();
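ggml_n_dims returns one plus the index of the highest dimension whose extent exceeds 1, and never less than 1, so trailing ne[] entries equal to 1 are ignored. A standalone check of that logic (the stub struct is an assumption for the sketch, not the real ggml_tensor):

    #include <assert.h>

    #define MAX_DIMS 4                         //GGML_MAX_DIMS is 4 in ggml
    struct tensor_stub { long ne[MAX_DIMS]; }; //pared-down stand-in for ggml_tensor

    static int n_dims(const struct tensor_stub * t) {
        for (int i = MAX_DIMS - 1; i >= 1; --i) {
            if (t->ne[i] > 1) return i + 1;
        }
        return 1;
    }

    int main(void) {
        struct tensor_stub vec = {{8, 1, 1, 1}}; //plain vector            -> rank 1
        struct tensor_stub mat = {{8, 4, 1, 1}}; //matrix                  -> rank 2
        struct tensor_stub t3  = {{8, 1, 3, 1}}; //inner ne[1] == 1 is ok  -> rank 3
        assert(n_dims(&vec) == 1);
        assert(n_dims(&mat) == 2);
        assert(n_dims(&t3)  == 3);
        return 0;
    }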

ggml/src/ggml-hexagon/kernels/ggml-dsp.h (+17 -12)
@@ -31,6 +31,8 @@ extern "C" {
 
 #define ALIGN_128_BYTE 128
 
+#define VLEN 128
+
 #define GGML_UNUSED(x) (void)(x)
 
 #define UNUSED GGML_UNUSED
@@ -50,6 +52,8 @@ extern "C" {
 #define GGML_MEM_ALIGN 16
 #endif
 
+#define GGML_API extern
+
 #ifdef __cplusplus
 // restrict not standard in C++
 # if defined(__GNUC__)
@@ -142,21 +146,22 @@ enum ggml_type {
 
 typedef double ggml_float;
 
-int64_t ggml_time_ms(void);
-int64_t ggml_time_us(void);
+GGML_API int64_t ggml_time_ms(void);
+GGML_API int64_t ggml_time_us(void);
 
-size_t ggml_nbytes(const struct ggml_tensor * tensor);
-int64_t ggml_nrows(const struct ggml_tensor * tensor);
-bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-void ggml_abort(const char * file, int line, const char * fmt, ...);
-bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
-bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+GGML_API size_t ggml_nbytes(const struct ggml_tensor * tensor);
+GGML_API int64_t ggml_nrows(const struct ggml_tensor * tensor);
+GGML_API int ggml_n_dims(const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
-void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor);
-void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data);
-void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...);
+GGML_API void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor);
+GGML_API void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data);
+GGML_API void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...);
 
-int ggmlop_get_thread_counts(void);
+GGML_API int ggmlop_get_thread_counts(void);
 
 #ifdef __cplusplus
 }
