
Commit d62a361

ggml-dsp: make GGML_OP_ADD faster on the cDSP side

1 parent 24d0ef2 commit d62a361
File tree

5 files changed: +87 -37 lines changed


ggml/src/ggml-hexagon/CMakeLists.txt (+3)
@@ -22,6 +22,9 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
     message("Debug mode:${DEBUG_FLAG}")
 else()
     set(DEBUG_FLAG "-DNDEBUG -Wall")
+    #manually disable all foreground logs in ggml-hexagon/CMakeLists.txt to
+    #make comparing NPU performance through llama-bench clearer
+    #set(DEBUG_FLAG "-DNDEBUG -Wall -DDISABLE_ALL_LOG")
     message("Release mode:${DEBUG_FLAG}")
 endif()

ggml/src/ggml-hexagon/ggml-hexagon.cpp (+29 -1)
@@ -141,8 +141,16 @@ struct ggml_backend_hexagon_context;
 
 #define GGMLHEXAGON_LOG_ERROR(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
 #define GGMLHEXAGON_LOG_WARN(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+
+#if !defined (DISABLE_ALL_LOG)
 #define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
 #define GGMLHEXAGON_LOG_VERBOSE(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_CONT , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+#else
+//manually disable all foreground logs via ggml-hexagon/CMakeLists.txt to
+//make comparing NPU performance through llama-bench clearer
+#define GGMLHEXAGON_LOG_INFO(...)
+#define GGMLHEXAGON_LOG_VERBOSE(...)
+#endif
 
 #if GGMLHEXAGON_DEBUG
 #define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
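The empty-macro pattern above is what strips the logging at compile time: when GGMLHEXAGON_LOG_INFO expands to nothing, the call site and the evaluation of its arguments disappear during preprocessing, so they cost nothing during a bench run. A minimal standalone sketch of the same pattern (illustrative names, not the actual ggml-hexagon symbols):

    #include <stdio.h>

    #if !defined(DISABLE_ALL_LOG)
    #define LOG_INFO(...) printf(__VA_ARGS__)
    #else
    #define LOG_INFO(...)   //expands to nothing: the call and its arguments vanish
    #endif

    int main(void) {
        //built with -DDISABLE_ALL_LOG, this line compiles away entirely
        LOG_INFO("add: n=%d\n", 42);
        return 0;
    }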
@@ -6365,7 +6373,27 @@ struct ggml_backend_hexagon_reg_context {
 
 static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) {
     GGML_UNUSED(reg);
-    return "ggml-hexagon";
+    //return "ggml-hexagon";
+
+    //return the accurate backend name rather than the generic "ggml-hexagon" to
+    //make comparing NPU performance through llama-bench clearer
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend);
+        return "Hexagon-cDSP";
+    }
+
+    if (HWACCEL_QNN == g_hexagon_appcfg.hwaccel_approach) {
+        if (HEXAGON_BACKEND_QNNNPU == g_hexagon_appcfg.hexagon_backend)
+            return "QNN-NPU";
+
+        if (HEXAGON_BACKEND_QNNGPU == g_hexagon_appcfg.hexagon_backend)
+            return "QNN-GPU";
+
+        if (HEXAGON_BACKEND_QNNCPU == g_hexagon_appcfg.hexagon_backend)
+            return "QNN-CPU";
+    }
+
+    return "unknown";
 }
 
 static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) {

ggml/src/ggml-hexagon/kernels/add.c (+29 -24)
@@ -1,14 +1,21 @@
 #include "ggml-dsp.h"
 
-inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) {
+static inline void l2fetch(const void * p, uint32_t stride,
+                           uint32_t width, uint32_t height,
+                           uint32_t dir) {
+    uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height);
+    __asm__ __volatile__ (" l2fetch(%0,%1) " : :"r"(p),"r"(control));
+}
+
+static inline void ggmlhexagon_dsp_add_f32(const int n, float * GGML_RESTRICT z, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) {
     HVX_Vector * va;
     HVX_Vector * vb;
     HVX_Vector * vc;
     HVX_Vector qf32;
-    const int FLOATS_PER_VECTOR = 128 / sizeof(float);
-    const int block  = n / FLOATS_PER_VECTOR;
-    const int left   = n % FLOATS_PER_VECTOR;
-    const int blocks = block * FLOATS_PER_VECTOR;
+    const size_t FLOATS_PER_VECTOR = 128 / sizeof(float);
+    const size_t block  = n / FLOATS_PER_VECTOR;
+    const size_t left   = n % FLOATS_PER_VECTOR;
+    const size_t blocks = block * FLOATS_PER_VECTOR;
 
     if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) {
         GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y);
@@ -21,11 +28,13 @@ inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float
     va = (HVX_Vector *)x;
     vb = (HVX_Vector *)y;
     vc = (HVX_Vector *)z;
+    //unrolling would be better, but it needs more careful checks for the various cases, and I think the DSP also doesn't like branchy code
     for (size_t i = 0; i < block; ++i) {
+        l2fetch(va + VLEN, VLEN, VLEN, 1, 0);
+        l2fetch(vb + VLEN, VLEN, VLEN, 1, 0);
         //*vc++ = Q6_Vsf_vadd_VsfVsf(*va++, *vb++);
         qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++);
-        *vc = Q6_Vsf_equals_Vqf32(qf32);
-        vc++;
+        *vc++ = Q6_Vsf_equals_Vqf32(qf32);
     }
 
     if (left > 0) {
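The loop body adds in the qf32 intermediate format and converts back to IEEE single precision afterward. If the unrolling suggested in the comment were tried, one step might look like the sketch below (hypothetical code, not part of this commit; it assumes the same HVX intrinsics and headers as add.c, 128-byte alignment, and at least two full vectors remaining):

    //hypothetical unroll-by-two body for the main loop above
    static inline void add_two_hvx_vectors(HVX_Vector * GGML_RESTRICT vc,
                                           const HVX_Vector * GGML_RESTRICT va,
                                           const HVX_Vector * GGML_RESTRICT vb) {
        //add in qf32 (a wider intermediate float format), then convert back to sf
        HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(va[0], vb[0]);
        HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(va[1], vb[1]);
        vc[0] = Q6_Vsf_equals_Vqf32(q0);
        vc[1] = Q6_Vsf_equals_Vqf32(q1);
    }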
@@ -49,6 +58,17 @@ static void ggml_compute_forward_add_f32(
 
     GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
+    const int rank = ggml_n_dims(src0);
+    if (1 == rank) {
+        //element-wise addition of two vectors
+        const size_t len = src0->ne[0];
+        float * dst_ptr  = (float *) (dst->data);
+        float * src0_ptr = (float *) (src0->data);
+        float * src1_ptr = (float *) (src1->data);
+        ggmlhexagon_dsp_add_f32(len, dst_ptr, src0_ptr, src1_ptr);
+        return;
+    }
+
     const int ith = 0;
     const int nth = 1;
 
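A scalar reference for what this rank-1 fast path computes, useful for spot-checking the HVX kernel against the host (a sketch, not code from this commit):

    //reference implementation: z[i] = x[i] + y[i] for i in [0, n)
    static void add_f32_ref(const int n, float * z, const float * x, const float * y) {
        for (int i = 0; i < n; ++i) {
            z[i] = x[i] + y[i];
        }
    }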
@@ -115,24 +135,9 @@ static void ggml_compute_forward_add_f32(
     }
 
 //FIXME: why does this fail with test-backend-ops when the ion rpc mempool is disabled?
-int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst)
-{
+int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__);
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_f32(src0, src1, dst);
-                } else {
-                    GGML_ABORT("fatal error");
-                }
-                break;
-            }
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
+    ggml_compute_forward_add_f32(src0, src1, dst);
     GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__);
     return 0;
 }

ggml/src/ggml-hexagon/kernels/ggml-dsp.c (+9)
@@ -178,6 +178,15 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     return ggml_is_contiguous_0(tensor);
 }
 
+int ggml_n_dims(const struct ggml_tensor * tensor) {
+    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+        if (tensor->ne[i] > 1) {
+            return i + 1;
+        }
+    }
+    return 1;
+}
+
 void ggml_abort(const char * file, int line, const char * fmt, ...) {
     GGMLHEXAGON_LOG_DEBUG("enter ggml_abort");
     abort();
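ggml_n_dims returns one plus the index of the highest dimension whose extent exceeds 1, and never less than 1, so trailing ne[] entries equal to 1 are ignored. A standalone check of that logic (the stub struct is an assumption for the sketch, not the real ggml_tensor):

    #include <assert.h>

    #define MAX_DIMS 4                         //GGML_MAX_DIMS is 4 in ggml
    struct tensor_stub { long ne[MAX_DIMS]; }; //pared-down stand-in for ggml_tensor

    static int n_dims(const struct tensor_stub * t) {
        for (int i = MAX_DIMS - 1; i >= 1; --i) {
            if (t->ne[i] > 1) return i + 1;
        }
        return 1;
    }

    int main(void) {
        struct tensor_stub vec = {{8, 1, 1, 1}}; //plain vector            -> rank 1
        struct tensor_stub mat = {{8, 4, 1, 1}}; //matrix                  -> rank 2
        struct tensor_stub t3  = {{8, 1, 3, 1}}; //inner ne[1] == 1 is ok  -> rank 3
        assert(n_dims(&vec) == 1);
        assert(n_dims(&mat) == 2);
        assert(n_dims(&t3)  == 3);
        return 0;
    }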

ggml/src/ggml-hexagon/kernels/ggml-dsp.h (+17 -12)
@@ -31,6 +31,8 @@ extern "C" {
 
 #define ALIGN_128_BYTE 128
 
+#define VLEN 128
+
 #define GGML_UNUSED(x) (void)(x)
 
 #define UNUSED GGML_UNUSED
@@ -50,6 +52,8 @@ extern "C" {
 #define GGML_MEM_ALIGN 16
 #endif
 
+#define GGML_API extern
+
 #ifdef __cplusplus
 // restrict not standard in C++
 # if defined(__GNUC__)
@@ -142,21 +146,22 @@ enum ggml_type {
 
 typedef double ggml_float;
 
-int64_t ggml_time_ms(void);
-int64_t ggml_time_us(void);
+GGML_API int64_t ggml_time_ms(void);
+GGML_API int64_t ggml_time_us(void);
 
-size_t ggml_nbytes(const struct ggml_tensor * tensor);
-int64_t ggml_nrows(const struct ggml_tensor * tensor);
-bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-void ggml_abort(const char * file, int line, const char * fmt, ...);
-bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
-bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+GGML_API size_t ggml_nbytes(const struct ggml_tensor * tensor);
+GGML_API int64_t ggml_nrows(const struct ggml_tensor * tensor);
+GGML_API int ggml_n_dims(const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
-void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor);
-void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data);
-void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...);
+GGML_API void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor);
+GGML_API void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data);
+GGML_API void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...);
 
-int ggmlop_get_thread_counts(void);
+GGML_API int ggmlop_get_thread_counts(void);
 
 #ifdef __cplusplus
 }
