From 7f58793c5670c9607d53157c26bc9206ab534553 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Mar 2024 18:45:05 +0100 Subject: [PATCH 01/14] move BLAS to a separate backend --- CMakeLists.txt | 10 +- Makefile | 21 +++- ggml-alloc.c | 42 +++++--- ggml-backend-impl.h | 28 +++-- ggml-backend.c | 158 ++++++++++++++++++++------- ggml-backend.h | 6 +- ggml-blas.c | 257 ++++++++++++++++++++++++++++++++++++++++++++ ggml-blas.h | 22 ++++ ggml-cuda.cu | 44 ++++---- ggml-kompute.cpp | 13 +-- ggml-metal.m | 15 +-- ggml-rpc.cpp | 21 ++-- ggml-sycl.cpp | 28 ++--- ggml-vulkan.cpp | 26 ++--- ggml.c | 185 ++----------------------------- llama.cpp | 51 ++++++--- 16 files changed, 589 insertions(+), 338 deletions(-) create mode 100644 ggml-blas.c create mode 100644 ggml-blas.h diff --git a/CMakeLists.txt b/CMakeLists.txt index cf37d5bb242ac..2933e714842de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -311,9 +311,9 @@ if (LLAMA_BLAS) if (LLAMA_STATIC) set(BLA_STATIC ON) endif() - if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) - set(BLA_SIZEOF_INTEGER 8) - endif() + #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) + # set(BLA_SIZEOF_INTEGER 8) + #endif() set(BLA_VENDOR ${LLAMA_BLAS_VENDOR}) find_package(BLAS) @@ -380,6 +380,9 @@ if (LLAMA_BLAS) add_compile_definitions(GGML_BLAS_USE_MKL) endif() + set(GGML_HEADERS_BLAS ggml-blas.h) + set(GGML_SOURCES_BLAS ggml-blas.c) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) else() @@ -1255,6 +1258,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} ) diff --git a/Makefile b/Makefile index 802ee6a47654c..d45b2759bed2b 100644 --- a/Makefile +++ b/Makefile @@ -408,6 +408,7 @@ ifndef LLAMA_NO_ACCELERATE MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_LDFLAGS += -framework Accelerate + OBJS += ggml-blas.o endif endif # LLAMA_NO_ACCELERATE @@ -421,23 +422,35 @@ ifdef LLAMA_OPENBLAS MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas) + OBJS += ggml-blas.o endif # LLAMA_OPENBLAS -ifndef LLAMA_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJS += sgemm.o -endif +ifdef LLAMA_OPENBLAS64 + MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas64) + MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) + MK_LDFLAGS += $(shell pkg-config --libs openblas64) + OBJS += ggml-blas.o +endif # LLAMA_OPENBLAS64 ifdef LLAMA_BLIS MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis MK_LDFLAGS += -lblis -L/usr/local/lib + OBJS += ggml-blas.o endif # LLAMA_BLIS +ifndef LLAMA_NO_LLAMAFILE + MK_CPPFLAGS += -DGGML_USE_LLAMAFILE + OBJS += sgemm.o +endif + ifdef LLAMA_RPC MK_CPPFLAGS += -DGGML_USE_RPC OBJS += ggml-rpc.o endif # LLAMA_RPC +ggml-blas.o: ggml-blas.c ggml-blas.h + $(CC) $(CFLAGS) -c $< -o $@ + ifdef LLAMA_CUBLAS # LLAMA_CUBLAS is deprecated and will be removed in the future LLAMA_CUDA := 1 diff --git a/ggml-alloc.c b/ggml-alloc.c index 73a3c15756ba1..893884dbe5700 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -339,6 +339,7 @@ struct hash_node { }; struct tensor_alloc { + int buffer_id; size_t offset; size_t size_max; // 0 = pre-allocated, unused, or 
view }; @@ -349,7 +350,6 @@ struct leaf_alloc { }; struct node_alloc { - int buffer_id; struct tensor_alloc dst; struct tensor_alloc src[GGML_MAX_SRC]; }; @@ -511,17 +511,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor } } -static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { +static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { // graph outputs are never freed if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { AT_PRINTF("not freeing output %s\n", node->name); return; } - struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; - ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); size_t offset = hn->offset; + int buffer_id = hn->buffer_id; + struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; size_t size = ggml_backend_buft_get_alloc_size(buft, node); ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); hn->allocated = false; @@ -626,11 +627,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { - ggml_gallocr_free_node(galloc, view_src, buffer_id); + ggml_gallocr_free_node(galloc, view_src); } } else if (p_hn->allocated) { - ggml_gallocr_free_node(galloc, parent, buffer_id); + ggml_gallocr_free_node(galloc, parent); } } AT_PRINTF("\n"); @@ -674,22 +675,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); + //node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); if (node->view_src || node->data) { + node_alloc->dst.buffer_id = -1; node_alloc->dst.offset = SIZE_MAX; node_alloc->dst.size_max = 0; } else { struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); - node_alloc->dst.offset = hn->offset; - node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); + node_alloc->dst.buffer_id = hn->buffer_id; + node_alloc->dst.offset = hn->offset; + node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (!src || src->view_src || src->data) { + node_alloc->src[j].buffer_id = -1; node_alloc->src[j].offset = SIZE_MAX; node_alloc->src[j].size_max = 0; } else { struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); + node_alloc->src[j].buffer_id = hn->buffer_id; node_alloc->src[j].offset = hn->offset; node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); } @@ -706,9 +711,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); galloc->leaf_allocs[i].buffer_id = hn->buffer_id; if (leaf->view_src || leaf->data) { + galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; galloc->leaf_allocs[i].leaf.size_max = 0; } else { + galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id; galloc->leaf_allocs[i].leaf.offset 
= hn->offset; galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); } @@ -740,7 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); } -static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) { +static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { + int buffer_id = tensor_alloc->buffer_id; assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); if (tensor->view_src != NULL) { @@ -768,8 +776,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * } } -static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) { - ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id]; +static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { + ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL; size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node); return talloc->size_max >= node_size; } @@ -793,7 +801,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) { + if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) { #ifndef NDEBUG fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); #endif @@ -805,7 +813,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph if (src == NULL) { continue; } - if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) { + if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) { #ifndef NDEBUG fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); #endif @@ -846,7 +854,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i]; - ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf); + ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf); } // nodes for (int i = 0; i < graph->n_nodes; i++) { @@ -857,9 +865,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) if (src == NULL) { continue; } - ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]); + ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]); } - ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst); + ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst); } return true; diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index f121e1de420fa..36ca370867c9e 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -17,13 +17,15 @@ extern "C" { struct ggml_backend_buffer_type_i { const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); + // allocate a buffer of this type 
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); - size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size - size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding - bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + // tensor alignment + size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); + // max buffer size that can be allocated + size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); + // data size needed to allocate the tensor, including padding + size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // check if tensor data is in host memory - // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft); }; @@ -92,27 +94,37 @@ extern "C" { void (*GGML_CALL synchronize)(ggml_backend_t backend); // compute graph with a plan (not used currently) + // create a new plan for a graph ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology + void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); + // compute the graph with the plan + enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph with a plan - enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - // check if the backend supports an operation + // check if the backend can compute an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // check if the backend can use tensors allocated in a buffer type + bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft); + // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer // these should be expensive operations with large batch sizes that may benefit from running on this backend // even if the weight has to be copied from the CPU temporarily bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); // (optional) event synchronization + // create a new event that can record events on this backend instance ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); void (*GGML_CALL event_free) (ggml_backend_event_t event); + // record an event on the backend instance that created it void (*GGML_CALL event_record) (ggml_backend_event_t event); + // wait for an event on a different backend instance void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event); + // block until an event is recorded void (*GGML_CALL event_synchronize) (ggml_backend_event_t event); }; diff --git
a/ggml-backend.c b/ggml-backend.c index 05737ed696954..2cc9e09a8385f 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf return ggml_nbytes(tensor); } -bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return buft->iface.supports_backend(buft, backend); -} - bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { if (buft->iface.is_host) { return buft->iface.is_host(buft); @@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * return backend->iface.supports_op(backend, op); } +bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return backend->iface.supports_buft(backend, buft); +} + bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { if (backend->iface.offload_op != NULL) { return backend->iface.offload_op(backend, op); @@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_ GGML_UNUSED(buft); } -GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cpu(backend); - - GGML_UNUSED(buft); -} - GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; @@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, @@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, @@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(backend); +} + static struct ggml_backend_i cpu_backend_i = { /* .get_name = */ ggml_backend_cpu_name, /* .free = */ ggml_backend_cpu_free, @@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = { /* .synchronize = */ NULL, /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .supports_op = */ ggml_backend_cpu_supports_op, + /* .supports_buft = */ ggml_backend_cpu_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, @@ -1055,6 +1055,9 @@ struct ggml_backend_sched { int * node_backend_ids; // [graph_size] int * leaf_backend_ids; // [graph_size] + int * prev_node_backend_ids; // [graph_size] + int * prev_leaf_backend_ids; // [graph_size] + // copy of the graph with modified inputs struct 
ggml_cgraph * graph; @@ -1097,15 +1100,16 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen return -1; } -static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) { +static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) { ggml_backend_buffer_t buffer = tensor->buffer; if (buffer == NULL) { return -1; } - // find highest prio backend that supports the buffer type + // find highest prio backend that supports the buffer type and the op for (int i = 0; i < sched->n_backends; i++) { - if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { + if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) && + ggml_backend_supports_op(sched->backends[i], op)) { return i; } } @@ -1126,12 +1130,17 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED #define GET_CAUSE(node) "" #endif +//#define DEBUG_PASS1 +//#define DEBUG_PASS2 +//#define DEBUG_PASS3 +//#define DEBUG_PASS4 + // returns the backend that should be used for the node based on the current locations static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) { // TODO: use supports_op to check if the backend supports the op // assign pre-allocated nodes to their backend - int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor); + int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor); if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.dst"); return cur_backend_id; @@ -1139,7 +1148,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st // view_src if (tensor->view_src != NULL) { - cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); + cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor); if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.vsrc"); return cur_backend_id; @@ -1161,7 +1170,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st continue; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src); + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op if (src_backend_id == sched->n_backends - 1) { for (int b = 0; b < src_backend_id; b++) { @@ -1223,10 +1232,43 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } } -//#define DEBUG_PASS1 -//#define DEBUG_PASS2 -//#define DEBUG_PASS3 -//#define DEBUG_PASS4 +static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { + if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { + *node_backend_id = cur_backend_id; + SET_CAUSE(node, "2.2"); + } else { + for (int b = 0; b < sched->n_backends; b++) { + if (b == cur_backend_id) { + continue; + } + if (ggml_backend_supports_op(sched->backends[b], node)) { + *node_backend_id = b; + cur_backend_id = b; + SET_CAUSE(node, "2.2"); + break; + } + } + } + return cur_backend_id; +} + +static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) { + ggml_backend_buffer_t buf = t->view_src ? 
t->view_src->buffer : t->buffer; + ggml_backend_buffer_type_t buft = NULL; + + if (buf) { + // the tensor is already allocated + buft = buf->buft; + } else { + // see if the tensor already has a backend assigned, and use the buffer type of that backend + int tensor_backend_id = tensor_backend_id(t); + if (tensor_backend_id != -1) { + buft = sched->bufts[tensor_backend_id]; + } + } + + return buft != NULL && ggml_backend_supports_buft(sched->backends[cur_backend_id], buft); +} // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { @@ -1306,9 +1348,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } else { cur_backend_id = *node_backend_id; } - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.2"); + } else if (cur_backend_id != -1) { + // FIXME: clean this + cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + if (cur_backend_id == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_backend_id = -1; + } } } } @@ -1328,9 +1374,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } else { cur_backend_id = *node_backend_id; } - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.1"); + } else if (cur_backend_id != -1) { + cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + if (cur_backend_id == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_backend_id = -1; + } } } } @@ -1345,9 +1394,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.4"); + } else if (cur_backend_id != -1) { + cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); } } } @@ -1362,9 +1410,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.3"); + } else if (cur_backend_id != -1) { + cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); } } } @@ -1448,10 +1495,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } // check if the split has too many inputs + // FIXME: count the number of inputs instead of only checking when full if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); int src_backend_id = sched->tensor_backend_id[id]; - if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) { + bool supported = buffer_supported(sched, src, cur_backend_id); + if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) { //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; break; @@ -1486,7 +1535,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const int src_backend_id = tensor_backend_id(src); assert(src_backend_id != -1); // all inputs should be assigned by now - if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { + if 
(src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { size_t id = hash_id(src); if (sched->tensor_copies[id][src_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; @@ -1511,7 +1560,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (src_backend_id != node_backend_id) { + bool supported = buffer_supported(sched, src, cur_backend_id); + if (src_backend_id != cur_backend_id && !supported) { // create a copy of the input in the split's backend const size_t id = hash_id(src); if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { @@ -1543,6 +1593,18 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // create copies of the graph for each split // TODO: avoid this copy + + // swap node_backend_ids and leaf_backend_ids and prevs + { + int * tmp = sched->node_backend_ids; + sched->node_backend_ids = sched->prev_node_backend_ids; + sched->prev_node_backend_ids = tmp; + + tmp = sched->leaf_backend_ids; + sched->leaf_backend_ids = sched->prev_leaf_backend_ids; + sched->prev_leaf_backend_ids = tmp; + } + struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; @@ -1613,8 +1675,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { + bool backend_ids_changed = false; + for (int i = 0; i < sched->graph->n_nodes; i++) { + if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) { + backend_ids_changed = true; + break; + } + } + if (!backend_ids_changed) { + for (int i = 0; i < sched->graph->n_leafs; i++) { + if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) { + backend_ids_changed = true; + break; + } + } + } + // allocate graph - if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG @@ -1735,6 +1813,8 @@ ggml_backend_sched_t ggml_backend_sched_new( const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); + sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); + sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); sched->n_backends = n_backends; @@ -1747,7 +1827,7 @@ ggml_backend_sched_t ggml_backend_sched_new( for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; sched->bufts[b] = bufts ? 
bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); - GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b])); + GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b])); if (sched->n_copies > 1) { for (int c = 0; c < sched->n_copies; c++) { sched->events[b][c] = ggml_backend_event_new(backends[b]); diff --git a/ggml-backend.h b/ggml-backend.h index c582b06850ed1..47fd814751795 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -23,7 +23,6 @@ extern "C" { GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer @@ -74,6 +73,7 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); + GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends @@ -90,7 +90,7 @@ extern "C" { GGML_API void ggml_backend_event_free (ggml_backend_event_t event); GGML_API void ggml_backend_event_record (ggml_backend_event_t event); GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); - GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event + GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // // CPU backend @@ -119,7 +119,7 @@ extern "C" { GGML_API size_t ggml_backend_reg_get_count(void); GGML_API size_t ggml_backend_reg_find_by_name(const char * name); - GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params] + GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional) GGML_API const char * ggml_backend_reg_get_name(size_t i); GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i); diff --git a/ggml-blas.c b/ggml-blas.c new file mode 100644 index 0000000000000..dab6fcf47a8ee --- /dev/null +++ b/ggml-blas.c @@ -0,0 +1,257 @@ +#include "ggml-blas.h" +#include "ggml-backend-impl.h" + +#include <stdlib.h> + +#if defined(GGML_USE_ACCELERATE) +# include <Accelerate/Accelerate.h> +#elif defined(GGML_USE_OPENBLAS) +# if defined(GGML_BLAS_USE_MKL) +# include <mkl.h> +# else +# include <cblas.h> +# endif +#endif + +struct ggml_backend_blas_context { + int n_threads; + void * work_data; + size_t work_size; +}; + +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_compute_forward_mul_mat_use_blas(const struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + //const int64_t ne00 = src0->ne[0]; + //const int64_t ne01 =
src0->ne[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + ((src0->type == GGML_TYPE_F32) || (ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { + + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; + } + + return false; +} + +static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const enum ggml_type type = src0->type; + + ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + const int64_t ne_plane = ne01*ne00; + const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne13*ne12*ne_plane*sizeof(float); + + if (ctx->work_size < desired_wsize) { + free(ctx->work_data); + ctx->work_data = malloc(desired_wsize); + GGML_ASSERT(ctx->work_data != NULL); + ctx->work_size = desired_wsize; + } + void * wdata = ctx->work_data; + + // convert src0 to float + if (true) { + if (type != GGML_TYPE_F32) { + ggml_to_float_t const to_float = type_traits.to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for num_threads(ctx->n_threads) +#endif + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + } + } + } + } + + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + x = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; + } + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne1, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } +} + +// backend interface + +GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { + return "BLAS"; + + GGML_UNUSED(backend); +} + +GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { + free(backend); +} + +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(backend); +} + +GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context 
*)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_backend_blas_mul_mat(ctx, node); + break; + + // TODO + //case GGML_OP_OUT_PROD: + + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + + default: + fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + GGML_ASSERT(false); + } + } + + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return op->op == GGML_OP_MUL_MAT && ggml_compute_forward_mul_mat_use_blas(op); + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(backend); +} + +static struct ggml_backend_i blas_backend_i = { + /* .get_name = */ ggml_backend_blas_name, + /* .free = */ ggml_backend_blas_free, + /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_blas_graph_compute, + /* .supports_op = */ ggml_backend_blas_supports_op, + /* .supports_buft = */ ggml_backend_blas_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static ggml_guid_t ggml_backend_blas_guid(void) { + static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + return &guid; +} + +ggml_backend_t ggml_backend_blas_init(void) { + ggml_backend_t backend = malloc(sizeof(struct ggml_backend)); + if (backend == NULL) { + return NULL; + } + struct ggml_backend_blas_context * ctx = malloc(sizeof(struct ggml_backend_blas_context)); + if (ctx == NULL) { + return NULL; + } + + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; + + *backend = (struct ggml_backend) { + /* .guid = */ ggml_backend_blas_guid(), + /* .interface = */ blas_backend_i, + /* .context = */ ctx, + }; + + return backend; +} + +GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); +} + +void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { + GGML_ASSERT(ggml_backend_is_blas(backend_blas)); + + struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend_blas->context; + ctx->n_threads = n_threads; +} diff --git a/ggml-blas.h b/ggml-blas.h new file mode 100644 index 0000000000000..646ca84ef8122 --- /dev/null +++ b/ggml-blas.h @@ -0,0 +1,22 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); + +GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); + +// number of threads used for conversion to float +GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); + + 
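A minimal usage sketch (illustrative only, not part of this patch): how a caller might drive the new backend through the API declared above. It assumes a cgraph built elsewhere with its tensors allocated in a host buffer, since the backend uses the CPU buffer type, and omits most error handling. In practice the backend is intended to be paired with the CPU backend through ggml_backend_sched, because it only implements GGML_OP_MUL_MAT.

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-blas.h"

// hypothetical helper: compute a prebuilt graph on the BLAS backend
static enum ggml_status compute_with_blas(struct ggml_cgraph * graph, int n_threads) {
    ggml_backend_t backend = ggml_backend_blas_init();  // NULL on allocation failure
    if (backend == NULL) {
        return GGML_STATUS_FAILED;
    }
    // threads used to convert (dequantize) src0 to float before the sgemm calls
    ggml_backend_blas_set_n_threads(backend, n_threads);
    enum ggml_status status = ggml_backend_graph_compute(backend, graph);
    ggml_backend_free(backend);
    return status;
}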
+#ifdef __cplusplus +} +#endif diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c81c6a0d783be..c2c9940bff05e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -543,6 +543,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu return ctx->name.c_str(); } +static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cuda_buffer_type_name; +} + GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; @@ -585,24 +589,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen GGML_UNUSED(buft); } -GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_cuda(backend)) { - return false; - } - - ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - return buft_ctx->device == cuda_ctx->device; -} - static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -929,6 +921,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back GGML_UNUSED(buft); } +static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name; +} + GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor @@ -972,12 +968,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cuda(backend); - - GGML_UNUSED(buft); -} - GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -990,7 +980,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; @@ -1090,7 +1079,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ 
ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -2919,6 +2907,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (ggml_backend_buft_is_cuda_split(buft)) { + return true; + } + + if (ggml_backend_buft_is_cuda(buft)) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + return buft_ctx->device == cuda_ctx->device; + } + + return false; +} + GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { const int min_batch_size = 32; @@ -2991,9 +2993,11 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .synchronize = */ ggml_backend_cuda_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, + /* .supports_buft = */ ggml_backend_cuda_supports_buft, /* .offload_op = */ ggml_backend_cuda_offload_op, /* .event_new = */ ggml_backend_cuda_event_new, /* .event_free = */ ggml_backend_cuda_event_free, diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index eabd70d5eeed8..721080031157f 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1899,18 +1899,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_ return ctx->max_alloc; } -static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - GGML_UNUSED(buft); - return ggml_backend_is_kompute(backend); -} - static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = { /* .get_name = */ ggml_backend_kompute_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -1970,6 +1964,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc return ggml_vk_supports_op(op); } +static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(backend); + return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name; +} + static struct ggml_backend_i kompute_backend_i = { /* .get_name = */ ggml_backend_kompute_name, /* .free = */ ggml_backend_kompute_free, @@ -1980,9 +1979,11 @@ static struct ggml_backend_i kompute_backend_i = { /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_kompute_graph_compute, /* .supports_op = */ ggml_backend_kompute_supports_op, + /* .supports_buft = */ ggml_backend_kompute_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-metal.m b/ggml-metal.m index fddc44f78d8af..154c8052a44fd 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -3040,12 
+3040,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend UNUSED(buft); } -GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); - - UNUSED(buft); -} - GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; @@ -3060,7 +3054,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend, /* .is_host = */ ggml_backend_metal_buffer_type_is_host, }, /* .context = */ NULL, @@ -3175,6 +3168,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con return ggml_metal_supports_op(metal_ctx, op); } +GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name; + + UNUSED(backend); +} + static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, @@ -3185,9 +3184,11 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, + /* .supports_buft = */ ggml_backend_metal_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index 679ce4f280c5f..9b95193d3229d 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend return ggml_nbytes(tensor); } -GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_rpc(backend)) { - return false; - } - ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; - ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; - return buft_ctx->endpoint == rpc_ctx->endpoint; -} - static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = { /* .get_name = */ ggml_backend_rpc_buffer_type_name, /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_rpc_get_max_size, /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -638,6 +628,15 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const return false; } +GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) { + return false; + } + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + ggml_backend_rpc_context * rpc_ctx =
(ggml_backend_rpc_context *)backend->context; + return buft_ctx->endpoint == rpc_ctx->endpoint; +} + static ggml_backend_i ggml_backend_rpc_interface = { /* .get_name = */ ggml_backend_rpc_name, /* .free = */ ggml_backend_rpc_free, @@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .synchronize = */ ggml_backend_rpc_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_rpc_graph_compute, /* .supports_op = */ ggml_backend_rpc_supports_op, + /* .supports_buft = */ ggml_backend_rpc_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 5cd97e4ff98df..332f9991e1020 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -16631,22 +16631,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen UNUSED(buft); } -GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_sycl(backend)) { - return false; - } - ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; - ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - return buft_ctx->device == sycl_ctx->device; -} - static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { /* .get_name = */ ggml_backend_sycl_buffer_type_name, /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend, /* .is_host = */ nullptr, }; @@ -16998,12 +16988,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_sycl(backend); - - UNUSED(buft); -} - GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -17016,7 +17000,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend, /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host, }; @@ -17102,7 +17085,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -17367,6 +17349,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if 
(buft->iface.get_name != ggml_backend_sycl_buffer_type_name) { + return false; + } + ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; + return buft_ctx->device == sycl_ctx->device; +} static ggml_backend_i ggml_backend_sycl_interface = { /* .get_name = */ ggml_backend_sycl_name, @@ -17378,9 +17368,11 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .synchronize = */ ggml_backend_sycl_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, + /* .supports_buft = */ ggml_backend_sycl_supports_buft, /* .offload_op = */ ggml_backend_sycl_offload_op, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5e12ea9dde4d7..a07c646b90129 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -6110,24 +6110,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_ UNUSED(buft); } -GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_vk(backend)) { - return false; - } - - ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - return buft_ctx->ctx->idx == ctx->idx; -} - static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .get_name = */ ggml_backend_vk_buffer_type_name, /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -6203,7 +6191,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -6524,6 +6511,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g UNUSED(backend); } +GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { + return false; + } + + ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + return buft_ctx->ctx->idx == ctx->idx; +} + // TODO: enable async and synchronize static ggml_backend_i ggml_backend_vk_interface = { /* .get_name = */ ggml_backend_vk_name, @@ -6535,9 +6533,11 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .synchronize = */ NULL, // ggml_backend_vk_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* 
.graph_compute = */ ggml_backend_vk_graph_compute, /* .supports_op = */ ggml_backend_vk_supports_op, + /* .supports_buft = */ ggml_backend_vk_supports_buft, /* .offload_op = */ ggml_backend_vk_offload_op, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml.c b/ggml.c index 11e5c34ab56ad..0724b3b49a03e 100644 --- a/ggml.c +++ b/ggml.c @@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #if defined(GGML_USE_ACCELERATE) #include <Accelerate/Accelerate.h> -#elif defined(GGML_USE_OPENBLAS) -#if defined(GGML_BLAS_USE_MKL) -#include <mkl.h> -#else -#include <cblas.h> -#endif #endif // floating point type used to accumulate sums @@ -12216,39 +12210,6 @@ static void ggml_compute_forward_group_norm( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float) - // all the experts for each batch element and the processing would become incredibly slow - // TODO: find the optimal values for these - if (dst->op != GGML_OP_MUL_MAT_ID && - ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - //src0->type == GGML_TYPE_F32 && - src1->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} -#endif - static void ggml_compute_forward_mul_mat_one_chunk( const struct ggml_compute_params * params, struct ggml_tensor * dst, @@ -12386,73 +12347,6 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(dst)) { - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float); - UNUSED(desired_wsize); - - if (params->type == GGML_TASK_TYPE_INIT) { - if (type != GGML_TYPE_F32) { - assert(params->wsize >= desired_wsize); - // parallelize by src0 rows - for (int64_t i13 = 0; i13 < ne13; i13++) { - for (int64_t i12 = 0; i12 < ne12; i12++) { - // broadcast src0 into src1 across 2nd,3rd dimension - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; - - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; - ggml_to_float_t const to_float = type_traits[type].to_float; - - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00); - } - } - } - } - return; - } - - if (params->type == GGML_TASK_TYPE_FINALIZE) { - return; - } - - // perform sgemm, parallelization controlled by blas lib - if (ith != 0) { - return; - } - - //const int64_t tgemm0 = ggml_perf_time_us(); - for (int64_t i13 = 0; i13 < ne13; i13++) { - for (int64_t i12 = 0; i12 < ne12; i12++) { - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; - - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - const float * y = (float *) ((char *) src1->data + i12*nb12 +
i13*nb13); - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - if (type != GGML_TYPE_F32) { - x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; - } - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne1, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2); - - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif - #if GGML_USE_LLAMAFILE const bool src1_cont = ggml_is_contiguous(src1); @@ -12833,19 +12727,7 @@ static void ggml_compute_forward_out_prod_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - bool use_blas = ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1)); -#endif - if (params->type == GGML_TASK_TYPE_INIT) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst - if (use_blas) { - return; - } -#endif if (ith != 0) { return; } @@ -12857,50 +12739,6 @@ static void ggml_compute_forward_out_prod_f32( return; } -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (use_blas) { - if (params->ith != 0) { // All threads other than the first do no work. - return; - } - // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) - // src0: (k,n) - // src1: (k,m) - // dst: (m,n) - // - // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) - // Also expressed as (major,minor) - // a: (m,k): so src1 transposed - // b: (k,n): so src0 - // c: (m,n) - // - // However, if ggml_is_transposed(src1) is true, then - // src1->data already contains a transposed version, so sgemm mustn't - // transpose it further. 
- - int n = src0->ne[0]; - int k = src0->ne[1]; - int m = src1->ne[0]; - - int transposeA, lda; - - if (!ggml_is_transposed(src1)) { - transposeA = CblasTrans; - lda = m; - } else { - transposeA = CblasNoTrans; - lda = k; - } - - float * a = (float *) ((char *) src1->data); - float * b = (float *) ((char *) src0->data); - float * c = (float *) ((char *) dst->data); - - cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); - - return; - } -#endif - // dst[:,:,:,:] = 0 // for i2,i3: // for i1: @@ -13030,8 +12868,6 @@ static void ggml_compute_forward_out_prod_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; @@ -13428,6 +13264,8 @@ static void ggml_compute_forward_get_rows_q( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + dequantize_row_q( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); @@ -13471,6 +13309,8 @@ static void ggml_compute_forward_get_rows_f16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + ggml_fp16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); @@ -13514,7 +13354,9 @@ static void ggml_compute_forward_get_rows_bf16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - ggml_bf16_to_fp32_row( + assert(i01 >= 0 && i01 < ne01); + + ggml_bf16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } @@ -13557,6 +13399,8 @@ static void ggml_compute_forward_get_rows_f32( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + ggml_vec_cpy_f32(nc, (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); @@ -19504,17 +19348,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa { const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node)) { - if (node->src[0]->type != GGML_TYPE_F32) { - // here we need memory for fully dequantized matrix from src0 - // take into account that src0 can be broadcasted into src1[2,3] - cur = ggml_type_size(GGML_TYPE_F32) - * node->src[0]->ne[0]*node->src[0]->ne[1] - * node->src[1]->ne[2]*node->src[1]->ne[3]; - } - } else -#endif if (node->src[1]->type != vec_dot_type) { cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); } diff --git a/llama.cpp b/llama.cpp index 06889126ecdc4..8208786beabec 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,6 +21,10 @@ # include "ggml-kompute.h" #endif +#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) +# include "ggml-blas.h" +#endif + #ifdef GGML_USE_METAL # include "ggml-metal.h" #endif @@ -2298,9 +2302,13 @@ struct 
llama_context { std::vector<ggml_backend_t> backends; #ifdef GGML_USE_METAL ggml_backend_t backend_metal = nullptr; +#endif +#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) + ggml_backend_t backend_blas = nullptr; #endif ggml_backend_t backend_cpu = nullptr; + const llama_model & model; // key + value cache for the self attention @@ -11516,17 +11524,17 @@ static struct ggml_cgraph * llama_build_graph( // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; - if (batch.n_tokens < 32 || full_offload) { - if (il != -1 && strcmp(name, "norm") == 0) { - for (auto * backend : lctx.backends) { - if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { - ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); - break; - } - } - } - } + //const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; + //if (batch.n_tokens < 32 || full_offload) { + // if (il != -1 && strcmp(name, "norm") == 0) { + // for (auto * backend : lctx.backends) { + // if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { + // ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); + // break; + // } + // } + // } + //} }; struct ggml_cgraph * result = NULL; @@ -12017,6 +12025,11 @@ static void llama_graph_compute( ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } +#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) + if (lctx.backend_blas != nullptr) { + ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads); + } +#endif ggml_backend_sched_graph_compute_async(lctx.sched, gf); @@ -12246,9 +12259,9 @@ static int llama_decode_internal( // with the BLAS calls. need a better solution // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. 
- if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { - n_threads = std::min(4, n_threads); - } + //if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + // n_threads = std::min(4, n_threads); + //} ggml_backend_sched_alloc_graph(lctx.sched, gf); @@ -16226,6 +16239,16 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(backend); } #endif + +#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) + ctx->backend_blas = ggml_backend_blas_init(); + if (ctx->backend_blas == nullptr) { + LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__); + } else { + ctx->backends.push_back(ctx->backend_blas); + } +#endif + #if defined(GGML_USE_RPC) if (model->n_gpu_layers > 0) { for (const auto & endpoint : model->rpc_servers) { From b88957e519c3bdd5cf231522d41163b72e5d5b12 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 00:35:55 +0200 Subject: [PATCH 02/14] rename GGML_USE_OPENBLAS to GGML_USE_BLAS --- CMakeLists.txt | 2 +- Makefile | 6 +++--- ggml-blas.c | 5 +---- ggml.c | 2 +- llama.cpp | 8 ++++---- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2933e714842de..e4eaed0706eea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -374,7 +374,7 @@ if (LLAMA_BLAS) add_compile_options(${BLAS_LINKER_FLAGS}) - add_compile_definitions(GGML_USE_OPENBLAS) + add_compile_definitions(GGML_USE_BLAS) if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) add_compile_definitions(GGML_BLAS_USE_MKL) diff --git a/Makefile b/Makefile index d45b2759bed2b..59dd8533666d3 100644 --- a/Makefile +++ b/Makefile @@ -419,21 +419,21 @@ ifndef LLAMA_NO_OPENMP endif # LLAMA_NO_OPENMP ifdef LLAMA_OPENBLAS - MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas) OBJS += ggml-blas.o endif # LLAMA_OPENBLAS ifdef LLAMA_OPENBLAS64 - MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas64) + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) MK_LDFLAGS += $(shell pkg-config --libs openblas64) OBJS += ggml-blas.o endif # LLAMA_OPENBLAS64 ifdef LLAMA_BLIS - MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis + MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis MK_LDFLAGS += -lblis -L/usr/local/lib OBJS += ggml-blas.o endif # LLAMA_BLIS diff --git a/ggml-blas.c b/ggml-blas.c index dab6fcf47a8ee..6d527c041b441 100644 --- a/ggml-blas.c +++ b/ggml-blas.c @@ -5,7 +5,7 @@ #if defined(GGML_USE_ACCELERATE) # include <Accelerate/Accelerate.h> -#elif defined(GGML_USE_OPENBLAS) +#elif defined(GGML_USE_BLAS) # if defined(GGML_BLAS_USE_MKL) # include <mkl.h> # else # include <cblas.h> # endif @@ -25,9 +25,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(const struct ggml_tensor * dst const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; diff --git a/ggml.c b/ggml.c index 0724b3b49a03e..e4ef34f2565fb 100644 --- a/ggml.c +++ b/ggml.c @@ -22645,7 +22645,7 @@ int 
ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) return 1; #else return 0; diff --git a/llama.cpp b/llama.cpp index 8208786beabec..57d007f3313a3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,7 +21,7 @@ # include "ggml-kompute.h" #endif -#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) # include "ggml-blas.h" #endif @@ -2303,7 +2303,7 @@ struct llama_context { #ifdef GGML_USE_METAL ggml_backend_t backend_metal = nullptr; #endif -#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) ggml_backend_t backend_blas = nullptr; #endif ggml_backend_t backend_cpu = nullptr; @@ -12025,7 +12025,7 @@ static void llama_graph_compute( ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } -#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) if (lctx.backend_blas != nullptr) { ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads); } @@ -16240,7 +16240,7 @@ struct llama_context * llama_new_context_with_model( } #endif -#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) ctx->backend_blas = ggml_backend_blas_init(); if (ctx->backend_blas == nullptr) { LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__); From 77f88e350e10a7c0b5ded8ffeac7ae9504ad545e Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 01:40:43 +0200 Subject: [PATCH 03/14] add support for out_prod --- CMakeLists.txt | 2 +- ggml-blas.c | 124 ++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 98 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e4eaed0706eea..6e5baa6a460a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,8 +92,8 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) -option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") +option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) option(LLAMA_CUDA "llama: use CUDA" OFF) option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) diff --git a/ggml-blas.c b/ggml-blas.c index 6d527c041b441..f826e7ab85c12 100644 --- a/ggml-blas.c +++ b/ggml-blas.c @@ -5,12 +5,10 @@ #if defined(GGML_USE_ACCELERATE) # include <Accelerate/Accelerate.h> -#elif defined(GGML_USE_BLAS) -# if defined(GGML_BLAS_USE_MKL) -# include <mkl.h> -# else -# include <cblas.h> -# endif +#elif defined(GGML_BLAS_USE_MKL) +# include <mkl.h> +#else +# include <cblas.h> #endif struct ggml_backend_blas_context { @@ -21,7 +19,7 @@ struct ggml_backend_blas_context { // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas(const struct ggml_tensor * dst) { +static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { const 
struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -72,11 +70,8 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st const int64_t r2 = ne12/ne02; const int64_t r3 = ne13/ne03; - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne13*ne12*ne_plane*sizeof(float); + const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); if (ctx->work_size < desired_wsize) { free(ctx->work_data); @@ -87,21 +82,19 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st void * wdata = ctx->work_data; // convert src0 to float - if (true) { - if (type != GGML_TYPE_F32) { - ggml_to_float_t const to_float = type_traits.to_float; + if (type != GGML_TYPE_F32) { + ggml_to_float_t const to_float = type_traits.to_float; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; #ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(ctx->n_threads) #endif - for (int64_t i01 = 0; i01 < ne01; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } } } @@ -129,6 +122,70 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st } } +static void ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne03 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // c: (m,n) + // + // However, if ggml_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. 
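+    // e.g. (hypothetical shapes, added here only for illustration):
+    //   src0 = (k=8, n=4) and src1 = (k=8, m=2) give dst = (m=2, n=4),
+    //   so with src1 not transposed the call below becomes
+    //   cblas_sgemm(RowMajor, CblasTrans, CblasNoTrans, 2, 4, 8,
+    //               1.0, a, /*lda=m*/ 2, b, /*ldb=n*/ 4, 0.0, c, /*ldc=n*/ 4)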
+ + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + int transposeA; + int lda; + + if (!ggml_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + GGML_UNUSED(ctx); +} + // backend interface GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { @@ -138,6 +195,9 @@ GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { } GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { + struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context; + free(ctx->work_data); + free(ctx); free(backend); } @@ -158,8 +218,9 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t ggml_backend_blas_mul_mat(ctx, node); break; - // TODO - //case GGML_OP_OUT_PROD: + case GGML_OP_OUT_PROD: + ggml_backend_blas_out_prod(ctx, node); + break; case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -180,7 +241,16 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t } GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - return op->op == GGML_OP_MUL_MAT && ggml_compute_forward_mul_mat_use_blas(op); + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) || + (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); GGML_UNUSED(backend); } @@ -229,9 +299,9 @@ ggml_backend_t ggml_backend_blas_init(void) { return NULL; } - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->work_data = NULL; - ctx->work_size = 0; + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; *backend = (struct ggml_backend) { /* .guid = */ ggml_backend_blas_guid(), From 845fa20f26fed397c53a3d2eb09d610650ca8657 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 02:16:13 +0200 Subject: [PATCH 04/14] alloc : reuse same buffer when the same buffer type is used multiple times --- ggml-alloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++---- ggml-blas.c | 3 --- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 893884dbe5700..8973ef81323e2 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs for (int i = 0; i < n_bufs; i++) { galloc->bufts[i] = bufts[i]; galloc->buffers[i] = NULL; - size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); - galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + + // check if the same buffer type is used multiple times and reuse the same allocator + for (int j = 0; j < i; j++) { + if (bufts[i] == bufts[j]) { + galloc->buf_tallocs[i] = galloc->buf_tallocs[j]; + break; + } + } + + if (galloc->buf_tallocs[i] == NULL) { + size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); + galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + } } galloc->n_buffers = n_bufs; @@ -405,10 +416,30 @@ void 
ggml_gallocr_free(ggml_gallocr_t galloc) { for (int i = 0; i < galloc->n_buffers; i++) { if (galloc->buffers != NULL) { - ggml_backend_buffer_free(galloc->buffers[i]); + // skip if already freed + bool freed = false; + for (int j = 0; j < i; j++) { + if (galloc->buffers[j] == galloc->buffers[i]) { + freed = true; + break; + } + } + if (!freed) { + ggml_backend_buffer_free(galloc->buffers[i]); + } } if (galloc->buf_tallocs != NULL) { - ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + // skip if already freed + bool freed = false; + for (int j = 0; j < i; j++) { + if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { + freed = true; + break; + } + } + if (!freed) { + ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + } } } @@ -723,6 +754,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c // reallocate buffers if needed for (int i = 0; i < galloc->n_buffers; i++) { + // if the buffer type is used multiple times, we reuse the same buffer + for (int j = 0; j < i; j++) { + if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { + galloc->buffers[i] = galloc->buffers[j]; + break; + } + } + size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); @@ -731,6 +770,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c #ifndef NDEBUG fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif + ggml_backend_buffer_free(galloc->buffers[i]); galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); if (galloc->buffers[i] == NULL) { @@ -879,6 +919,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { if (galloc->buffers[buffer_id] == NULL) { return 0; } + + for (int i = 0; i < buffer_id; i++) { + if (galloc->buffers[i] == galloc->buffers[buffer_id]) { + // this buffer is the same as a previous one due to the same buffer type being used multiple times + // only return the buffer size the first time it appears to avoid double counting + return 0; + } + } + return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } diff --git a/ggml-blas.c b/ggml-blas.c index f826e7ab85c12..edb5474ddfebe 100644 --- a/ggml-blas.c +++ b/ggml-blas.c @@ -144,9 +144,6 @@ static void ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, s // GGML_ASSERT(nb1 <= nb2); // GGML_ASSERT(nb2 <= nb3); - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) // src0: (k,n) // src1: (k,m) From 2bfdb7fe4e6c71d919bef19324f100589af6bd2f Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 03:12:50 +0200 Subject: [PATCH 05/14] support multithreaded dequantization with std::async when openmp is not available --- CMakeLists.txt | 2 +- Makefile | 6 ++-- ggml-blas.c => ggml-blas.cpp | 66 ++++++++++++++++++++---------------- 3 files changed, 41 insertions(+), 33 deletions(-) rename ggml-blas.c => ggml-blas.cpp (84%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e5baa6a460a2..d8f7780f709a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -381,7 +381,7 @@ if (LLAMA_BLAS) endif() set(GGML_HEADERS_BLAS ggml-blas.h) - set(GGML_SOURCES_BLAS ggml-blas.c) + set(GGML_SOURCES_BLAS ggml-blas.cpp) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES 
${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) diff --git a/Makefile b/Makefile index 59dd8533666d3..e57fcd9adb239 100644 --- a/Makefile +++ b/Makefile @@ -448,9 +448,6 @@ ifdef LLAMA_RPC OBJS += ggml-rpc.o endif # LLAMA_RPC -ggml-blas.o: ggml-blas.c ggml-blas.h - $(CC) $(CFLAGS) -c $< -o $@ - ifdef LLAMA_CUBLAS # LLAMA_CUBLAS is deprecated and will be removed in the future LLAMA_CUDA := 1 @@ -752,6 +749,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ +ggml-blas.o: ggml-blas.cpp ggml-blas.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + unicode.o: unicode.cpp unicode.h $(CXX) $(CXXFLAGS) -c $< -o $@ diff --git a/ggml-blas.c b/ggml-blas.cpp similarity index 84% rename from ggml-blas.c rename to ggml-blas.cpp index edb5474ddfebe..2537a4a0f0526 100644 --- a/ggml-blas.c +++ b/ggml-blas.cpp @@ -1,7 +1,8 @@ #include "ggml-blas.h" #include "ggml-backend-impl.h" -#include <stdlib.h> +#include <future> +#include <vector> #if defined(GGML_USE_ACCELERATE) # include <Accelerate/Accelerate.h> @@ -13,7 +14,7 @@ struct ggml_backend_blas_context { int n_threads; - void * work_data; + char * work_data; size_t work_size; }; @@ -41,7 +42,7 @@ static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { return false; } -static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { +static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -74,15 +75,15 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); if (ctx->work_size < desired_wsize) { - free(ctx->work_data); - ctx->work_data = malloc(desired_wsize); - GGML_ASSERT(ctx->work_data != NULL); + delete[] ctx->work_data; + ctx->work_data = new char[desired_wsize]; ctx->work_size = desired_wsize; } void * wdata = ctx->work_data; // convert src0 to float if (type != GGML_TYPE_F32) { + std::vector<std::future<void>> tasks; ggml_to_float_t const to_float = type_traits.to_float; @@ -92,12 +93,26 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st #ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(ctx->n_threads) -#endif for (int64_t i01 = 0; i01 < ne01; i01++) { to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } +#else + for (int i = 0; i < ctx->n_threads; i++) { + tasks.push_back(std::async(std::launch::async, [=]() { + const int64_t start = i*ne01/ctx->n_threads; + const int64_t end = (i + 1)*ne01/ctx->n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } +#endif } } + // wait for all tasks to finish + for (auto & task : tasks) { + task.get(); + } } for (int64_t i13 = 0; i13 < ne13; i13++) { @@ -105,7 +120,7 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); @@ -122,7 +137,7 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st } } -static void 
ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { +static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -163,7 +178,7 @@ static void ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, s int k = src0->ne[1]; int m = src1->ne[0]; - int transposeA; + CBLAS_TRANSPOSE transposeA; int lda; if (!ggml_is_transposed(src1)) { @@ -192,10 +207,10 @@ GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { } GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { - struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context; - free(ctx->work_data); - free(ctx); - free(backend); + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + delete[] ctx->work_data; + delete ctx; + delete backend; } GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { @@ -205,7 +220,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer } GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context; + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; @@ -287,20 +302,13 @@ static ggml_guid_t ggml_backend_blas_guid(void) { } ggml_backend_t ggml_backend_blas_init(void) { - ggml_backend_t backend = malloc(sizeof(struct ggml_backend)); - if (backend == NULL) { - return NULL; - } - struct ggml_backend_blas_context * ctx = malloc(sizeof(struct ggml_backend_blas_context)); - if (ctx == NULL) { - return NULL; - } - - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->work_data = NULL; - ctx->work_size = 0; + ggml_backend_blas_context * ctx = new ggml_backend_blas_context{ + /* .n_threads = */ GGML_DEFAULT_N_THREADS, + /* .work_data = */ NULL, + /* .work_size = */ 0, + }; - *backend = (struct ggml_backend) { + ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_blas_guid(), /* .interface = */ blas_backend_i, /* .context = */ ctx, @@ -316,6 +324,6 @@ GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { GGML_ASSERT(ggml_backend_is_blas(backend_blas)); - struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend_blas->context; + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; ctx->n_threads = n_threads; } From 0425305d3249be9626b008d559d44b974300dea8 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 19:54:14 +0200 Subject: [PATCH 06/14] fix apple build --- CMakeLists.txt | 11 ++++++++--- Makefile | 2 +- ggml-alloc.c | 1 - ggml-blas.cpp | 12 ++++++++---- ggml.c | 2 +- llama.cpp | 8 ++++---- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d8f7780f709a6..5576c26e1cc5e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,8 +39,12 @@ endif() if (APPLE) set(LLAMA_METAL_DEFAULT ON) + set(LLAMA_BLAS_DEFAULT ON) + set(LLAMA_BLAS_VENDOR_DEFAULT "Apple") else() set(LLAMA_METAL_DEFAULT OFF) + set(LLAMA_BLAS_DEFAULT OFF) + set(LLAMA_BLAS_VENDOR_DEFAULT 
"Generic") endif() set(LLAMA_LLAMAFILE_DEFAULT ON) @@ -91,8 +95,9 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) -option(LLAMA_BLAS "llama: use BLAS" OFF) -set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") +option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT}) +set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING + "llama: BLAS library vendor") option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) option(LLAMA_CUDA "llama: use CUDA" OFF) option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) @@ -321,7 +326,7 @@ if (LLAMA_BLAS) if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - if ("${BLAS_INCLUDE_DIRS}" STREQUAL "") + if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple")) # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 find_package(PkgConfig REQUIRED) diff --git a/Makefile b/Makefile index e57fcd9adb239..adc9fa4347de7 100644 --- a/Makefile +++ b/Makefile @@ -404,7 +404,7 @@ ifndef LLAMA_NO_ACCELERATE # Mac OS - include Accelerate framework. # `-framework Accelerate` works both with Apple Silicon and Mac Intel ifeq ($(UNAME_S),Darwin) - MK_CPPFLAGS += -DGGML_USE_ACCELERATE + MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_LDFLAGS += -framework Accelerate diff --git a/ggml-alloc.c b/ggml-alloc.c index 8973ef81323e2..0048e5c922e79 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -706,7 +706,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - //node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); if (node->view_src || node->data) { node_alloc->dst.buffer_id = -1; node_alloc->dst.offset = SIZE_MAX; diff --git a/ggml-blas.cpp b/ggml-blas.cpp index 2537a4a0f0526..608ead1902a02 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -16,6 +16,7 @@ struct ggml_backend_blas_context { int n_threads; char * work_data; size_t work_size; + std::vector> tasks; }; // helper function to determine if it is better to use BLAS or not @@ -33,7 +34,7 @@ static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->type == GGML_TYPE_F32 && - ((src0->type == GGML_TYPE_F32) || (ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ return true; @@ -83,7 +84,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg // convert src0 to float if (type != GGML_TYPE_F32) { - std::vector> tasks; ggml_to_float_t const to_float = type_traits.to_float; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -98,7 +98,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg } #else for (int i = 0; i < ctx->n_threads; i++) { - tasks.push_back(std::async(std::launch::async, [=]() { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { const int64_t start = i*ne01/ctx->n_threads; const int64_t end = (i + 1)*ne01/ctx->n_threads; for (int64_t i01 = start; i01 < end; i01++) { @@ -109,10 +109,14 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg #endif } } + 
+#ifndef GGML_USE_OPENMP // wait for all tasks to finish - for (auto & task : tasks) { + for (auto & task : ctx->tasks) { task.get(); } + ctx->tasks.clear(); +#endif } for (int64_t i13 = 0; i13 < ne13; i13++) { diff --git a/ggml.c b/ggml.c index e4ef34f2565fb..01589c10e9cec 100644 --- a/ggml.c +++ b/ggml.c @@ -22645,7 +22645,7 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) return 1; #else return 0; diff --git a/llama.cpp b/llama.cpp index 57d007f3313a3..ec087a0a31173 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,7 +21,7 @@ # include "ggml-kompute.h" #endif -#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) +#ifdef GGML_USE_BLAS # include "ggml-blas.h" #endif @@ -2303,7 +2303,7 @@ struct llama_context { #ifdef GGML_USE_METAL ggml_backend_t backend_metal = nullptr; #endif -#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) +#ifdef GGML_USE_BLAS ggml_backend_t backend_blas = nullptr; #endif ggml_backend_t backend_cpu = nullptr; @@ -12025,7 +12025,7 @@ static void llama_graph_compute( ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } -#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) +#ifdef GGML_USE_BLAS if (lctx.backend_blas != nullptr) { ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads); } @@ -16240,7 +16240,7 @@ struct llama_context * llama_new_context_with_model( } #endif -#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) +#ifdef GGML_USE_BLAS ctx->backend_blas = ggml_backend_blas_init(); if (ctx->backend_blas == nullptr) { LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__); From 2dd049ed4527d093a63f8b44d4759dfb03863f0d Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 7 Jun 2024 02:18:27 +0200 Subject: [PATCH 07/14] ggml-ci --- ggml-blas.cpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/ggml-blas.cpp b/ggml-blas.cpp index 608ead1902a02..92d0e66372cee 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -13,10 +13,12 @@ #endif struct ggml_backend_blas_context { - int n_threads; - char * work_data; - size_t work_size; + int n_threads = GGML_DEFAULT_N_THREADS; + std::unique_ptr<char[]> work_data; + size_t work_size = 0; +#ifndef GGML_USE_OPENMP std::vector<std::future<void>> tasks; +#endif }; // helper function to determine if it is better to use BLAS or not @@ -76,11 +78,10 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg const size_t desired_wsize = type == GGML_TYPE_F32 ? 
0 : ne03*ne02*ne_plane*sizeof(float); if (ctx->work_size < desired_wsize) { - delete[] ctx->work_data; - ctx->work_data = new char[desired_wsize]; + ctx->work_data.reset(new char[desired_wsize]); ctx->work_size = desired_wsize; } - void * wdata = ctx->work_data; + void * wdata = ctx->work_data.get(); // convert src0 to float @@ -212,7 +213,6 @@ GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; - delete[] ctx->work_data; delete ctx; delete backend; } @@ -306,11 +306,7 @@ static ggml_guid_t ggml_backend_blas_guid(void) { } ggml_backend_t ggml_backend_blas_init(void) { - ggml_backend_blas_context * ctx = new ggml_backend_blas_context{ - /* .n_threads = */ GGML_DEFAULT_N_THREADS, - /* .work_data = */ NULL, - /* .work_size = */ 0, - }; + ggml_backend_blas_context * ctx = new ggml_backend_blas_context; ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_blas_guid(), From 63e06d0c28452b2618e620316f14ad06fd89ecd5 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 7 Jun 2024 04:35:11 +0200 Subject: [PATCH 08/14] reuse main thread --- ggml-blas.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ggml-blas.cpp b/ggml-blas.cpp index 92d0e66372cee..ade10b9ac586a 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -98,15 +98,23 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } #else - for (int i = 0; i < ctx->n_threads; i++) { + for (int i = 0; i < ctx->n_threads - 1; i++) { ctx->tasks.push_back(std::async(std::launch::async, [=]() { - const int64_t start = i*ne01/ctx->n_threads; + const int64_t start = i*ne01/ctx->n_threads; const int64_t end = (i + 1)*ne01/ctx->n_threads; for (int64_t i01 = start; i01 < end; i01++) { to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } })); } + { + // reuse the current thread for the last task + const int64_t start = (ctx->n_threads - 1)*ne01/ctx->n_threads; + const int64_t end = ne01; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + } #endif } } From a8a1bf798104568d0cdad7011fd40c3ae228467c Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 7 Jun 2024 19:29:35 +0200 Subject: [PATCH 09/14] set number of threads automatically for openblas and blis --- ggml-blas.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ggml-blas.cpp b/ggml-blas.cpp index ade10b9ac586a..3d146fc013913 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -10,6 +10,9 @@ # include <mkl.h> #else # include <cblas.h> +# ifdef BLIS_ENABLE_CBLAS +# include <blis.h> +# endif #endif struct ggml_backend_blas_context { @@ -128,6 +131,15 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg #endif } + +#if defined(OPENBLAS_VERSION) + openblas_set_num_threads(ctx->n_threads); +#endif + +#if defined(BLIS_ENABLE_CBLAS) + bli_thread_set_num_threads(ctx->n_threads); +#endif + for (int64_t i13 = 0; i13 < ne13; i13++) { for (int64_t i12 = 0; i12 < ne12; i12++) { const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; @@ -322,6 +334,16 @@ ggml_backend_t ggml_backend_blas_init(void) { /* .context = */ ctx, }; +#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) + if (openblas_get_parallel() != OPENBLAS_OPENMP) { + fprintf(stderr, "%s: warning: ggml is using 
OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + } +#endif + +#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) + fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); +#endif + return backend; } From ecb75b5f54cab6ca7f77ec51eb5f7d87c87be6cd Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 7 Jun 2024 20:00:38 +0200 Subject: [PATCH 10/14] sched : print assignments when GGML_SCHED_DEBUG env variable is set --- ggml-backend.c | 45 +++++++++++++++++---------------------------- ggml-blas.h | 1 + llama.cpp | 34 ++++++++++++---------------------- 3 files changed, 30 insertions(+), 50 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 2cc9e09a8385f..80e129cf843f8 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1078,6 +1078,8 @@ struct ggml_backend_sched { ggml_backend_sched_eval_callback callback_eval; void * callback_eval_user_data; + bool debug; + // align context_buffer to GGML_MEM_ALIGN #ifdef _MSC_VER __declspec(align(GGML_MEM_ALIGN)) @@ -1130,11 +1132,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED #define GET_CAUSE(node) "" #endif -//#define DEBUG_PASS1 -//#define DEBUG_PASS2 -//#define DEBUG_PASS3 -//#define DEBUG_PASS4 - // returns the backend that should be used for the node based on the current locations static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) { // TODO: use supports_op to check if the backend supports the op @@ -1232,7 +1229,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } } -static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { +static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.2"); @@ -1252,7 +1249,7 @@ static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node return cur_backend_id; } -static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) { +static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) { ggml_backend_buffer_t buf = t->view_src ? 
t->view_src->buffer : t->buffer; ggml_backend_buffer_type_t buft = NULL; @@ -1322,9 +1319,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } -#ifdef DEBUG_PASS1 - fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif // pass 2: expand current backend assignments // assign the same backend to adjacent nodes @@ -1350,7 +1344,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } else if (cur_backend_id != -1) { // FIXME: clean this - cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); if (cur_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; @@ -1375,7 +1369,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { - cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); if (cur_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; @@ -1395,7 +1389,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { - cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); } } } @@ -1411,15 +1405,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { - cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); } } } -#ifdef DEBUG_PASS2 - fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif - // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1446,9 +1436,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } -#ifdef DEBUG_PASS3 - fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif // pass 4: split graph, find tensors that need to be copied { @@ -1499,7 +1486,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); int src_backend_id = sched->tensor_backend_id[id]; - bool supported = buffer_supported(sched, src, cur_backend_id); + bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) { //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; @@ -1560,7 +1547,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - bool supported = buffer_supported(sched, src, cur_backend_id); + bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); if 
(src_backend_id != cur_backend_id && !supported) { // create a copy of the input in the split's backend const size_t id = hash_id(src); @@ -1587,12 +1574,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split->i_end = graph->n_nodes; sched->n_splits = i_split + 1; } -#ifdef DEBUG_PASS4 - fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif - // create copies of the graph for each split - // TODO: avoid this copy + if (sched->debug) { + ggml_backend_sched_print_assignments(sched, graph); + } // swap node_backend_ids and leaf_backend_ids and prevs { @@ -1605,6 +1590,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->prev_leaf_backend_ids = tmp; } + // create copies of the graph for each split + // TODO: avoid this copy struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; @@ -1805,6 +1792,8 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); + sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; + // initialize hash table sched->hash_set = ggml_hash_set_new(graph_size); sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0])); diff --git a/ggml-blas.h b/ggml-blas.h index 646ca84ef8122..f2e37de06f609 100644 --- a/ggml-blas.h +++ b/ggml-blas.h @@ -14,6 +14,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); // number of threads used for conversion to float +// for openblas and blis, this will also set the number of threads used for blas operations GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); diff --git a/llama.cpp b/llama.cpp index 7e76c022b75d9..225ea977f4612 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11533,17 +11533,18 @@ static struct ggml_cgraph * llama_build_graph( // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - //const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; - //if (batch.n_tokens < 32 || full_offload) { - // if (il != -1 && strcmp(name, "norm") == 0) { - // for (auto * backend : lctx.backends) { - // if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { - // ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); - // break; - // } - // } - // } - //} + const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; + if (batch.n_tokens < 32 || full_offload) { + if (il != -1 && strcmp(name, "norm") == 0) { + for (auto * backend : lctx.backends) { + if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) && + (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) { + ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); + break; + } + } + } + } }; struct ggml_cgraph * result = NULL; @@ -12261,17 +12262,6 @@ static int llama_decode_internal( } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, 
the threads are spin-lock waiting for the BLAS calls and are degrading the performance - // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well - // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering - // with the BLAS calls. need a better solution - // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is - // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. - //if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { - // n_threads = std::min(4, n_threads); - //} - ggml_backend_sched_alloc_graph(lctx.sched, gf); llama_set_inputs(lctx, u_batch); From e06659811e118462b22ed5c16ed3106544d522d9 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 8 Jun 2024 17:00:26 +0200 Subject: [PATCH 11/14] fixes --- examples/llama-bench/llama-bench.cpp | 1 + ggml-backend.c | 56 +++++++++++++++++++++++----- ggml-blas.cpp | 36 ++++++++++-------- ggml.c | 18 ++++++--- 4 files changed, 80 insertions(+), 31 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 5c31548a6c25c..2a263d2a281b1 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.output_format = cmd_params_defaults.output_format; params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; for (int i = 1; i < argc; i++) { arg = argv[i]; diff --git a/ggml-backend.c b/ggml-backend.c index 80e129cf843f8..68094e0543a41 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1232,7 +1232,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.2"); + SET_CAUSE(node, "2.1"); } else { for (int b = 0; b < sched->n_backends; b++) { if (b == cur_backend_id) { @@ -1326,7 +1326,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops - // pass 2.2 expand gpu down + // expand gpu down { int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { @@ -1352,7 +1352,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } - // pass 2.1 expand gpu up + // expand gpu up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1377,7 +1377,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } - // pass 2.4 expand rest down + // expand rest down { int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { @@ -1393,7 +1393,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } - // pass 2.3 expand rest up + // expand rest up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1410,13 +1410,48 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - // pass 3: assign backends to remaining src from dst and view_src + // pass 3 + // upgrade nodes to 
higher prio backends with compatible buffer types + // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there + // however, we also need to verify that the sources are in compatible buffer types + // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph + // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same + // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU) + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + int * node_backend_id = &tensor_backend_id(node); + for (int b = 0; b < *node_backend_id; b++) { + if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) { + bool supported = true; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if (!ggml_backend_sched_buffer_supported(sched, src, b)) { + supported = false; + break; + } + } + if (supported) { + *node_backend_id = b; + SET_CAUSE(node, "3.upg"); + break; + } + } + } + } + + // pass 4: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * cur_backend_id = &tensor_backend_id(node); if (node->view_src != NULL && *cur_backend_id == -1) { *cur_backend_id = tensor_backend_id(node->view_src); - SET_CAUSE(node, "3.vsrc"); + SET_CAUSE(node, "4.vsrc"); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -1428,10 +1463,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (src->view_src != NULL) { // views are always on the same backend as the source *src_backend_id = tensor_backend_id(src->view_src); - SET_CAUSE(src, "3.vsrc"); + SET_CAUSE(src, "4.vsrc"); } else { *src_backend_id = *cur_backend_id; - SET_CAUSE(src, "3.cur"); + SET_CAUSE(src, "4.cur"); } } } @@ -1848,6 +1883,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { free(sched->tensor_copies); free(sched->node_backend_ids); free(sched->leaf_backend_ids); + free(sched->prev_node_backend_ids); + free(sched->prev_leaf_backend_ids); free(sched); } @@ -1944,6 +1981,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); tensor_backend_id(node) = backend_index; + SET_CAUSE(node, "usr"); } ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { diff --git a/ggml-blas.cpp b/ggml-blas.cpp index 3d146fc013913..089c73dd34060 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -56,8 +56,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg const enum ggml_type type = src0->type; - ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); - GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); GGML_ASSERT(ne2 == ne12); @@ -88,32 +86,39 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg // convert src0 to float if (type != GGML_TYPE_F32) { + ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); ggml_to_float_t const to_float = 
type_traits.to_float; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; + float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); + const int n_threads = std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)); #ifdef GGML_USE_OPENMP - #pragma omp parallel for num_threads(ctx->n_threads) + #pragma omp parallel for num_threads(n_threads) for (int64_t i01 = 0; i01 < ne01; i01++) { to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } #else - for (int i = 0; i < ctx->n_threads - 1; i++) { - ctx->tasks.push_back(std::async(std::launch::async, [=]() { - const int64_t start = i*ne01/ctx->n_threads; - const int64_t end = (i + 1)*ne01/ctx->n_threads; - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } - })); + for (int i = 1; i < n_threads; i++) { + const int64_t start = i*ne01/n_threads; + const int64_t end = (i + 1)*ne01/n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } } { // reuse the current thread for the last task - const int64_t start = (ctx->n_threads - 1)*ne01/ctx->n_threads; - const int64_t end = ne01; + const int64_t start = 0; + const int64_t end = ne01/n_threads; for (int64_t i01 = start; i01 < end; i01++) { to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } @@ -131,7 +136,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg #endif } - #if defined(OPENBLAS_VERSION) openblas_set_num_threads(ctx->n_threads); #endif @@ -150,7 +154,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); if (type != GGML_TYPE_F32) { - x = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; + x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; } cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, diff --git a/ggml.c b/ggml.c index a5d143b5cb13a..e372fee932b4a 100644 --- a/ggml.c +++ b/ggml.c @@ -18749,6 +18749,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: + case GGML_OP_CONT: case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_ACC: @@ -18833,7 +18834,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ } break; case GGML_OP_SCALE: case GGML_OP_SET: - case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: @@ -18993,8 +18993,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput sched_yield(); } - * node_n = atomic_load(&state->shared->node_n); - if (* node_n != last_node_n) break; + *node_n = atomic_load(&state->shared->node_n); + if (*node_n != last_node_n) { + break; + } + #if defined(__SSE3__) // Tell the processor we're spinning. It's a processor hint for spinlocks. 
_mm_pause(); @@ -19004,15 +19007,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) { // wait for other threads to finish - const int last_task_phase = * task_phase; + const int last_task_phase = *task_phase; while (true) { if (do_yield) { sched_yield(); } - * task_phase = atomic_load(&state->shared->node_task); - if (* task_phase != last_task_phase) break; + *task_phase = atomic_load(&state->shared->node_task); + if (*task_phase != last_task_phase) { + break; + } + #if defined(__SSE3__) // Tell the processor we're spinning. It's a processor hint for spinlocks. _mm_pause(); From a54b791211823f0c0cbf74aa317c09e501440967 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 12 Jun 2024 10:32:20 +0200 Subject: [PATCH 12/14] Apply suggestions from code review Co-authored-by: Georgi Gerganov --- ggml-blas.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-blas.cpp b/ggml-blas.cpp index 089c73dd34060..d709a357bbf29 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -96,7 +96,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg const int min_cols_per_thread = 4096; const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); - const int n_threads = std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); #ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(n_threads) @@ -116,7 +116,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg } } { - // reuse the current thread for the last task + // reuse the current thread for the first task const int64_t start = 0; const int64_t end = ne01/n_threads; for (int64_t i01 = start; i01 < end; i01++) { From ae9cd856980696e26e1d7f8df3737572ea304927 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 13 Jun 2024 02:04:06 +0200 Subject: [PATCH 13/14] fix metal being used in layers not offloaded --- ggml-backend.c | 112 ++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 53 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 68094e0543a41..21636a6ae5061 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1229,27 +1229,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } } -static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { - if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.1"); - } else { - for (int b = 0; b < sched->n_backends; b++) { - if (b == cur_backend_id) { - continue; - } - if (ggml_backend_supports_op(sched->backends[b], node)) { - *node_backend_id = b; - cur_backend_id = b; - SET_CAUSE(node, "2.2"); - break; - } - } - } - return cur_backend_id; -} - -static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) { +static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) { ggml_backend_buffer_t buf = t->view_src ? 
t->view_src->buffer : t->buffer; ggml_backend_buffer_type_t buft = NULL; @@ -1259,12 +1239,22 @@ static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, stru } else { // see if the tensor already has a backend assigned, and use the buffer type of that backend int tensor_backend_id = tensor_backend_id(t); + if (tensor_backend_id == -1 && t->view_src) { + tensor_backend_id = tensor_backend_id(t->view_src); + } if (tensor_backend_id != -1) { buft = sched->bufts[tensor_backend_id]; } } - return buft != NULL && ggml_backend_supports_buft(sched->backends[cur_backend_id], buft); + return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft); +} + +static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { + if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { + *node_backend_id = cur_backend_id; + SET_CAUSE(node, "2.sup"); + } } // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend @@ -1324,8 +1314,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // assign the same backend to adjacent nodes // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend) // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops - - + // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known // expand gpu down { int cur_backend_id = -1; @@ -1343,12 +1332,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { - // FIXME: clean this - cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); - if (cur_backend_id == sched->n_backends - 1) { - // skip cpu (lowest prio backend) - cur_backend_id = -1; - } + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } @@ -1369,11 +1353,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { - cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); - if (cur_backend_id == sched->n_backends - 1) { - // skip cpu (lowest prio backend) - cur_backend_id = -1; - } + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } @@ -1389,7 +1369,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { - cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } @@ -1405,41 +1385,67 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { - cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } - // pass 3 - // upgrade nodes to higher prio backends with compatible buffer types + // pass 3: upgrade nodes to higher prio backends with 
compatible buffer types // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there // however, we also need to verify that the sources are in compatible buffer types // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU) + // additionally, set remaining unassigned nodes to the backend with the most supported inputs + // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } int * node_backend_id = &tensor_backend_id(node); - for (int b = 0; b < *node_backend_id; b++) { - if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) { - bool supported = true; - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; + if (*node_backend_id == -1) { + // unassigned node: find the backend with the most supported inputs + int n_supported_best = -1; + for (int b = 0; b < sched->n_backends; b++) { + if (ggml_backend_supports_op(sched->backends[b], node)) { + int n_supported = 0; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) { + n_supported++; + } } - if (!ggml_backend_sched_buffer_supported(sched, src, b)) { - supported = false; - break; + if (n_supported > n_supported_best) { + n_supported_best = n_supported; + *node_backend_id = b; + SET_CAUSE(node, "3.best"); } } - if (supported) { - *node_backend_id = b; - SET_CAUSE(node, "3.upg"); - break; + } + } else { + // assigned node: upgrade to higher prio backend if possible + for (int b = 0; b < *node_backend_id; b++) { + if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) { + bool supported = true; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if (!ggml_backend_sched_buffer_supported(sched, src, b)) { + supported = false; + break; + } + } + if (supported) { + *node_backend_id = b; + SET_CAUSE(node, "3.upg"); + break; + } } } } From 211fb045f1c9cd4c949389817624ec510c0fcefd Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 13 Jun 2024 02:38:36 +0200 Subject: [PATCH 14/14] sched : allow ops with weights on an incompatible buffer type This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment. 
ggml-ci
---
 ggml-backend.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 21636a6ae5061..2bec7bea38a85 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1116,9 +1116,10 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
         }
     }
 
-    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
-            __func__, ggml_backend_buffer_name(buffer), tensor->name);
-    GGML_ASSERT(false);
+#ifndef NDEBUG
+    fprintf(stderr, "%s: warning: no backend supports op %s with a weight of buffer type %s used in tensor %s; the weight will need to be copied\n",
+            __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
+#endif
 
     return -1;
 }
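
Pass 3 in patches 11 and 13 upgrades an already-assigned node to a higher-priority backend only when that backend uses the same buffer type and supports both the op and the buffer of every source. A condensed sketch of just that predicate, with function-pointer stand-ins for the scheduler internals (the struct and helper names are illustrative, not part of the patches):

    // Condensed sketch of the pass-3 upgrade predicate. Backend ids are ordered
    // by priority, so b < cur means b has higher priority than cur.
    struct upgrade_ctx {
        bool (*same_buft)(int a, int b);                             // sched->bufts[a] == sched->bufts[b]
        bool (*supports_op)(int backend, const void * node);         // ggml_backend_supports_op
        bool (*supports_src_buffer)(int backend, const void * src);  // ggml_backend_sched_buffer_supported
    };

    static bool can_upgrade(const upgrade_ctx * ctx, const void * node,
                            const void * const * srcs, int n_srcs, int b, int cur) {
        if (b >= cur || !ctx->same_buft(b, cur) || !ctx->supports_op(b, node)) {
            return false;
        }
        for (int j = 0; j < n_srcs; j++) {
            if (srcs[j] != nullptr && !ctx->supports_src_buffer(b, srcs[j])) {
                return false;   // a source lives in a buffer this backend cannot use
            }
        }
        return true;   // same buffer type, op supported, all sources reachable
    }

The same-buffer-type requirement is the deliberately strict stand-in the pass-3 comments describe: checking every downstream user of the tensor would be exact but slow, and host-memory backends such as BLAS and CPU share a buffer type anyway, so the strict check still fires in the common case.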
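
The conversion-thread heuristic that patch 11 adds to ggml_backend_blas_mul_mat, and that the review fix in patch 12 clamps, is easiest to read as standalone arithmetic: a thread is only worth spawning once it has roughly 4096 elements of dequantization work. A minimal sketch of that computation (the helper name and signature are illustrative):

    #include <algorithm>
    #include <cstdint>

    // Sketch of the heuristic from patches 11/12: give each conversion thread at
    // least ~4096 elements (min_cols_per_thread) of work, and never use fewer
    // than 1 thread. ne00/ne01 are src0's row length and row count; max_threads
    // is the configured ctx->n_threads.
    static int blas_conv_n_threads(int64_t ne00, int64_t ne01, int max_threads) {
        const int min_cols_per_thread = 4096;
        const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
        return std::max(std::min(max_threads, (int)(ne01/min_rows_per_thread)), 1);
    }

For a small matrix, say ne00 = 64 and ne01 = 32 with 8 configured threads, min_rows_per_thread is 64 and ne01/min_rows_per_thread is 0; without the outer std::max added in patch 12 this yields n_threads == 0, which divides by zero in the slice bounds, while with it the whole conversion simply runs on the calling thread.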
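
The non-OpenMP path in the same hunk splits the rows so that std::async receives slices 1..n-1 while the calling thread takes slice 0 (the comment patch 12 corrects from "last task" to "first task"). A self-contained sketch of that pattern, with a hypothetical parallel_rows helper standing in for the inlined loop:

    #include <cstdint>
    #include <future>
    #include <vector>

    // Sketch of the ggml-blas.cpp work split: slices 1..n_threads-1 are pushed
    // as std::async tasks, slice 0 runs on the calling thread, so n_threads == 1
    // spawns no tasks at all. Empty slices (start == end) are skipped.
    template <typename F>
    static void parallel_rows(int64_t n_rows, int n_threads,
                              std::vector<std::future<void>> & tasks, F work) {
        for (int i = 1; i < n_threads; i++) {
            const int64_t start = i*n_rows/n_threads;
            const int64_t end   = (i + 1)*n_rows/n_threads;
            if (start < end) {
                tasks.push_back(std::async(std::launch::async, [=]() {
                    for (int64_t r = start; r < end; r++) { work(r); }
                }));
            }
        }
        // reuse the current thread for the first slice
        for (int64_t r = 0; r < n_rows/n_threads; r++) { work(r); }
    }

The futures accumulated in tasks still have to be waited on before the converted data is read; the patch does this by getting every pending future ahead of the sgemm calls.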
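
Patch 11 also changes the scratch-plane index in two places, from i03*ne12*ne_plane + i02*ne_plane to i02*ne_plane + i03*ne02*ne_plane. As far as the surrounding code shows, wdata holds the float-converted planes of src0, so the i03 stride must be src0's ne02 rather than src1's ne12; the two differ whenever src1 broadcasts across src0's batch dimensions. A sketch of the corrected addressing (helper name illustrative):

    #include <cstdint>

    // Sketch of the corrected wdata addressing from patch 11: wdata is a dense
    // array of ne02*ne03 float planes of src0, each ne_plane elements long.
    static inline float * wdata_plane(float * wdata, int64_t i02, int64_t i03,
                                      int64_t ne02, int64_t ne_plane) {
        // old (wrong) stride used src1's ne12: wdata + i03*ne12*ne_plane + i02*ne_plane
        return wdata + i02*ne_plane + i03*ne02*ne_plane;
    }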
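
The cleanups in ggml.c's sync loops keep the same spin-wait shape: yield if asked, reload the shared counter, and issue a pause hint while spinning. For reference, a minimal standalone version of that loop under the same assumptions as the patch (POSIX sched_yield, SSE3 _mm_pause), with a std::atomic standing in for state->shared->node_n:

    #include <atomic>
    #include <sched.h>
    #if defined(__SSE3__)
    #include <immintrin.h>
    #endif

    // Minimal sketch of the sync-node spin-wait: returns the new value once the
    // shared counter moves away from 'last'.
    static int spin_wait_for_change(std::atomic<int> & shared, int last, bool do_yield) {
        while (true) {
            if (do_yield) {
                sched_yield();   // cede the timeslice when threads are oversubscribed
            }
            const int cur = shared.load();
            if (cur != last) {
                return cur;
            }
    #if defined(__SSE3__)
            _mm_pause();         // processor hint: we are spinning on a lock
    #endif
        }
    }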