From 7f58793c5670c9607d53157c26bc9206ab534553 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Mar 2024 18:45:05 +0100 Subject: [PATCH 01/14] move BLAS to a separate backend --- CMakeLists.txt | 10 +- Makefile | 21 +++- ggml-alloc.c | 42 +++++--- ggml-backend-impl.h | 28 +++-- ggml-backend.c | 158 ++++++++++++++++++++------- ggml-backend.h | 6 +- ggml-blas.c | 257 ++++++++++++++++++++++++++++++++++++++++++++ ggml-blas.h | 22 ++++ ggml-cuda.cu | 44 ++++---- ggml-kompute.cpp | 13 +-- ggml-metal.m | 15 +-- ggml-rpc.cpp | 21 ++-- ggml-sycl.cpp | 28 ++--- ggml-vulkan.cpp | 26 ++--- ggml.c | 185 ++----------------------------- llama.cpp | 51 ++++++--- 16 files changed, 589 insertions(+), 338 deletions(-) create mode 100644 ggml-blas.c create mode 100644 ggml-blas.h diff --git a/CMakeLists.txt b/CMakeLists.txt index cf37d5bb242ac..2933e714842de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -311,9 +311,9 @@ if (LLAMA_BLAS) if (LLAMA_STATIC) set(BLA_STATIC ON) endif() - if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) - set(BLA_SIZEOF_INTEGER 8) - endif() + #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) + # set(BLA_SIZEOF_INTEGER 8) + #endif() set(BLA_VENDOR ${LLAMA_BLAS_VENDOR}) find_package(BLAS) @@ -380,6 +380,9 @@ if (LLAMA_BLAS) add_compile_definitions(GGML_BLAS_USE_MKL) endif() + set(GGML_HEADERS_BLAS ggml-blas.h) + set(GGML_SOURCES_BLAS ggml-blas.c) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) else() @@ -1255,6 +1258,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} ) diff --git a/Makefile b/Makefile index 802ee6a47654c..d45b2759bed2b 100644 --- a/Makefile +++ b/Makefile @@ -408,6 +408,7 @@ ifndef LLAMA_NO_ACCELERATE MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_LDFLAGS += -framework Accelerate + OBJS += ggml-blas.o endif endif # LLAMA_NO_ACCELERATE @@ -421,23 +422,35 @@ ifdef LLAMA_OPENBLAS MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas) + OBJS += ggml-blas.o endif # LLAMA_OPENBLAS -ifndef LLAMA_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJS += sgemm.o -endif +ifdef LLAMA_OPENBLAS64 + MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas64) + MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) + MK_LDFLAGS += $(shell pkg-config --libs openblas64) + OBJS += ggml-blas.o +endif # LLAMA_OPENBLAS64 ifdef LLAMA_BLIS MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis MK_LDFLAGS += -lblis -L/usr/local/lib + OBJS += ggml-blas.o endif # LLAMA_BLIS +ifndef LLAMA_NO_LLAMAFILE + MK_CPPFLAGS += -DGGML_USE_LLAMAFILE + OBJS += sgemm.o +endif + ifdef LLAMA_RPC MK_CPPFLAGS += -DGGML_USE_RPC OBJS += ggml-rpc.o endif # LLAMA_RPC +ggml-blas.o: ggml-blas.c ggml-blas.h + $(CC) $(CFLAGS) -c $< -o $@ + ifdef LLAMA_CUBLAS # LLAMA_CUBLAS is deprecated and will be removed in the future LLAMA_CUDA := 1 diff --git a/ggml-alloc.c b/ggml-alloc.c index 73a3c15756ba1..893884dbe5700 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -339,6 +339,7 @@ struct hash_node { }; struct tensor_alloc { + int buffer_id; size_t offset; size_t size_max; // 0 = pre-allocated, unused, or 
view }; @@ -349,7 +350,6 @@ struct leaf_alloc { }; struct node_alloc { - int buffer_id; struct tensor_alloc dst; struct tensor_alloc src[GGML_MAX_SRC]; }; @@ -511,17 +511,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor } } -static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { +static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { // graph outputs are never freed if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { AT_PRINTF("not freeing output %s\n", node->name); return; } - struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; - ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); size_t offset = hn->offset; + int buffer_id = hn->buffer_id; + struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; size_t size = ggml_backend_buft_get_alloc_size(buft, node); ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); hn->allocated = false; @@ -626,11 +627,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { - ggml_gallocr_free_node(galloc, view_src, buffer_id); + ggml_gallocr_free_node(galloc, view_src); } } else if (p_hn->allocated) { - ggml_gallocr_free_node(galloc, parent, buffer_id); + ggml_gallocr_free_node(galloc, parent); } } AT_PRINTF("\n"); @@ -674,22 +675,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); + //node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); if (node->view_src || node->data) { + node_alloc->dst.buffer_id = -1; node_alloc->dst.offset = SIZE_MAX; node_alloc->dst.size_max = 0; } else { struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); - node_alloc->dst.offset = hn->offset; - node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); + node_alloc->dst.buffer_id = hn->buffer_id; + node_alloc->dst.offset = hn->offset; + node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (!src || src->view_src || src->data) { + node_alloc->src[j].buffer_id = -1; node_alloc->src[j].offset = SIZE_MAX; node_alloc->src[j].size_max = 0; } else { struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); + node_alloc->src[j].buffer_id = hn->buffer_id; node_alloc->src[j].offset = hn->offset; node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); } @@ -706,9 +711,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); galloc->leaf_allocs[i].buffer_id = hn->buffer_id; if (leaf->view_src || leaf->data) { + galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; galloc->leaf_allocs[i].leaf.size_max = 0; } else { + galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id; galloc->leaf_allocs[i].leaf.offset 
= hn->offset; galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); } @@ -740,7 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); } -static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) { +static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { + int buffer_id = tensor_alloc->buffer_id; assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); if (tensor->view_src != NULL) { @@ -768,8 +776,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * } } -static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) { - ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id]; +static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { + ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL; size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node); return talloc->size_max >= node_size; } @@ -793,7 +801,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) { + if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) { #ifndef NDEBUG fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); #endif @@ -805,7 +813,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph if (src == NULL) { continue; } - if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) { + if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) { #ifndef NDEBUG fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); #endif @@ -846,7 +854,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i]; - ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf); + ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf); } // nodes for (int i = 0; i < graph->n_nodes; i++) { @@ -857,9 +865,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) if (src == NULL) { continue; } - ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]); + ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]); } - ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst); + ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst); } return true; diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index f121e1de420fa..36ca370867c9e 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -17,13 +17,15 @@ extern "C" { struct ggml_backend_buffer_type_i { const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); + // allocate a buffer of this type 
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); - size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size - size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding - bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + // tensor alignment + size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); + // max buffer size that can be allocated + size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); + // data size needed to allocate the tensor, including padding + size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // check if tensor data is in host memory - // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft); }; @@ -92,27 +94,37 @@ extern "C" { void (*GGML_CALL synchronize)(ggml_backend_t backend); // compute graph with a plan (not used currently) + // create a new plan for a graph ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology + void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); + // compute the graph with the plan + enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph with a plan - enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - // check if the backend supports an operation + // check if the backend can compute an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // check if the backend can use tensors allocated in a buffer type + bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft); + // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer // these should be expensive operations with large batch sizes that may benefit from running on this backend // even if the weight has to be copied from the CPU temporarily bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); // (optional) event synchronization + // create a new event that can record events on this backend instance ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); void (*GGML_CALL event_free) (ggml_backend_event_t event); + // record an event on the backend instance that created it void (*GGML_CALL event_record) (ggml_backend_event_t event); + // wait for an event on a different backend instance void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event); + // block until an event is recorded void (*GGML_CALL event_synchronize) (ggml_backend_event_t event); }; diff --git
a/ggml-backend.c b/ggml-backend.c index 05737ed696954..2cc9e09a8385f 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf return ggml_nbytes(tensor); } -bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return buft->iface.supports_backend(buft, backend); -} - bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { if (buft->iface.is_host) { return buft->iface.is_host(buft); @@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * return backend->iface.supports_op(backend, op); } +bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return backend->iface.supports_buft(backend, buft); +} + bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { if (backend->iface.offload_op != NULL) { return backend->iface.offload_op(backend, op); @@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_ GGML_UNUSED(buft); } -GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cpu(backend); - - GGML_UNUSED(buft); -} - GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; @@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, @@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, @@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(backend); +} + static struct ggml_backend_i cpu_backend_i = { /* .get_name = */ ggml_backend_cpu_name, /* .free = */ ggml_backend_cpu_free, @@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = { /* .synchronize = */ NULL, /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .supports_op = */ ggml_backend_cpu_supports_op, + /* .supports_buft = */ ggml_backend_cpu_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, @@ -1055,6 +1055,9 @@ struct ggml_backend_sched { int * node_backend_ids; // [graph_size] int * leaf_backend_ids; // [graph_size] + int * prev_node_backend_ids; // [graph_size] + int * prev_leaf_backend_ids; // [graph_size] + // copy of the graph with modified inputs struct 
ggml_cgraph * graph; @@ -1097,15 +1100,16 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen return -1; } -static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) { +static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) { ggml_backend_buffer_t buffer = tensor->buffer; if (buffer == NULL) { return -1; } - // find highest prio backend that supports the buffer type + // find highest prio backend that supports the buffer type and the op for (int i = 0; i < sched->n_backends; i++) { - if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { + if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) && + ggml_backend_supports_op(sched->backends[i], op)) { return i; } } @@ -1126,12 +1130,17 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED #define GET_CAUSE(node) "" #endif +//#define DEBUG_PASS1 +//#define DEBUG_PASS2 +//#define DEBUG_PASS3 +//#define DEBUG_PASS4 + // returns the backend that should be used for the node based on the current locations static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) { // TODO: use supports_op to check if the backend supports the op // assign pre-allocated nodes to their backend - int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor); + int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor); if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.dst"); return cur_backend_id; @@ -1139,7 +1148,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st // view_src if (tensor->view_src != NULL) { - cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); + cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor); if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.vsrc"); return cur_backend_id; @@ -1161,7 +1170,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st continue; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src); + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op if (src_backend_id == sched->n_backends - 1) { for (int b = 0; b < src_backend_id; b++) { @@ -1223,10 +1232,43 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } } -//#define DEBUG_PASS1 -//#define DEBUG_PASS2 -//#define DEBUG_PASS3 -//#define DEBUG_PASS4 +static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { + if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { + *node_backend_id = cur_backend_id; + SET_CAUSE(node, "2.2"); + } else { + for (int b = 0; b < sched->n_backends; b++) { + if (b == cur_backend_id) { + continue; + } + if (ggml_backend_supports_op(sched->backends[b], node)) { + *node_backend_id = b; + cur_backend_id = b; + SET_CAUSE(node, "2.2"); + break; + } + } + } + return cur_backend_id; +} + +static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) { + ggml_backend_buffer_t buf = t->view_src ? 
t->view_src->buffer : t->buffer; + ggml_backend_buffer_type_t buft = NULL; + + if (buf) { + // the tensor is already allocated + buft = buf->buft; + } else { + // see if the tensor already has a backend assigned, and use the buffer type of that backend + int tensor_backend_id = tensor_backend_id(t); + if (tensor_backend_id != -1) { + buft = sched->bufts[tensor_backend_id]; + } + } + + return buft != NULL && ggml_backend_supports_buft(sched->backends[cur_backend_id], buft); +} // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { @@ -1306,9 +1348,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } else { cur_backend_id = *node_backend_id; } - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.2"); + } else if (cur_backend_id != -1) { + // FIXME: clean this + cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + if (cur_backend_id == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_backend_id = -1; + } } } } @@ -1328,9 +1374,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } else { cur_backend_id = *node_backend_id; } - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.1"); + } else if (cur_backend_id != -1) { + cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + if (cur_backend_id == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_backend_id = -1; + } } } } @@ -1345,9 +1394,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.4"); + } else if (cur_backend_id != -1) { + cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); } } } @@ -1362,9 +1410,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.3"); + } else if (cur_backend_id != -1) { + cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); } } } @@ -1448,10 +1495,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } // check if the split has too many inputs + // FIXME: count the number of inputs instead of only checking when full if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); int src_backend_id = sched->tensor_backend_id[id]; - if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) { + bool supported = buffer_supported(sched, src, cur_backend_id); + if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) { //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; break; @@ -1486,7 +1535,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const int src_backend_id = tensor_backend_id(src); assert(src_backend_id != -1); // all inputs should be assigned by now - if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { + if 
(src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { size_t id = hash_id(src); if (sched->tensor_copies[id][src_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; @@ -1511,7 +1560,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (src_backend_id != node_backend_id) { + bool supported = buffer_supported(sched, src, cur_backend_id); + if (src_backend_id != cur_backend_id && !supported) { // create a copy of the input in the split's backend const size_t id = hash_id(src); if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { @@ -1543,6 +1593,18 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // create copies of the graph for each split // TODO: avoid this copy + + // swap node_backend_ids and leaf_backend_ids and prevs + { + int * tmp = sched->node_backend_ids; + sched->node_backend_ids = sched->prev_node_backend_ids; + sched->prev_node_backend_ids = tmp; + + tmp = sched->leaf_backend_ids; + sched->leaf_backend_ids = sched->prev_leaf_backend_ids; + sched->prev_leaf_backend_ids = tmp; + } + struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; @@ -1613,8 +1675,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { + bool backend_ids_changed = false; + for (int i = 0; i < sched->graph->n_nodes; i++) { + if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) { + backend_ids_changed = true; + break; + } + } + if (!backend_ids_changed) { + for (int i = 0; i < sched->graph->n_leafs; i++) { + if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) { + backend_ids_changed = true; + break; + } + } + } + // allocate graph - if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG @@ -1735,6 +1813,8 @@ ggml_backend_sched_t ggml_backend_sched_new( const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); + sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); + sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); sched->n_backends = n_backends; @@ -1747,7 +1827,7 @@ ggml_backend_sched_t ggml_backend_sched_new( for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; sched->bufts[b] = bufts ? 
bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); - GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b])); + GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b])); if (sched->n_copies > 1) { for (int c = 0; c < sched->n_copies; c++) { sched->events[b][c] = ggml_backend_event_new(backends[b]); diff --git a/ggml-backend.h b/ggml-backend.h index c582b06850ed1..47fd814751795 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -23,7 +23,6 @@ extern "C" { GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer @@ -74,6 +73,7 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); + GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends @@ -90,7 +90,7 @@ extern "C" { GGML_API void ggml_backend_event_free (ggml_backend_event_t event); GGML_API void ggml_backend_event_record (ggml_backend_event_t event); GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); - GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event + GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // // CPU backend @@ -119,7 +119,7 @@ extern "C" { GGML_API size_t ggml_backend_reg_get_count(void); GGML_API size_t ggml_backend_reg_find_by_name(const char * name); - GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params] + GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional) GGML_API const char * ggml_backend_reg_get_name(size_t i); GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i); diff --git a/ggml-blas.c b/ggml-blas.c new file mode 100644 index 0000000000000..dab6fcf47a8ee --- /dev/null +++ b/ggml-blas.c @@ -0,0 +1,257 @@ +#include "ggml-blas.h" +#include "ggml-backend-impl.h" + +#include <stdlib.h> + +#if defined(GGML_USE_ACCELERATE) +# include <Accelerate/Accelerate.h> +#elif defined(GGML_USE_OPENBLAS) +# if defined(GGML_BLAS_USE_MKL) +# include <mkl.h> +# else +# include <cblas.h> +# endif +#endif + +struct ggml_backend_blas_context { + int n_threads; + void * work_data; + size_t work_size; +}; + +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_compute_forward_mul_mat_use_blas(const struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + //const int64_t ne00 = src0->ne[0]; + //const int64_t ne01 =
src0->ne[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + ((src0->type == GGML_TYPE_F32) || (ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { + + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; + } + + return false; +} + +static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const enum ggml_type type = src0->type; + + ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + const int64_t ne_plane = ne01*ne00; + const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne13*ne12*ne_plane*sizeof(float); + + if (ctx->work_size < desired_wsize) { + free(ctx->work_data); + ctx->work_data = malloc(desired_wsize); + GGML_ASSERT(ctx->work_data != NULL); + ctx->work_size = desired_wsize; + } + void * wdata = ctx->work_data; + + // convert src0 to float + if (true) { + if (type != GGML_TYPE_F32) { + ggml_to_float_t const to_float = type_traits.to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for num_threads(ctx->n_threads) +#endif + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + } + } + } + } + + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + x = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; + } + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne1, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } +} + +// backend interface + +GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { + return "BLAS"; + + GGML_UNUSED(backend); +} + +GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { + free(backend); +} + +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(backend); +} + +GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context 
*)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_backend_blas_mul_mat(ctx, node); + break; + + // TODO + //case GGML_OP_OUT_PROD: + + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + + default: + fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + GGML_ASSERT(false); + } + } + + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return op->op == GGML_OP_MUL_MAT && ggml_compute_forward_mul_mat_use_blas(op); + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(backend); +} + +static struct ggml_backend_i blas_backend_i = { + /* .get_name = */ ggml_backend_blas_name, + /* .free = */ ggml_backend_blas_free, + /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_blas_graph_compute, + /* .supports_op = */ ggml_backend_blas_supports_op, + /* .supports_buft = */ ggml_backend_blas_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static ggml_guid_t ggml_backend_blas_guid(void) { + static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + return &guid; +} + +ggml_backend_t ggml_backend_blas_init(void) { + ggml_backend_t backend = malloc(sizeof(struct ggml_backend)); + if (backend == NULL) { + return NULL; + } + struct ggml_backend_blas_context * ctx = malloc(sizeof(struct ggml_backend_blas_context)); + if (ctx == NULL) { + return NULL; + } + + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; + + *backend = (struct ggml_backend) { + /* .guid = */ ggml_backend_blas_guid(), + /* .interface = */ blas_backend_i, + /* .context = */ ctx, + }; + + return backend; +} + +GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); +} + +void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { + GGML_ASSERT(ggml_backend_is_blas(backend_blas)); + + struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend_blas->context; + ctx->n_threads = n_threads; +} diff --git a/ggml-blas.h b/ggml-blas.h new file mode 100644 index 0000000000000..646ca84ef8122 --- /dev/null +++ b/ggml-blas.h @@ -0,0 +1,22 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); + +GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); + +// number of threads used for conversion to float +GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); + + 
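A minimal usage sketch (illustrative only, not part of this patch): how a caller might drive the new backend through the API declared above. It assumes a cgraph built elsewhere with its tensors allocated in a host buffer, since the backend uses the CPU buffer type, and omits most error handling. In practice the backend is intended to be paired with the CPU backend through ggml_backend_sched, because it only implements GGML_OP_MUL_MAT.

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-blas.h"

// hypothetical helper: compute a prebuilt graph on the BLAS backend
static enum ggml_status compute_with_blas(struct ggml_cgraph * graph, int n_threads) {
    ggml_backend_t backend = ggml_backend_blas_init();  // NULL on allocation failure
    if (backend == NULL) {
        return GGML_STATUS_FAILED;
    }
    // threads used to convert (dequantize) src0 to float before the sgemm calls
    ggml_backend_blas_set_n_threads(backend, n_threads);
    enum ggml_status status = ggml_backend_graph_compute(backend, graph);
    ggml_backend_free(backend);
    return status;
}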
+#ifdef __cplusplus +} +#endif diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c81c6a0d783be..c2c9940bff05e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -543,6 +543,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu return ctx->name.c_str(); } +static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cuda_buffer_type_name; +} + GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; @@ -585,24 +589,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen GGML_UNUSED(buft); } -GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_cuda(backend)) { - return false; - } - - ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - return buft_ctx->device == cuda_ctx->device; -} - static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -929,6 +921,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back GGML_UNUSED(buft); } +static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name; +} + GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor @@ -972,12 +968,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cuda(backend); - - GGML_UNUSED(buft); -} - GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -990,7 +980,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; @@ -1090,7 +1079,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ 
ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -2919,6 +2907,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (ggml_backend_buft_is_cuda_split(buft)) { + return true; + } + + if (ggml_backend_buft_is_cuda(buft)) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + return buft_ctx->device == cuda_ctx->device; + } + + return false; +} + GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { const int min_batch_size = 32; @@ -2991,9 +2993,11 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .synchronize = */ ggml_backend_cuda_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, + /* .supports_buft = */ ggml_backend_cuda_supports_buft, /* .offload_op = */ ggml_backend_cuda_offload_op, /* .event_new = */ ggml_backend_cuda_event_new, /* .event_free = */ ggml_backend_cuda_event_free, diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index eabd70d5eeed8..721080031157f 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1899,18 +1899,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_ return ctx->max_alloc; } -static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - GGML_UNUSED(buft); - return ggml_backend_is_kompute(backend); -} - static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = { /* .get_name = */ ggml_backend_kompute_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -1970,6 +1964,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc return ggml_vk_supports_op(op); } +static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(backend); + return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name; +} + static struct ggml_backend_i kompute_backend_i = { /* .get_name = */ ggml_backend_kompute_name, /* .free = */ ggml_backend_kompute_free, @@ -1980,9 +1979,11 @@ static struct ggml_backend_i kompute_backend_i = { /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_kompute_graph_compute, /* .supports_op = */ ggml_backend_kompute_supports_op, + /* .supports_buft = */ ggml_backend_kompute_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-metal.m b/ggml-metal.m index fddc44f78d8af..154c8052a44fd 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -3040,12 
+3040,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend UNUSED(buft); } -GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); - - UNUSED(buft); -} - GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; @@ -3060,7 +3054,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend, /* .is_host = */ ggml_backend_metal_buffer_type_is_host, }, /* .context = */ NULL, @@ -3175,6 +3168,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con return ggml_metal_supports_op(metal_ctx, op); } +GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name; + + UNUSED(backend); +} + static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, @@ -3185,9 +3184,11 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, + /* .supports_buft = */ ggml_backend_metal_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index 679ce4f280c5f..9b95193d3229d 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend return ggml_nbytes(tensor); } -GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_rpc(backend)) { - return false; - } - ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; - ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; - return buft_ctx->endpoint == rpc_ctx->endpoint; -} - static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = { /* .get_name = */ ggml_backend_rpc_buffer_type_name, /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_rpc_get_max_size, /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -638,6 +628,15 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const return false; } +GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) { + return false; + } + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + ggml_backend_rpc_context * rpc_ctx =
(ggml_backend_rpc_context *)backend->context; + return buft_ctx->endpoint == rpc_ctx->endpoint; +} + static ggml_backend_i ggml_backend_rpc_interface = { /* .get_name = */ ggml_backend_rpc_name, /* .free = */ ggml_backend_rpc_free, @@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .synchronize = */ ggml_backend_rpc_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_rpc_graph_compute, /* .supports_op = */ ggml_backend_rpc_supports_op, + /* .supports_buft = */ ggml_backend_rpc_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 5cd97e4ff98df..332f9991e1020 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -16631,22 +16631,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen UNUSED(buft); } -GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_sycl(backend)) { - return false; - } - ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; - ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - return buft_ctx->device == sycl_ctx->device; -} - static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { /* .get_name = */ ggml_backend_sycl_buffer_type_name, /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend, /* .is_host = */ nullptr, }; @@ -16998,12 +16988,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_sycl(backend); - - UNUSED(buft); -} - GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -17016,7 +17000,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend, /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host, }; @@ -17102,7 +17085,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -17367,6 +17349,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if 
(buft->iface.get_name != ggml_backend_sycl_buffer_type_name) { + return false; + } + ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; + return buft_ctx->device == sycl_ctx->device; +} static ggml_backend_i ggml_backend_sycl_interface = { /* .get_name = */ ggml_backend_sycl_name, @@ -17378,9 +17368,11 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .synchronize = */ ggml_backend_sycl_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, + /* .supports_buft = */ ggml_backend_sycl_supports_buft, /* .offload_op = */ ggml_backend_sycl_offload_op, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5e12ea9dde4d7..a07c646b90129 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -6110,24 +6110,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_ UNUSED(buft); } -GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_vk(backend)) { - return false; - } - - ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - return buft_ctx->ctx->idx == ctx->idx; -} - static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .get_name = */ ggml_backend_vk_buffer_type_name, /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -6203,7 +6191,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -6524,6 +6511,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g UNUSED(backend); } +GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { + return false; + } + + ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + return buft_ctx->ctx->idx == ctx->idx; +} + // TODO: enable async and synchronize static ggml_backend_i ggml_backend_vk_interface = { /* .get_name = */ ggml_backend_vk_name, @@ -6535,9 +6533,11 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .synchronize = */ NULL, // ggml_backend_vk_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* 
.graph_compute = */ ggml_backend_vk_graph_compute, /* .supports_op = */ ggml_backend_vk_supports_op, + /* .supports_buft = */ ggml_backend_vk_supports_buft, /* .offload_op = */ ggml_backend_vk_offload_op, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml.c b/ggml.c index 11e5c34ab56ad..0724b3b49a03e 100644 --- a/ggml.c +++ b/ggml.c @@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #if defined(GGML_USE_ACCELERATE) #include <Accelerate/Accelerate.h> -#elif defined(GGML_USE_OPENBLAS) -#if defined(GGML_BLAS_USE_MKL) -#include <mkl.h> -#else -#include <cblas.h> -#endif #endif // floating point type used to accumulate sums @@ -12216,39 +12210,6 @@ static void ggml_compute_forward_group_norm( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float) - // all the experts for each batch element and the processing would become incredibly slow - // TODO: find the optimal values for these - if (dst->op != GGML_OP_MUL_MAT_ID && - ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - //src0->type == GGML_TYPE_F32 && - src1->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} -#endif - static void ggml_compute_forward_mul_mat_one_chunk( const struct ggml_compute_params * params, struct ggml_tensor * dst, @@ -12386,73 +12347,6 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(dst)) { - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float); - UNUSED(desired_wsize); - - if (params->type == GGML_TASK_TYPE_INIT) { - if (type != GGML_TYPE_F32) { - assert(params->wsize >= desired_wsize); - // parallelize by src0 rows - for (int64_t i13 = 0; i13 < ne13; i13++) { - for (int64_t i12 = 0; i12 < ne12; i12++) { - // broadcast src0 into src1 across 2nd,3rd dimension - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; - - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; - ggml_to_float_t const to_float = type_traits[type].to_float; - - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00); - } - } - } - } - return; - } - - if (params->type == GGML_TASK_TYPE_FINALIZE) { - return; - } - - // perform sgemm, parallelization controlled by blas lib - if (ith != 0) { - return; - } - - //const int64_t tgemm0 = ggml_perf_time_us(); - for (int64_t i13 = 0; i13 < ne13; i13++) { - for (int64_t i12 = 0; i12 < ne12; i12++) { - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; - - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - const float * y = (float *) ((char *) src1->data + i12*nb12 +
i13*nb13); - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - if (type != GGML_TYPE_F32) { - x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; - } - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne1, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2); - - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif - #if GGML_USE_LLAMAFILE const bool src1_cont = ggml_is_contiguous(src1); @@ -12833,19 +12727,7 @@ static void ggml_compute_forward_out_prod_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - bool use_blas = ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1)); -#endif - if (params->type == GGML_TASK_TYPE_INIT) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst - if (use_blas) { - return; - } -#endif if (ith != 0) { return; } @@ -12857,50 +12739,6 @@ static void ggml_compute_forward_out_prod_f32( return; } -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (use_blas) { - if (params->ith != 0) { // All threads other than the first do no work. - return; - } - // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) - // src0: (k,n) - // src1: (k,m) - // dst: (m,n) - // - // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) - // Also expressed as (major,minor) - // a: (m,k): so src1 transposed - // b: (k,n): so src0 - // c: (m,n) - // - // However, if ggml_is_transposed(src1) is true, then - // src1->data already contains a transposed version, so sgemm mustn't - // transpose it further. 
- - int n = src0->ne[0]; - int k = src0->ne[1]; - int m = src1->ne[0]; - - int transposeA, lda; - - if (!ggml_is_transposed(src1)) { - transposeA = CblasTrans; - lda = m; - } else { - transposeA = CblasNoTrans; - lda = k; - } - - float * a = (float *) ((char *) src1->data); - float * b = (float *) ((char *) src0->data); - float * c = (float *) ((char *) dst->data); - - cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); - - return; - } -#endif - // dst[:,:,:,:] = 0 // for i2,i3: // for i1: @@ -13030,8 +12868,6 @@ static void ggml_compute_forward_out_prod_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; @@ -13428,6 +13264,8 @@ static void ggml_compute_forward_get_rows_q( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + dequantize_row_q( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); @@ -13471,6 +13309,8 @@ static void ggml_compute_forward_get_rows_f16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + ggml_fp16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); @@ -13514,7 +13354,9 @@ static void ggml_compute_forward_get_rows_bf16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - ggml_bf16_to_fp32_row( + assert(i01 >= 0 && i01 < ne01); + + ggml_bf16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } @@ -13557,6 +13399,8 @@ static void ggml_compute_forward_get_rows_f32( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + ggml_vec_cpy_f32(nc, (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); @@ -19504,17 +19348,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa { const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node)) { - if (node->src[0]->type != GGML_TYPE_F32) { - // here we need memory for fully dequantized matrix from src0 - // take into account that src0 can be broadcasted into src1[2,3] - cur = ggml_type_size(GGML_TYPE_F32) - * node->src[0]->ne[0]*node->src[0]->ne[1] - * node->src[1]->ne[2]*node->src[1]->ne[3]; - } - } else -#endif if (node->src[1]->type != vec_dot_type) { cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); } diff --git a/llama.cpp b/llama.cpp index 06889126ecdc4..8208786beabec 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,6 +21,10 @@ # include "ggml-kompute.h" #endif +#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) +# include "ggml-blas.h" +#endif + #ifdef GGML_USE_METAL # include "ggml-metal.h" #endif @@ -2298,9 +2302,13 @@ struct 
llama_context { std::vector<ggml_backend_t> backends; #ifdef GGML_USE_METAL ggml_backend_t backend_metal = nullptr; +#endif +#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) + ggml_backend_t backend_blas = nullptr; #endif ggml_backend_t backend_cpu = nullptr; + const llama_model & model; // key + value cache for the self attention @@ -11516,17 +11524,17 @@ static struct ggml_cgraph * llama_build_graph( // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; - if (batch.n_tokens < 32 || full_offload) { - if (il != -1 && strcmp(name, "norm") == 0) { - for (auto * backend : lctx.backends) { - if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { - ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); - break; - } - } - } - } + //const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; + //if (batch.n_tokens < 32 || full_offload) { + // if (il != -1 && strcmp(name, "norm") == 0) { + // for (auto * backend : lctx.backends) { + // if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { + // ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); + // break; + // } + // } + // } + //} }; struct ggml_cgraph * result = NULL; @@ -12017,6 +12025,11 @@ static void llama_graph_compute( ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } +#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) + if (lctx.backend_blas != nullptr) { + ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads); + } +#endif ggml_backend_sched_graph_compute_async(lctx.sched, gf); @@ -12246,9 +12259,9 @@ static int llama_decode_internal( // with the BLAS calls. need a better solution // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. 
- if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { - n_threads = std::min(4, n_threads); - } + //if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + // n_threads = std::min(4, n_threads); + //} ggml_backend_sched_alloc_graph(lctx.sched, gf); @@ -16226,6 +16239,16 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(backend); } #endif + +#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) + ctx->backend_blas = ggml_backend_blas_init(); + if (ctx->backend_blas == nullptr) { + LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__); + } else { + ctx->backends.push_back(ctx->backend_blas); + } +#endif + #if defined(GGML_USE_RPC) if (model->n_gpu_layers > 0) { for (const auto & endpoint : model->rpc_servers) { From b88957e519c3bdd5cf231522d41163b72e5d5b12 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 00:35:55 +0200 Subject: [PATCH 02/14] rename GGML_USE_OPENBLAS to GGML_USE_BLAS --- CMakeLists.txt | 2 +- Makefile | 6 +++--- ggml-blas.c | 5 +---- ggml.c | 2 +- llama.cpp | 8 ++++---- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2933e714842de..e4eaed0706eea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -374,7 +374,7 @@ if (LLAMA_BLAS) add_compile_options(${BLAS_LINKER_FLAGS}) - add_compile_definitions(GGML_USE_OPENBLAS) + add_compile_definitions(GGML_USE_BLAS) if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) add_compile_definitions(GGML_BLAS_USE_MKL) diff --git a/Makefile b/Makefile index d45b2759bed2b..59dd8533666d3 100644 --- a/Makefile +++ b/Makefile @@ -419,21 +419,21 @@ ifndef LLAMA_NO_OPENMP endif # LLAMA_NO_OPENMP ifdef LLAMA_OPENBLAS - MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas) OBJS += ggml-blas.o endif # LLAMA_OPENBLAS ifdef LLAMA_OPENBLAS64 - MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas64) + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) MK_LDFLAGS += $(shell pkg-config --libs openblas64) OBJS += ggml-blas.o endif # LLAMA_OPENBLAS64 ifdef LLAMA_BLIS - MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis + MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis MK_LDFLAGS += -lblis -L/usr/local/lib OBJS += ggml-blas.o endif # LLAMA_BLIS diff --git a/ggml-blas.c b/ggml-blas.c index dab6fcf47a8ee..6d527c041b441 100644 --- a/ggml-blas.c +++ b/ggml-blas.c @@ -5,7 +5,7 @@ #if defined(GGML_USE_ACCELERATE) # include <Accelerate/Accelerate.h> -#elif defined(GGML_USE_OPENBLAS) +#elif defined(GGML_USE_BLAS) # if defined(GGML_BLAS_USE_MKL) # include <mkl.h> # else # include <cblas.h> # endif @@ -25,9 +25,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(const struct ggml_tensor * dst const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; diff --git a/ggml.c b/ggml.c index 0724b3b49a03e..e4ef34f2565fb 100644 --- a/ggml.c +++ b/ggml.c @@ -22645,7 +22645,7 @@ int 
ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) return 1; #else return 0; diff --git a/llama.cpp b/llama.cpp index 8208786beabec..57d007f3313a3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,7 +21,7 @@ # include "ggml-kompute.h" #endif -#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) # include "ggml-blas.h" #endif @@ -2303,7 +2303,7 @@ struct llama_context { #ifdef GGML_USE_METAL ggml_backend_t backend_metal = nullptr; #endif -#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) ggml_backend_t backend_blas = nullptr; #endif ggml_backend_t backend_cpu = nullptr; @@ -12025,7 +12025,7 @@ static void llama_graph_compute( ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } -#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) if (lctx.backend_blas != nullptr) { ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads); } @@ -16240,7 +16240,7 @@ struct llama_context * llama_new_context_with_model( } #endif -#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) ctx->backend_blas = ggml_backend_blas_init(); if (ctx->backend_blas == nullptr) { LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__); From 77f88e350e10a7c0b5ded8ffeac7ae9504ad545e Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 01:40:43 +0200 Subject: [PATCH 03/14] add support for out_prod --- CMakeLists.txt | 2 +- ggml-blas.c | 124 ++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 98 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e4eaed0706eea..6e5baa6a460a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,8 +92,8 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) -option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") +option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) option(LLAMA_CUDA "llama: use CUDA" OFF) option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) diff --git a/ggml-blas.c b/ggml-blas.c index 6d527c041b441..f826e7ab85c12 100644 --- a/ggml-blas.c +++ b/ggml-blas.c @@ -5,12 +5,10 @@ #if defined(GGML_USE_ACCELERATE) # include <Accelerate/Accelerate.h> -#elif defined(GGML_USE_BLAS) -# if defined(GGML_BLAS_USE_MKL) -# include <mkl.h> -# else -# include <cblas.h> -# endif +#elif defined(GGML_BLAS_USE_MKL) +# include <mkl.h> +#else +# include <cblas.h> #endif struct ggml_backend_blas_context { @@ -21,7 +19,7 @@ struct ggml_backend_blas_context { // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas(const struct ggml_tensor * dst) { +static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { const 
struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -72,11 +70,8 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st const int64_t r2 = ne12/ne02; const int64_t r3 = ne13/ne03; - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne13*ne12*ne_plane*sizeof(float); + const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); if (ctx->work_size < desired_wsize) { free(ctx->work_data); @@ -87,21 +82,19 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st void * wdata = ctx->work_data; // convert src0 to float - if (true) { - if (type != GGML_TYPE_F32) { - ggml_to_float_t const to_float = type_traits.to_float; + if (type != GGML_TYPE_F32) { + ggml_to_float_t const to_float = type_traits.to_float; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; #ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(ctx->n_threads) #endif - for (int64_t i01 = 0; i01 < ne01; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } } } @@ -129,6 +122,70 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st } } +static void ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne03 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // c: (m,n) + // + // However, if ggml_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. 
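+    // e.g. (hypothetical shapes, added here only for illustration):
+    //   src0 = (k=8, n=4) and src1 = (k=8, m=2) give dst = (m=2, n=4),
+    //   so with src1 not transposed the call below becomes
+    //   cblas_sgemm(RowMajor, CblasTrans, CblasNoTrans, 2, 4, 8,
+    //               1.0, a, /*lda=m*/ 2, b, /*ldb=n*/ 4, 0.0, c, /*ldc=n*/ 4)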
+ + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + int transposeA; + int lda; + + if (!ggml_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + GGML_UNUSED(ctx); +} + // backend interface GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { @@ -138,6 +195,9 @@ GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { } GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { + struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context; + free(ctx->work_data); + free(ctx); free(backend); } @@ -158,8 +218,9 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t ggml_backend_blas_mul_mat(ctx, node); break; - // TODO - //case GGML_OP_OUT_PROD: + case GGML_OP_OUT_PROD: + ggml_backend_blas_out_prod(ctx, node); + break; case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -180,7 +241,16 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t } GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - return op->op == GGML_OP_MUL_MAT && ggml_compute_forward_mul_mat_use_blas(op); + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) || + (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); GGML_UNUSED(backend); } @@ -229,9 +299,9 @@ ggml_backend_t ggml_backend_blas_init(void) { return NULL; } - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->work_data = NULL; - ctx->work_size = 0; + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; *backend = (struct ggml_backend) { /* .guid = */ ggml_backend_blas_guid(), From 845fa20f26fed397c53a3d2eb09d610650ca8657 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 02:16:13 +0200 Subject: [PATCH 04/14] alloc : reuse same buffer when the same buffer type is used multiple times --- ggml-alloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++---- ggml-blas.c | 3 --- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 893884dbe5700..8973ef81323e2 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs for (int i = 0; i < n_bufs; i++) { galloc->bufts[i] = bufts[i]; galloc->buffers[i] = NULL; - size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); - galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + + // check if the same buffer type is used multiple times and reuse the same allocator + for (int j = 0; j < i; j++) { + if (bufts[i] == bufts[j]) { + galloc->buf_tallocs[i] = galloc->buf_tallocs[j]; + break; + } + } + + if (galloc->buf_tallocs[i] == NULL) { + size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); + galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + } } galloc->n_buffers = n_bufs; @@ -405,10 +416,30 @@ void 
ggml_gallocr_free(ggml_gallocr_t galloc) { for (int i = 0; i < galloc->n_buffers; i++) { if (galloc->buffers != NULL) { - ggml_backend_buffer_free(galloc->buffers[i]); + // skip if already freed + bool freed = false; + for (int j = 0; j < i; j++) { + if (galloc->buffers[j] == galloc->buffers[i]) { + freed = true; + break; + } + } + if (!freed) { + ggml_backend_buffer_free(galloc->buffers[i]); + } } if (galloc->buf_tallocs != NULL) { - ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + // skip if already freed + bool freed = false; + for (int j = 0; j < i; j++) { + if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { + freed = true; + break; + } + } + if (!freed) { + ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + } } } @@ -723,6 +754,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c // reallocate buffers if needed for (int i = 0; i < galloc->n_buffers; i++) { + // if the buffer type is used multiple times, we reuse the same buffer + for (int j = 0; j < i; j++) { + if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { + galloc->buffers[i] = galloc->buffers[j]; + break; + } + } + size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); @@ -731,6 +770,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c #ifndef NDEBUG fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif + ggml_backend_buffer_free(galloc->buffers[i]); galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); if (galloc->buffers[i] == NULL) { @@ -879,6 +919,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { if (galloc->buffers[buffer_id] == NULL) { return 0; } + + for (int i = 0; i < buffer_id; i++) { + if (galloc->buffers[i] == galloc->buffers[buffer_id]) { + // this buffer is the same as a previous one due to the same buffer type being used multiple times + // only return the buffer size the first time it appears to avoid double counting + return 0; + } + } + return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } diff --git a/ggml-blas.c b/ggml-blas.c index f826e7ab85c12..edb5474ddfebe 100644 --- a/ggml-blas.c +++ b/ggml-blas.c @@ -144,9 +144,6 @@ static void ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, s // GGML_ASSERT(nb1 <= nb2); // GGML_ASSERT(nb2 <= nb3); - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) // src0: (k,n) // src1: (k,m) From 2bfdb7fe4e6c71d919bef19324f100589af6bd2f Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 03:12:50 +0200 Subject: [PATCH 05/14] support multithreaded dequantization with std::async when openmp is not available --- CMakeLists.txt | 2 +- Makefile | 6 ++-- ggml-blas.c => ggml-blas.cpp | 66 ++++++++++++++++++++---------------- 3 files changed, 41 insertions(+), 33 deletions(-) rename ggml-blas.c => ggml-blas.cpp (84%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e5baa6a460a2..d8f7780f709a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -381,7 +381,7 @@ if (LLAMA_BLAS) endif() set(GGML_HEADERS_BLAS ggml-blas.h) - set(GGML_SOURCES_BLAS ggml-blas.c) + set(GGML_SOURCES_BLAS ggml-blas.cpp) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES 
${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) diff --git a/Makefile b/Makefile index 59dd8533666d3..e57fcd9adb239 100644 --- a/Makefile +++ b/Makefile @@ -448,9 +448,6 @@ ifdef LLAMA_RPC OBJS += ggml-rpc.o endif # LLAMA_RPC -ggml-blas.o: ggml-blas.c ggml-blas.h - $(CC) $(CFLAGS) -c $< -o $@ - ifdef LLAMA_CUBLAS # LLAMA_CUBLAS is deprecated and will be removed in the future LLAMA_CUDA := 1 @@ -752,6 +749,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ +ggml-blas.o: ggml-blas.cpp ggml-blas.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + unicode.o: unicode.cpp unicode.h $(CXX) $(CXXFLAGS) -c $< -o $@ diff --git a/ggml-blas.c b/ggml-blas.cpp similarity index 84% rename from ggml-blas.c rename to ggml-blas.cpp index edb5474ddfebe..2537a4a0f0526 100644 --- a/ggml-blas.c +++ b/ggml-blas.cpp @@ -1,7 +1,8 @@ #include "ggml-blas.h" #include "ggml-backend-impl.h" -#include <stdlib.h> +#include <future> +#include <vector> #if defined(GGML_USE_ACCELERATE) # include <Accelerate/Accelerate.h> @@ -13,7 +14,7 @@ struct ggml_backend_blas_context { int n_threads; - void * work_data; + char * work_data; size_t work_size; }; @@ -41,7 +42,7 @@ static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { return false; } -static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { +static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -74,15 +75,15 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); if (ctx->work_size < desired_wsize) { - free(ctx->work_data); - ctx->work_data = malloc(desired_wsize); - GGML_ASSERT(ctx->work_data != NULL); + delete[] ctx->work_data; + ctx->work_data = new char[desired_wsize]; ctx->work_size = desired_wsize; } void * wdata = ctx->work_data; // convert src0 to float if (type != GGML_TYPE_F32) { + std::vector<std::future<void>> tasks; ggml_to_float_t const to_float = type_traits.to_float; @@ -92,12 +93,26 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st #ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(ctx->n_threads) -#endif for (int64_t i01 = 0; i01 < ne01; i01++) { to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } +#else + for (int i = 0; i < ctx->n_threads; i++) { + tasks.push_back(std::async(std::launch::async, [=]() { + const int64_t start = i*ne01/ctx->n_threads; + const int64_t end = (i + 1)*ne01/ctx->n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } +#endif } } + // wait for all tasks to finish + for (auto & task : tasks) { + task.get(); + } } for (int64_t i13 = 0; i13 < ne13; i13++) { @@ -105,7 +120,7 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); @@ -122,7 +137,7 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st } } -static void 
ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { +static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -163,7 +178,7 @@ static void ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, s int k = src0->ne[1]; int m = src1->ne[0]; - int transposeA; + CBLAS_TRANSPOSE transposeA; int lda; if (!ggml_is_transposed(src1)) { @@ -192,10 +207,10 @@ GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { } GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { - struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context; - free(ctx->work_data); - free(ctx); - free(backend); + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + delete[] ctx->work_data; + delete ctx; + delete backend; } GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { @@ -205,7 +220,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer } GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context; + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; @@ -287,20 +302,13 @@ static ggml_guid_t ggml_backend_blas_guid(void) { } ggml_backend_t ggml_backend_blas_init(void) { - ggml_backend_t backend = malloc(sizeof(struct ggml_backend)); - if (backend == NULL) { - return NULL; - } - struct ggml_backend_blas_context * ctx = malloc(sizeof(struct ggml_backend_blas_context)); - if (ctx == NULL) { - return NULL; - } - - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->work_data = NULL; - ctx->work_size = 0; + ggml_backend_blas_context * ctx = new ggml_backend_blas_context{ + /* .n_threads = */ GGML_DEFAULT_N_THREADS, + /* .work_data = */ NULL, + /* .work_size = */ 0, + }; - *backend = (struct ggml_backend) { + ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_blas_guid(), /* .interface = */ blas_backend_i, /* .context = */ ctx, @@ -316,6 +324,6 @@ GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { GGML_ASSERT(ggml_backend_is_blas(backend_blas)); - struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend_blas->context; + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; ctx->n_threads = n_threads; } From 0425305d3249be9626b008d559d44b974300dea8 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 19:54:14 +0200 Subject: [PATCH 06/14] fix apple build --- CMakeLists.txt | 11 ++++++++--- Makefile | 2 +- ggml-alloc.c | 1 - ggml-blas.cpp | 12 ++++++++---- ggml.c | 2 +- llama.cpp | 8 ++++---- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d8f7780f709a6..5576c26e1cc5e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,8 +39,12 @@ endif() if (APPLE) set(LLAMA_METAL_DEFAULT ON) + set(LLAMA_BLAS_DEFAULT ON) + set(LLAMA_BLAS_VENDOR_DEFAULT "Apple") else() set(LLAMA_METAL_DEFAULT OFF) + set(LLAMA_BLAS_DEFAULT OFF) + set(LLAMA_BLAS_VENDOR_DEFAULT 
"Generic") endif() set(LLAMA_LLAMAFILE_DEFAULT ON) @@ -91,8 +95,9 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) -option(LLAMA_BLAS "llama: use BLAS" OFF) -set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") +option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT}) +set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING + "llama: BLAS library vendor") option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) option(LLAMA_CUDA "llama: use CUDA" OFF) option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) @@ -321,7 +326,7 @@ if (LLAMA_BLAS) if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - if ("${BLAS_INCLUDE_DIRS}" STREQUAL "") + if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple")) # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 find_package(PkgConfig REQUIRED) diff --git a/Makefile b/Makefile index e57fcd9adb239..adc9fa4347de7 100644 --- a/Makefile +++ b/Makefile @@ -404,7 +404,7 @@ ifndef LLAMA_NO_ACCELERATE # Mac OS - include Accelerate framework. # `-framework Accelerate` works both with Apple Silicon and Mac Intel ifeq ($(UNAME_S),Darwin) - MK_CPPFLAGS += -DGGML_USE_ACCELERATE + MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_LDFLAGS += -framework Accelerate diff --git a/ggml-alloc.c b/ggml-alloc.c index 8973ef81323e2..0048e5c922e79 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -706,7 +706,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - //node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); if (node->view_src || node->data) { node_alloc->dst.buffer_id = -1; node_alloc->dst.offset = SIZE_MAX; diff --git a/ggml-blas.cpp b/ggml-blas.cpp index 2537a4a0f0526..608ead1902a02 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -16,6 +16,7 @@ struct ggml_backend_blas_context { int n_threads; char * work_data; size_t work_size; + std::vector> tasks; }; // helper function to determine if it is better to use BLAS or not @@ -33,7 +34,7 @@ static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->type == GGML_TYPE_F32 && - ((src0->type == GGML_TYPE_F32) || (ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ return true; @@ -83,7 +84,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg // convert src0 to float if (type != GGML_TYPE_F32) { - std::vector> tasks; ggml_to_float_t const to_float = type_traits.to_float; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -98,7 +98,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg } #else for (int i = 0; i < ctx->n_threads; i++) { - tasks.push_back(std::async(std::launch::async, [=]() { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { const int64_t start = i*ne01/ctx->n_threads; const int64_t end = (i + 1)*ne01/ctx->n_threads; for (int64_t i01 = start; i01 < end; i01++) { @@ -109,10 +109,14 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg #endif } } + 
+#ifndef GGML_USE_OPENMP // wait for all tasks to finish - for (auto & task : tasks) { + for (auto & task : ctx->tasks) { task.get(); } + ctx->tasks.clear(); +#endif } for (int64_t i13 = 0; i13 < ne13; i13++) { diff --git a/ggml.c b/ggml.c index e4ef34f2565fb..01589c10e9cec 100644 --- a/ggml.c +++ b/ggml.c @@ -22645,7 +22645,7 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) return 1; #else return 0; diff --git a/llama.cpp b/llama.cpp index 57d007f3313a3..ec087a0a31173 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,7 +21,7 @@ # include "ggml-kompute.h" #endif -#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) +#ifdef GGML_USE_BLAS # include "ggml-blas.h" #endif @@ -2303,7 +2303,7 @@ struct llama_context { #ifdef GGML_USE_METAL ggml_backend_t backend_metal = nullptr; #endif -#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) +#ifdef GGML_USE_BLAS ggml_backend_t backend_blas = nullptr; #endif ggml_backend_t backend_cpu = nullptr; @@ -12025,7 +12025,7 @@ static void llama_graph_compute( ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } -#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) +#ifdef GGML_USE_BLAS if (lctx.backend_blas != nullptr) { ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads); } @@ -16240,7 +16240,7 @@ struct llama_context * llama_new_context_with_model( } #endif -#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) +#ifdef GGML_USE_BLAS ctx->backend_blas = ggml_backend_blas_init(); if (ctx->backend_blas == nullptr) { LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__); From 2dd049ed4527d093a63f8b44d4759dfb03863f0d Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 7 Jun 2024 02:18:27 +0200 Subject: [PATCH 07/14] ggml-ci --- ggml-blas.cpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/ggml-blas.cpp b/ggml-blas.cpp index 608ead1902a02..92d0e66372cee 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -13,10 +13,12 @@ #endif struct ggml_backend_blas_context { - int n_threads; - char * work_data; - size_t work_size; + int n_threads = GGML_DEFAULT_N_THREADS; + std::unique_ptr<char[]> work_data; + size_t work_size = 0; +#ifndef GGML_USE_OPENMP std::vector<std::future<void>> tasks; +#endif }; // helper function to determine if it is better to use BLAS or not @@ -76,11 +78,10 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg const size_t desired_wsize = type == GGML_TYPE_F32 ? 
0 : ne03*ne02*ne_plane*sizeof(float); if (ctx->work_size < desired_wsize) { - delete[] ctx->work_data; - ctx->work_data = new char[desired_wsize]; + ctx->work_data.reset(new char[desired_wsize]); ctx->work_size = desired_wsize; } - void * wdata = ctx->work_data; + void * wdata = ctx->work_data.get(); // convert src0 to float @@ -212,7 +213,6 @@ GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; - delete[] ctx->work_data; delete ctx; delete backend; } @@ -306,11 +306,7 @@ static ggml_guid_t ggml_backend_blas_guid(void) { } ggml_backend_t ggml_backend_blas_init(void) { - ggml_backend_blas_context * ctx = new ggml_backend_blas_context{ - /* .n_threads = */ GGML_DEFAULT_N_THREADS, - /* .work_data = */ NULL, - /* .work_size = */ 0, - }; + ggml_backend_blas_context * ctx = new ggml_backend_blas_context; ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_blas_guid(), From 63e06d0c28452b2618e620316f14ad06fd89ecd5 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 7 Jun 2024 04:35:11 +0200 Subject: [PATCH 08/14] reuse main thread --- ggml-blas.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ggml-blas.cpp b/ggml-blas.cpp index 92d0e66372cee..ade10b9ac586a 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -98,15 +98,23 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } #else - for (int i = 0; i < ctx->n_threads; i++) { + for (int i = 0; i < ctx->n_threads - 1; i++) { ctx->tasks.push_back(std::async(std::launch::async, [=]() { - const int64_t start = i*ne01/ctx->n_threads; + const int64_t start = i*ne01/ctx->n_threads; const int64_t end = (i + 1)*ne01/ctx->n_threads; for (int64_t i01 = start; i01 < end; i01++) { to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } })); } + { + // reuse the current thread for the last task + const int64_t start = (ctx->n_threads - 1)*ne01/ctx->n_threads; + const int64_t end = ne01; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + } #endif } } From a8a1bf798104568d0cdad7011fd40c3ae228467c Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 7 Jun 2024 19:29:35 +0200 Subject: [PATCH 09/14] set number of threads automatically for openblas and blis --- ggml-blas.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ggml-blas.cpp b/ggml-blas.cpp index ade10b9ac586a..3d146fc013913 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -10,6 +10,9 @@ # include <mkl.h> #else # include <cblas.h> +# ifdef BLIS_ENABLE_CBLAS +# include <blis.h> +# endif #endif struct ggml_backend_blas_context { @@ -128,6 +131,15 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg #endif } + +#if defined(OPENBLAS_VERSION) + openblas_set_num_threads(ctx->n_threads); +#endif + +#if defined(BLIS_ENABLE_CBLAS) + bli_thread_set_num_threads(ctx->n_threads); +#endif + for (int64_t i13 = 0; i13 < ne13; i13++) { for (int64_t i12 = 0; i12 < ne12; i12++) { const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; @@ -322,6 +334,16 @@ ggml_backend_t ggml_backend_blas_init(void) { /* .context = */ ctx, }; +#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) + if (openblas_get_parallel() != OPENBLAS_OPENMP) { + fprintf(stderr, "%s: warning: ggml is using 
OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + } +#endif + +#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) + fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); +#endif + return backend; } From ecb75b5f54cab6ca7f77ec51eb5f7d87c87be6cd Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 7 Jun 2024 20:00:38 +0200 Subject: [PATCH 10/14] sched : print assignments when GGML_SCHED_DEBUG env variable is set --- ggml-backend.c | 45 +++++++++++++++++---------------------------- ggml-blas.h | 1 + llama.cpp | 34 ++++++++++++---------------------- 3 files changed, 30 insertions(+), 50 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 2cc9e09a8385f..80e129cf843f8 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1078,6 +1078,8 @@ struct ggml_backend_sched { ggml_backend_sched_eval_callback callback_eval; void * callback_eval_user_data; + bool debug; + // align context_buffer to GGML_MEM_ALIGN #ifdef _MSC_VER __declspec(align(GGML_MEM_ALIGN)) @@ -1130,11 +1132,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED #define GET_CAUSE(node) "" #endif -//#define DEBUG_PASS1 -//#define DEBUG_PASS2 -//#define DEBUG_PASS3 -//#define DEBUG_PASS4 - // returns the backend that should be used for the node based on the current locations static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) { // TODO: use supports_op to check if the backend supports the op @@ -1232,7 +1229,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } } -static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { +static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.2"); @@ -1252,7 +1249,7 @@ static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node return cur_backend_id; } -static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) { +static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) { ggml_backend_buffer_t buf = t->view_src ? 
t->view_src->buffer : t->buffer; ggml_backend_buffer_type_t buft = NULL; @@ -1322,9 +1319,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } -#ifdef DEBUG_PASS1 - fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif // pass 2: expand current backend assignments // assign the same backend to adjacent nodes @@ -1350,7 +1344,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } else if (cur_backend_id != -1) { // FIXME: clean this - cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); if (cur_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; @@ -1375,7 +1369,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { - cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); if (cur_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; @@ -1395,7 +1389,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { - cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); } } } @@ -1411,15 +1405,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { - cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id); + cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); } } } -#ifdef DEBUG_PASS2 - fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif - // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1446,9 +1436,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } -#ifdef DEBUG_PASS3 - fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif // pass 4: split graph, find tensors that need to be copied { @@ -1499,7 +1486,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); int src_backend_id = sched->tensor_backend_id[id]; - bool supported = buffer_supported(sched, src, cur_backend_id); + bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) { //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; @@ -1560,7 +1547,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - bool supported = buffer_supported(sched, src, cur_backend_id); + bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); if 
(src_backend_id != cur_backend_id && !supported) { // create a copy of the input in the split's backend const size_t id = hash_id(src); @@ -1587,12 +1574,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split->i_end = graph->n_nodes; sched->n_splits = i_split + 1; } -#ifdef DEBUG_PASS4 - fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif - // create copies of the graph for each split - // TODO: avoid this copy + if (sched->debug) { + ggml_backend_sched_print_assignments(sched, graph); + } // swap node_backend_ids and leaf_backend_ids and prevs { @@ -1605,6 +1590,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->prev_leaf_backend_ids = tmp; } + // create copies of the graph for each split + // TODO: avoid this copy struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; @@ -1805,6 +1792,8 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); + sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; + // initialize hash table sched->hash_set = ggml_hash_set_new(graph_size); sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0])); diff --git a/ggml-blas.h b/ggml-blas.h index 646ca84ef8122..f2e37de06f609 100644 --- a/ggml-blas.h +++ b/ggml-blas.h @@ -14,6 +14,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); // number of threads used for conversion to float +// for openblas and blis, this will also set the number of threads used for blas operations GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); diff --git a/llama.cpp b/llama.cpp index 7e76c022b75d9..225ea977f4612 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11533,17 +11533,18 @@ static struct ggml_cgraph * llama_build_graph( // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - //const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; - //if (batch.n_tokens < 32 || full_offload) { - // if (il != -1 && strcmp(name, "norm") == 0) { - // for (auto * backend : lctx.backends) { - // if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { - // ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); - // break; - // } - // } - // } - //} + const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; + if (batch.n_tokens < 32 || full_offload) { + if (il != -1 && strcmp(name, "norm") == 0) { + for (auto * backend : lctx.backends) { + if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) && + (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) { + ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); + break; + } + } + } + } }; struct ggml_cgraph * result = NULL; @@ -12261,17 +12262,6 @@ static int llama_decode_internal( } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, 
the threads are spin-lock waiting for the BLAS calls and are degrading the performance - // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well - // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering - // with the BLAS calls. need a better solution - // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is - // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. - //if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { - // n_threads = std::min(4, n_threads); - //} - ggml_backend_sched_alloc_graph(lctx.sched, gf); llama_set_inputs(lctx, u_batch); From e06659811e118462b22ed5c16ed3106544d522d9 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 8 Jun 2024 17:00:26 +0200 Subject: [PATCH 11/14] fixes --- examples/llama-bench/llama-bench.cpp | 1 + ggml-backend.c | 56 +++++++++++++++++++++++----- ggml-blas.cpp | 36 ++++++++++-------- ggml.c | 18 ++++++--- 4 files changed, 80 insertions(+), 31 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 5c31548a6c25c..2a263d2a281b1 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.output_format = cmd_params_defaults.output_format; params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; for (int i = 1; i < argc; i++) { arg = argv[i]; diff --git a/ggml-backend.c b/ggml-backend.c index 80e129cf843f8..68094e0543a41 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1232,7 +1232,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.2"); + SET_CAUSE(node, "2.1"); } else { for (int b = 0; b < sched->n_backends; b++) { if (b == cur_backend_id) { @@ -1326,7 +1326,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops - // pass 2.2 expand gpu down + // expand gpu down { int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { @@ -1352,7 +1352,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } - // pass 2.1 expand gpu up + // expand gpu up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1377,7 +1377,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } - // pass 2.4 expand rest down + // expand rest down { int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { @@ -1393,7 +1393,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } - // pass 2.3 expand rest up + // expand rest up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1410,13 +1410,48 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - // pass 3: assign backends to remaining src from dst and view_src + // pass 3 + // upgrade nodes to 
higher prio backends with compatible buffer types + // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there + // however, we also need to verify that the sources are in compatible buffer types + // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph + // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same + // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU) + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + int * node_backend_id = &tensor_backend_id(node); + for (int b = 0; b < *node_backend_id; b++) { + if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) { + bool supported = true; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if (!ggml_backend_sched_buffer_supported(sched, src, b)) { + supported = false; + break; + } + } + if (supported) { + *node_backend_id = b; + SET_CAUSE(node, "3.upg"); + break; + } + } + } + } + + // pass 4: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * cur_backend_id = &tensor_backend_id(node); if (node->view_src != NULL && *cur_backend_id == -1) { *cur_backend_id = tensor_backend_id(node->view_src); - SET_CAUSE(node, "3.vsrc"); + SET_CAUSE(node, "4.vsrc"); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -1428,10 +1463,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (src->view_src != NULL) { // views are always on the same backend as the source *src_backend_id = tensor_backend_id(src->view_src); - SET_CAUSE(src, "3.vsrc"); + SET_CAUSE(src, "4.vsrc"); } else { *src_backend_id = *cur_backend_id; - SET_CAUSE(src, "3.cur"); + SET_CAUSE(src, "4.cur"); } } } @@ -1848,6 +1883,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { free(sched->tensor_copies); free(sched->node_backend_ids); free(sched->leaf_backend_ids); + free(sched->prev_node_backend_ids); + free(sched->prev_leaf_backend_ids); free(sched); } @@ -1944,6 +1981,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); tensor_backend_id(node) = backend_index; + SET_CAUSE(node, "usr"); } ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { diff --git a/ggml-blas.cpp b/ggml-blas.cpp index 3d146fc013913..089c73dd34060 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -56,8 +56,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg const enum ggml_type type = src0->type; - ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); - GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); GGML_ASSERT(ne2 == ne12); @@ -88,32 +86,39 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg // convert src0 to float if (type != GGML_TYPE_F32) { + ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); ggml_to_float_t const to_float = 
type_traits.to_float; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; + float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); + const int n_threads = std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)); #ifdef GGML_USE_OPENMP - #pragma omp parallel for num_threads(ctx->n_threads) + #pragma omp parallel for num_threads(n_threads) for (int64_t i01 = 0; i01 < ne01; i01++) { to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } #else - for (int i = 0; i < ctx->n_threads - 1; i++) { - ctx->tasks.push_back(std::async(std::launch::async, [=]() { - const int64_t start = i*ne01/ctx->n_threads; - const int64_t end = (i + 1)*ne01/ctx->n_threads; - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } - })); + for (int i = 1; i < n_threads; i++) { + const int64_t start = i*ne01/n_threads; + const int64_t end = (i + 1)*ne01/n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } } { // reuse the current thread for the last task - const int64_t start = (ctx->n_threads - 1)*ne01/ctx->n_threads; - const int64_t end = ne01; + const int64_t start = 0; + const int64_t end = ne01/n_threads; for (int64_t i01 = start; i01 < end; i01++) { to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } @@ -131,7 +136,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg #endif } - #if defined(OPENBLAS_VERSION) openblas_set_num_threads(ctx->n_threads); #endif @@ -150,7 +154,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); if (type != GGML_TYPE_F32) { - x = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane; + x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; } cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, diff --git a/ggml.c b/ggml.c index a5d143b5cb13a..e372fee932b4a 100644 --- a/ggml.c +++ b/ggml.c @@ -18749,6 +18749,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: + case GGML_OP_CONT: case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_ACC: @@ -18833,7 +18834,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ } break; case GGML_OP_SCALE: case GGML_OP_SET: - case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: @@ -18993,8 +18993,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput sched_yield(); } - * node_n = atomic_load(&state->shared->node_n); - if (* node_n != last_node_n) break; + *node_n = atomic_load(&state->shared->node_n); + if (*node_n != last_node_n) { + break; + } + #if defined(__SSE3__) // Tell the processor we're spinning. It's a processor hint for spinlocks. 
_mm_pause(); @@ -19004,15 +19007,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) { // wait for other threads to finish - const int last_task_phase = * task_phase; + const int last_task_phase = *task_phase; while (true) { if (do_yield) { sched_yield(); } - * task_phase = atomic_load(&state->shared->node_task); - if (* task_phase != last_task_phase) break; + *task_phase = atomic_load(&state->shared->node_task); + if (*task_phase != last_task_phase) { + break; + } + #if defined(__SSE3__) // Tell the processor we're spinning. It's a processor hint for spinlocks. _mm_pause(); From a54b791211823f0c0cbf74aa317c09e501440967 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 12 Jun 2024 10:32:20 +0200 Subject: [PATCH 12/14] Apply suggestions from code review Co-authored-by: Georgi Gerganov --- ggml-blas.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-blas.cpp b/ggml-blas.cpp index 089c73dd34060..d709a357bbf29 100644 --- a/ggml-blas.cpp +++ b/ggml-blas.cpp @@ -96,7 +96,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg const int min_cols_per_thread = 4096; const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); - const int n_threads = std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); #ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(n_threads) @@ -116,7 +116,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg } } { - // reuse the current thread for the last task + // reuse the current thread for the first task const int64_t start = 0; const int64_t end = ne01/n_threads; for (int64_t i01 = start; i01 < end; i01++) { From ae9cd856980696e26e1d7f8df3737572ea304927 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 13 Jun 2024 02:04:06 +0200 Subject: [PATCH 13/14] fix metal being used in layers not offloaded --- ggml-backend.c | 112 ++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 53 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 68094e0543a41..21636a6ae5061 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1229,27 +1229,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } } -static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { - if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.1"); - } else { - for (int b = 0; b < sched->n_backends; b++) { - if (b == cur_backend_id) { - continue; - } - if (ggml_backend_supports_op(sched->backends[b], node)) { - *node_backend_id = b; - cur_backend_id = b; - SET_CAUSE(node, "2.2"); - break; - } - } - } - return cur_backend_id; -} - -static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) { +static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) { ggml_backend_buffer_t buf = t->view_src ? 
t->view_src->buffer : t->buffer; ggml_backend_buffer_type_t buft = NULL; @@ -1259,12 +1239,22 @@ static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, stru } else { // see if the tensor already has a backend assigned, and use the buffer type of that backend int tensor_backend_id = tensor_backend_id(t); + if (tensor_backend_id == -1 && t->view_src) { + tensor_backend_id = tensor_backend_id(t->view_src); + } if (tensor_backend_id != -1) { buft = sched->bufts[tensor_backend_id]; } } - return buft != NULL && ggml_backend_supports_buft(sched->backends[cur_backend_id], buft); + return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft); +} + +static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { + if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { + *node_backend_id = cur_backend_id; + SET_CAUSE(node, "2.sup"); + } } // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend @@ -1324,8 +1314,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // assign the same backend to adjacent nodes // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend) // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops - - + // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known // expand gpu down { int cur_backend_id = -1; @@ -1343,12 +1332,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { - // FIXME: clean this - cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); - if (cur_backend_id == sched->n_backends - 1) { - // skip cpu (lowest prio backend) - cur_backend_id = -1; - } + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } @@ -1369,11 +1353,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { - cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); - if (cur_backend_id == sched->n_backends - 1) { - // skip cpu (lowest prio backend) - cur_backend_id = -1; - } + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } @@ -1389,7 +1369,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { - cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } @@ -1405,41 +1385,67 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { - cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id); + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } - // pass 3 - // upgrade nodes to higher prio backends with compatible buffer types + // pass 3: upgrade nodes to higher prio backends with 
compatible buffer types // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there // however, we also need to verify that the sources are in compatible buffer types // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU) + // additionally, set remaining unassigned nodes to the backend with the most supported inputs + // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } int * node_backend_id = &tensor_backend_id(node); - for (int b = 0; b < *node_backend_id; b++) { - if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) { - bool supported = true; - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; + if (*node_backend_id == -1) { + // unassigned node: find the backend with the most supported inputs + int n_supported_best = -1; + for (int b = 0; b < sched->n_backends; b++) { + if (ggml_backend_supports_op(sched->backends[b], node)) { + int n_supported = 0; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) { + n_supported++; + } } - if (!ggml_backend_sched_buffer_supported(sched, src, b)) { - supported = false; - break; + if (n_supported > n_supported_best) { + n_supported_best = n_supported; + *node_backend_id = b; + SET_CAUSE(node, "3.best"); } } - if (supported) { - *node_backend_id = b; - SET_CAUSE(node, "3.upg"); - break; + } + } else { + // assigned node: upgrade to higher prio backend if possible + for (int b = 0; b < *node_backend_id; b++) { + if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) { + bool supported = true; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if (!ggml_backend_sched_buffer_supported(sched, src, b)) { + supported = false; + break; + } + } + if (supported) { + *node_backend_id = b; + SET_CAUSE(node, "3.upg"); + break; + } } } } From 211fb045f1c9cd4c949389817624ec510c0fcefd Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 13 Jun 2024 02:38:36 +0200 Subject: [PATCH 14/14] sched : allow ops with weights on an incompatible buffer type This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment. 
ggml-ci
---
 ggml-backend.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 21636a6ae5061..2bec7bea38a85 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1116,9 +1116,10 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
         }
     }
 
-    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
-            __func__, ggml_backend_buffer_name(buffer), tensor->name);
-    GGML_ASSERT(false);
+#ifndef NDEBUG
+    fprintf(stderr, "%s: warning: no backend supports op %s with a weight of buffer type %s used in tensor %s; the weight will need to be copied\n",
+            __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
+#endif
 
     return -1;
 }
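
Pass 3 in patches 11 and 13 upgrades an already-assigned node to a higher-priority backend only when that backend uses the same buffer type and supports both the op and the buffer of every source. A condensed sketch of just that predicate, with function-pointer stand-ins for the scheduler internals (the struct and helper names are illustrative, not part of the patches):

    // Condensed sketch of the pass-3 upgrade predicate. Backend ids are ordered
    // by priority, so b < cur means b has higher priority than cur.
    struct upgrade_ctx {
        bool (*same_buft)(int a, int b);                             // sched->bufts[a] == sched->bufts[b]
        bool (*supports_op)(int backend, const void * node);         // ggml_backend_supports_op
        bool (*supports_src_buffer)(int backend, const void * src);  // ggml_backend_sched_buffer_supported
    };

    static bool can_upgrade(const upgrade_ctx * ctx, const void * node,
                            const void * const * srcs, int n_srcs, int b, int cur) {
        if (b >= cur || !ctx->same_buft(b, cur) || !ctx->supports_op(b, node)) {
            return false;
        }
        for (int j = 0; j < n_srcs; j++) {
            if (srcs[j] != nullptr && !ctx->supports_src_buffer(b, srcs[j])) {
                return false;   // a source lives in a buffer this backend cannot use
            }
        }
        return true;   // same buffer type, op supported, all sources reachable
    }

The same-buffer-type requirement is the deliberately strict stand-in the pass-3 comments describe: checking every downstream user of the tensor would be exact but slow, and host-memory backends such as BLAS and CPU share a buffer type anyway, so the strict check still fires in the common case.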
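
The conversion-thread heuristic that patch 11 adds to ggml_backend_blas_mul_mat, and that the review fix in patch 12 clamps, is easiest to read as standalone arithmetic: a thread is only worth spawning once it has roughly 4096 elements of dequantization work. A minimal sketch of that computation (the helper name and signature are illustrative):

    #include <algorithm>
    #include <cstdint>

    // Sketch of the heuristic from patches 11/12: give each conversion thread at
    // least ~4096 elements (min_cols_per_thread) of work, and never use fewer
    // than 1 thread. ne00/ne01 are src0's row length and row count; max_threads
    // is the configured ctx->n_threads.
    static int blas_conv_n_threads(int64_t ne00, int64_t ne01, int max_threads) {
        const int min_cols_per_thread = 4096;
        const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
        return std::max(std::min(max_threads, (int)(ne01/min_rows_per_thread)), 1);
    }

For a small matrix, say ne00 = 64 and ne01 = 32 with 8 configured threads, min_rows_per_thread is 64 and ne01/min_rows_per_thread is 0; without the outer std::max added in patch 12 this yields n_threads == 0, which divides by zero in the slice bounds, while with it the whole conversion simply runs on the calling thread.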
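
The non-OpenMP path in the same hunk splits the rows so that std::async receives slices 1..n-1 while the calling thread takes slice 0 (the comment patch 12 corrects from "last task" to "first task"). A self-contained sketch of that pattern, with a hypothetical parallel_rows helper standing in for the inlined loop:

    #include <cstdint>
    #include <future>
    #include <vector>

    // Sketch of the ggml-blas.cpp work split: slices 1..n_threads-1 are pushed
    // as std::async tasks, slice 0 runs on the calling thread, so n_threads == 1
    // spawns no tasks at all. Empty slices (start == end) are skipped.
    template <typename F>
    static void parallel_rows(int64_t n_rows, int n_threads,
                              std::vector<std::future<void>> & tasks, F work) {
        for (int i = 1; i < n_threads; i++) {
            const int64_t start = i*n_rows/n_threads;
            const int64_t end   = (i + 1)*n_rows/n_threads;
            if (start < end) {
                tasks.push_back(std::async(std::launch::async, [=]() {
                    for (int64_t r = start; r < end; r++) { work(r); }
                }));
            }
        }
        // reuse the current thread for the first slice
        for (int64_t r = 0; r < n_rows/n_threads; r++) { work(r); }
    }

The futures accumulated in tasks still have to be waited on before the converted data is read; the patch does this by getting every pending future ahead of the sgemm calls.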
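
Patch 11 also changes the scratch-plane index in two places, from i03*ne12*ne_plane + i02*ne_plane to i02*ne_plane + i03*ne02*ne_plane. As far as the surrounding code shows, wdata holds the float-converted planes of src0, so the i03 stride must be src0's ne02 rather than src1's ne12; the two differ whenever src1 broadcasts across src0's batch dimensions. A sketch of the corrected addressing (helper name illustrative):

    #include <cstdint>

    // Sketch of the corrected wdata addressing from patch 11: wdata is a dense
    // array of ne02*ne03 float planes of src0, each ne_plane elements long.
    static inline float * wdata_plane(float * wdata, int64_t i02, int64_t i03,
                                      int64_t ne02, int64_t ne_plane) {
        // old (wrong) stride used src1's ne12: wdata + i03*ne12*ne_plane + i02*ne_plane
        return wdata + i02*ne_plane + i03*ne02*ne_plane;
    }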
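
The cleanups in ggml.c's sync loops keep the same spin-wait shape: yield if asked, reload the shared counter, and issue a pause hint while spinning. For reference, a minimal standalone version of that loop under the same assumptions as the patch (POSIX sched_yield, SSE3 _mm_pause), with a std::atomic standing in for state->shared->node_n:

    #include <atomic>
    #include <sched.h>
    #if defined(__SSE3__)
    #include <immintrin.h>
    #endif

    // Minimal sketch of the sync-node spin-wait: returns the new value once the
    // shared counter moves away from 'last'.
    static int spin_wait_for_change(std::atomic<int> & shared, int last, bool do_yield) {
        while (true) {
            if (do_yield) {
                sched_yield();   // cede the timeslice when threads are oversubscribed
            }
            const int cur = shared.load();
            if (cur != last) {
                return cur;
            }
    #if defined(__SSE3__)
            _mm_pause();         // processor hint: we are spinning on a lock
    #endif
        }
    }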