From 84471ffc73b6aa0a49736b16823a291df8e77501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Hallerb=C3=A4ck?= Date: Tue, 12 Sep 2023 23:28:04 +0200 Subject: [PATCH 01/11] metal : reusing llama.cpp logging --- Makefile | 6 ++-- ggml-metal.m | 85 ++++++++++++++++++++++++++-------------------------- llama.cpp | 30 +++---------------- llama.h | 30 +++++++++++++++++++ 4 files changed, 80 insertions(+), 71 deletions(-) diff --git a/Makefile b/Makefile index a774dc50f372d..913d0d80a1851 100644 --- a/Makefile +++ b/Makefile @@ -427,7 +427,7 @@ endif endif # LLAMA_METAL ifdef LLAMA_METAL -ggml-metal.o: ggml-metal.m ggml-metal.h +ggml-metal.o: ggml-metal.m ggml-metal.h llama.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_METAL @@ -551,7 +551,7 @@ speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o co $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ifdef LLAMA_METAL -metal: examples/metal/metal.cpp ggml.o $(OBJS) +metal: examples/metal/metal.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) endif @@ -573,7 +573,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ./$@ -vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) +vdot: pocs/vdot/vdot.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS) diff --git a/ggml-metal.m b/ggml-metal.m index 4f3f14e24d9d8..22c267d8dc71e 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -11,11 +11,12 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -// TODO: temporary - reuse llama.cpp logging #ifdef GGML_METAL_NDEBUG -#define metal_printf(...) +#define LLAMA_LOG_INFO(...) +#define LLAMA_LOG_WARN(...) +#define LLAMA_LOG_ERROR(...) #else -#define metal_printf(...) 
fprintf(stderr, __VA_ARGS__) +#import "llama.h" #endif #define UNUSED(x) (void)(x) @@ -118,7 +119,7 @@ @implementation GGMLMetalClass @end struct ggml_metal_context * ggml_metal_init(int n_cb) { - metal_printf("%s: allocating\n", __func__); + LLAMA_LOG_INFO("%s: allocating\n", __func__); id device; NSString * s; @@ -128,14 +129,14 @@ @implementation GGMLMetalClass NSArray * devices = MTLCopyAllDevices(); for (device in devices) { s = [device name]; - metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); + LLAMA_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]); } #endif // Pick and show default Metal device device = MTLCreateSystemDefaultDevice(); s = [device name]; - metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]); + LLAMA_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]); // Configure context struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); @@ -162,7 +163,7 @@ @implementation GGMLMetalClass ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + LLAMA_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -176,11 +177,11 @@ @implementation GGMLMetalClass //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); + LLAMA_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + LLAMA_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } @@ 
-192,7 +193,7 @@ @implementation GGMLMetalClass ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; #endif if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + LLAMA_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -204,11 +205,11 @@ @implementation GGMLMetalClass #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + LLAMA_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ if (error) { \ - metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + LLAMA_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -264,13 +265,13 @@ @implementation GGMLMetalClass #undef GGML_METAL_ADD_KERNEL } - metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + LLAMA_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); #if TARGET_OS_OSX - metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.maxTransferRate != 0) { - metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { - metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); + LLAMA_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); } #endif @@ -278,7 +279,7 @@ @implementation GGMLMetalClass } void ggml_metal_free(struct ggml_metal_context * ctx) { - metal_printf("%s: deallocating\n", __func__); + LLAMA_LOG_INFO("%s: deallocating\n", __func__); #define GGML_METAL_DEL_KERNEL(name) \ [ctx->function_##name release]; \ [ctx->pipeline_##name release]; @@ -350,7 +351,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { void * data = NULL; const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { - metal_printf("%s: error: posix_memalign failed\n", __func__); + LLAMA_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); return NULL; } @@ -378,7 +379,7 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { // Metal buffer based on the host memory pointer // static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { - //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //LLAMA_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -389,13 +390,13 @@ int 
ggml_metal_if_optimized(struct ggml_metal_context * ctx) { if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; - //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //LLAMA_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); return ctx->buffers[i].metal; } } - metal_printf("%s: error: buffer is nil\n", __func__); + LLAMA_LOG_ERROR("%s: error: buffer is nil\n", __func__); return nil; } @@ -407,7 +408,7 @@ bool ggml_metal_add_buffer( size_t size, size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - metal_printf("%s: too many buffers\n", __func__); + LLAMA_LOG_ERROR("%s: error: too many buffers\n", __func__); return false; } @@ -417,7 +418,7 @@ bool ggml_metal_add_buffer( const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + LLAMA_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); return false; } } @@ -438,11 +439,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + LLAMA_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -462,13 +463,13 @@ 
bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + LLAMA_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + LLAMA_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { - metal_printf("\n"); + LLAMA_LOG_INFO("\n"); } ++ctx->n_buffers; @@ -476,17 +477,17 @@ bool ggml_metal_add_buffer( } #if TARGET_OS_OSX - metal_printf(", (%8.2f / %8.2f)", + LLAMA_LOG_INFO(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - metal_printf(", warning: current allocated size is greater than the recommended max working set size\n"); + LLAMA_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); } else { - metal_printf("\n"); + LLAMA_LOG_INFO("\n"); } #else - metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); + LLAMA_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); #endif } @@ -599,7 +600,7 @@ void ggml_metal_graph_find_concurrency( } if (ctx->concur_list_len > GGML_MAX_CONCUR) { - metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__); + LLAMA_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__); } } 
@@ -653,7 +654,7 @@ void ggml_metal_graph_compute( continue; } - //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //LLAMA_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -697,17 +698,17 @@ void ggml_metal_graph_compute( id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; - //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //LLAMA_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //if (src0) { - // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // LLAMA_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, // ggml_is_contiguous(src0), src0->name); //} //if (src1) { - // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // LLAMA_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, // ggml_is_contiguous(src1), src1->name); //} //if (dst) { - // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // LLAMA_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, // dst->name); //} @@ -813,7 +814,7 @@ void ggml_metal_graph_compute( } break; default: { - metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + LLAMA_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } break; @@ -993,7 +994,7 @@ void ggml_metal_graph_compute( } break; default: { - metal_printf("Asserting on type %d\n",(int)src0t); + 
LLAMA_LOG_ERROR("Asserting on type %d\n",(int)src0t); GGML_ASSERT(false && "not implemented"); } }; @@ -1234,7 +1235,7 @@ void ggml_metal_graph_compute( } break; default: { - metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + LLAMA_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } @@ -1259,7 +1260,7 @@ void ggml_metal_graph_compute( MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { - metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status); + LLAMA_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_ASSERT(false); } } diff --git a/llama.cpp b/llama.cpp index 2a2a0c9c63cef..552e655ef4d5b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -76,28 +76,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) -#endif - -// -// logging -// - -LLAMA_ATTRIBUTE_FORMAT(2, 3) -static void llama_log_internal (llama_log_level level, const char* format, ...); -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data); - -#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) -#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) -#define LLAMA_LOG_ERROR(...) 
llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) - // // helpers // @@ -6366,7 +6344,7 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) { g_state.log_callback_user_data = user_data; } -static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) { +void llama_log_v(llama_log_level level, const char * format, va_list args) { va_list args_copy; va_copy(args_copy, args); char buffer[128]; @@ -6383,14 +6361,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_ va_end(args_copy); } -static void llama_log_internal(llama_log_level level, const char * format, ...) { +void llama_log(llama_log_level level, const char * format, ...) { va_list args; va_start(args, format); - llama_log_internal_v(level, format, args); + llama_log_v(level, format, args); va_end(args); } -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { +void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { (void) level; (void) user_data; fputs(text, stderr); diff --git a/llama.h b/llama.h index 37975bebed22e..6ab779c059493 100644 --- a/llama.h +++ b/llama.h @@ -532,6 +532,36 @@ extern "C" { } #endif +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) +#endif + +// +// logging +// + +#ifdef __cplusplus +extern "C" { +#endif + +LLAMA_ATTRIBUTE_FORMAT(2, 3) +void llama_log (enum llama_log_level level, const char* format, ...); +void llama_log_callback_default(enum llama_log_level level, const char * text, void * user_data); + +#ifdef __cplusplus +} +#endif + +#define LLAMA_LOG_INFO(...) llama_log(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) +#define LLAMA_LOG_WARN(...) 
llama_log(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) +#define LLAMA_LOG_ERROR(...) llama_log(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) + // Internal API to be implemented by llama.cpp and used by tests/benchmarks only #ifdef LLAMA_API_INTERNAL From 6ff3f2ee2d3ead37809f051c38e01d6b00453110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Hallerb=C3=A4ck?= Date: Wed, 13 Sep 2023 08:01:57 +0200 Subject: [PATCH 02/11] cmake : build fix --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 537eadc27b913..deadba63190ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -615,6 +615,7 @@ add_library(ggml OBJECT ggml.c ggml.h ggml-alloc.c + llama.cpp ggml-alloc.h ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} From 4f0e09598fe49f30018eada1241b284ac012263e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Hallerb=C3=A4ck?= Date: Wed, 13 Sep 2023 15:45:27 +0200 Subject: [PATCH 03/11] metal : logging callback --- CMakeLists.txt | 1 - Makefile | 4 +- ggml-metal.h | 4 ++ ggml-metal.m | 113 +++++++++++++++++++++++++++++++------------------ llama.cpp | 34 +++++++++++++-- llama.h | 30 ------------- 6 files changed, 107 insertions(+), 79 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index deadba63190ba..537eadc27b913 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -615,7 +615,6 @@ add_library(ggml OBJECT ggml.c ggml.h ggml-alloc.c - llama.cpp ggml-alloc.h ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} diff --git a/Makefile b/Makefile index 913d0d80a1851..320fc04ca0d39 100644 --- a/Makefile +++ b/Makefile @@ -551,7 +551,7 @@ speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o co $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ifdef LLAMA_METAL -metal: examples/metal/metal.cpp ggml.o llama.o $(OBJS) +metal: examples/metal/metal.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) endif @@ -573,7 +573,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(CXX) 
$(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ./$@ -vdot: pocs/vdot/vdot.cpp ggml.o llama.o $(OBJS) +vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS) diff --git a/ggml-metal.h b/ggml-metal.h index fca28d37ef970..cd45318024da1 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -22,6 +22,8 @@ #include #include +#include "llama.h" + // max memory buffers that can be mapped to the device #define GGML_METAL_MAX_BUFFERS 16 #define GGML_METAL_MAX_COMMAND_BUFFERS 32 @@ -33,6 +35,8 @@ struct ggml_cgraph; extern "C" { #endif +void ggml_metal_log_set_callback(void (*log_callback)(enum llama_log_level level, const char * text, void * user_data), void * user_data); + struct ggml_metal_context; // number of command buffers to use diff --git a/ggml-metal.m b/ggml-metal.m index 22c267d8dc71e..8530df7924a8b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -12,9 +12,9 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) #ifdef GGML_METAL_NDEBUG -#define LLAMA_LOG_INFO(...) -#define LLAMA_LOG_WARN(...) -#define LLAMA_LOG_ERROR(...) +#define ggml_metal_log_info(...) +#define ggml_metal_log_warn(...) +#define ggml_metal_log_error(...) #else #import "llama.h" #endif @@ -118,8 +118,37 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end +void (*ggml_metal_log_callback)(enum llama_log_level level, const char * text, void * user_data) = NULL; +void *ggml_metal_log_user_data = NULL; + +void ggml_metal_log_set_callback(void (*log_callback)(enum llama_log_level level, const char * text, void * user_data), void * user_data) { + ggml_metal_log_callback = log_callback; + ggml_metal_log_user_data = user_data; +} + +static void ggml_metal_log(enum llama_log_level level, const char* format, ...) 
{ + if ( ggml_metal_log_callback != NULL ) { + va_list arg; + va_start(arg, format); + char text[256]; vsnprintf(text, sizeof(text), format, arg); + ggml_metal_log_callback(level, text, ggml_metal_log_user_data); + va_end(arg); + } +} + +#ifdef GGML_METAL_NDEBUG +#define ggml_metal_log_info(...) +#define ggml_metal_log_warn(...) +#define ggml_metal_log_error(...) +#else +#define ggml_metal_log_info(...) ggml_metal_log(LLAMA_LOG_LEVEL_INFO, __VA_ARGS__) +#define ggml_metal_log_warn(...) ggml_metal_log(LLAMA_LOG_LEVEL_WARN, __VA_ARGS__) +#define ggml_metal_log_error(...) ggml_metal_log(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) +#endif + + struct ggml_metal_context * ggml_metal_init(int n_cb) { - LLAMA_LOG_INFO("%s: allocating\n", __func__); + ggml_metal_log_info("%s: allocating\n", __func__); id device; NSString * s; @@ -129,14 +158,14 @@ @implementation GGMLMetalClass NSArray * devices = MTLCopyAllDevices(); for (device in devices) { s = [device name]; - LLAMA_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]); + ggml_metal_log_info("%s: found device: %s\n", __func__, [s UTF8String]); } #endif // Pick and show default Metal device device = MTLCreateSystemDefaultDevice(); s = [device name]; - LLAMA_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]); + ggml_metal_log_info("%s: picking default device: %s\n", __func__, [s UTF8String]); // Configure context struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); @@ -163,7 +192,7 @@ @implementation GGMLMetalClass ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; if (error) { - LLAMA_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + ggml_metal_log_error("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -177,11 +206,11 @@ @implementation GGMLMetalClass //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass
class]]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - LLAMA_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]); + ggml_metal_log_info("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { - LLAMA_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + ggml_metal_log_error("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } @@ -193,7 +222,7 @@ @implementation GGMLMetalClass ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; #endif if (error) { - LLAMA_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + ggml_metal_log_error("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -205,11 +234,11 @@ @implementation GGMLMetalClass #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - LLAMA_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + ggml_metal_log_info("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ if (error) { \ - LLAMA_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + ggml_metal_log_error("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -265,13 +294,13 @@ @implementation GGMLMetalClass #undef GGML_METAL_ADD_KERNEL } - LLAMA_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); + ggml_metal_log_info("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); #if TARGET_OS_OSX - LLAMA_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + ggml_metal_log_info("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.maxTransferRate != 0) { - LLAMA_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + ggml_metal_log_info("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { - LLAMA_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); + ggml_metal_log_info("%s: maxTransferRate = built-in GPU\n", __func__); } #endif @@ -279,7 +308,7 @@ @implementation GGMLMetalClass } void ggml_metal_free(struct ggml_metal_context * ctx) { - LLAMA_LOG_INFO("%s: deallocating\n", __func__); + ggml_metal_log_info("%s: deallocating\n", __func__); #define GGML_METAL_DEL_KERNEL(name) \ [ctx->function_##name release]; \ [ctx->pipeline_##name release]; @@ -351,7 +380,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { void * data = NULL; const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { - LLAMA_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); + ggml_metal_log_error("%s: error: posix_memalign failed\n", __func__); return NULL; } @@ -379,7 +408,7 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { // Metal buffer based on the host memory pointer // static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { - //LLAMA_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //ggml_metal_log_info("%s: data tensor '%16s', offs_data = %8ld, offs_eval = 
%8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -390,13 +419,13 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; - //LLAMA_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //ggml_metal_log_info("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); return ctx->buffers[i].metal; } } - LLAMA_LOG_ERROR("%s: error: buffer is nil\n", __func__); + ggml_metal_log_error("%s: error: buffer is nil\n", __func__); return nil; } @@ -408,7 +437,7 @@ bool ggml_metal_add_buffer( size_t size, size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - LLAMA_LOG_ERROR("%s: error: too many buffers\n", __func__); + ggml_metal_log_error("%s: error: too many buffers\n", __func__); return false; } @@ -418,7 +447,7 @@ bool ggml_metal_add_buffer( const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - LLAMA_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + ggml_metal_log_error("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); return false; } } @@ -439,11 +468,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - LLAMA_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + ggml_metal_log_error("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - LLAMA_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, 
size_aligned / 1024.0 / 1024.0); + ggml_metal_log_info("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -463,13 +492,13 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - LLAMA_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + ggml_metal_log_error("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - LLAMA_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + ggml_metal_log_info("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { - LLAMA_LOG_INFO("\n"); + ggml_metal_log_info("\n"); } ++ctx->n_buffers; @@ -477,17 +506,17 @@ bool ggml_metal_add_buffer( } #if TARGET_OS_OSX - LLAMA_LOG_INFO(", (%8.2f / %8.2f)", + ggml_metal_log_info(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - LLAMA_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + ggml_metal_log_warn("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); } else { - LLAMA_LOG_INFO("\n"); + ggml_metal_log_info("\n"); } #else - LLAMA_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); + ggml_metal_log_info(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); #endif } @@ 
-600,7 +629,7 @@ void ggml_metal_graph_find_concurrency( } if (ctx->concur_list_len > GGML_MAX_CONCUR) { - LLAMA_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__); + ggml_metal_log_warn("%s: too many elements for metal ctx->concur_list!\n", __func__); } } @@ -654,7 +683,7 @@ void ggml_metal_graph_compute( continue; } - //LLAMA_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //ggml_metal_log_info("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -698,17 +727,17 @@ void ggml_metal_graph_compute( id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; - //LLAMA_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //ggml_metal_log_info("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //if (src0) { - // LLAMA_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // ggml_metal_log_info("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, // ggml_is_contiguous(src0), src0->name); //} //if (src1) { - // LLAMA_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // ggml_metal_log_info("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, // ggml_is_contiguous(src1), src1->name); //} //if (dst) { - // LLAMA_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // ggml_metal_log_info("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, // dst->name); //} @@ -814,7 +843,7 @@ void ggml_metal_graph_compute( } break; default: { - LLAMA_LOG_WARN("%s: node %3d, op = %8s not 
implemented\n", __func__, i, ggml_op_name(dst->op)); + ggml_metal_log_warn("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } break; @@ -994,7 +1023,7 @@ void ggml_metal_graph_compute( } break; default: { - LLAMA_LOG_ERROR("Asserting on type %d\n",(int)src0t); + ggml_metal_log_error("Asserting on type %d\n",(int)src0t); GGML_ASSERT(false && "not implemented"); } }; @@ -1235,7 +1264,7 @@ void ggml_metal_graph_compute( } break; default: { - LLAMA_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + ggml_metal_log_error("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } @@ -1260,7 +1289,7 @@ void ggml_metal_graph_compute( MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { - LLAMA_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + ggml_metal_log_info("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_ASSERT(false); } } diff --git a/llama.cpp b/llama.cpp index 552e655ef4d5b..9779dcb3e59c1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -76,6 +76,31 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) +#endif + +// +// logging +// + +LLAMA_ATTRIBUTE_FORMAT(2, 3) +static void llama_log_internal (enum llama_log_level level, const char* format, ...); +static void llama_log_callback_default(enum llama_log_level level, const char * text, void * user_data); + +#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) +#define LLAMA_LOG_WARN(...) 
llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) +#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) + + + + // // helpers // @@ -5510,6 +5535,7 @@ struct llama_context * llama_new_context_with_model( llama_free(ctx); return NULL; } + ggml_metal_log_set_callback(llama_log_callback_default, NULL); ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); } @@ -6344,7 +6370,7 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) { g_state.log_callback_user_data = user_data; } -void llama_log_v(llama_log_level level, const char * format, va_list args) { +static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) { va_list args_copy; va_copy(args_copy, args); char buffer[128]; @@ -6361,14 +6387,14 @@ void llama_log_v(llama_log_level level, const char * format, va_list args) { va_end(args_copy); } -void llama_log(llama_log_level level, const char * format, ...) { +static void llama_log_internal(llama_log_level level, const char * format, ...) { va_list args; va_start(args, format); - llama_log_v(level, format, args); + llama_log_internal_v(level, format, args); va_end(args); } -void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { +static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { (void) level; (void) user_data; fputs(text, stderr); diff --git a/llama.h b/llama.h index 6ab779c059493..37975bebed22e 100644 --- a/llama.h +++ b/llama.h @@ -532,36 +532,6 @@ extern "C" { } #endif -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) 
-#endif - -// -// logging -// - -#ifdef __cplusplus -extern "C" { -#endif - -LLAMA_ATTRIBUTE_FORMAT(2, 3) -void llama_log (enum llama_log_level level, const char* format, ...); -void llama_log_callback_default(enum llama_log_level level, const char * text, void * user_data); - -#ifdef __cplusplus -} -#endif - -#define LLAMA_LOG_INFO(...) llama_log(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) -#define LLAMA_LOG_WARN(...) llama_log(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) -#define LLAMA_LOG_ERROR(...) llama_log(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) - // Internal API to be implemented by llama.cpp and used by tests/benchmarks only #ifdef LLAMA_API_INTERNAL From 1f55026cd5029e867cae3c5c2b4af8b67a2c54f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Hallerb=C3=A4ck?= Date: Wed, 13 Sep 2023 16:01:46 +0200 Subject: [PATCH 04/11] metal : logging va_args memory fix --- ggml-metal.m | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 8530df7924a8b..c508e48534948 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -126,13 +126,22 @@ void ggml_metal_log_set_callback(void (*log_callback)(enum llama_log_level level ggml_metal_log_user_data = user_data; } -static void ggml_metal_log(enum llama_log_level level, const char* format, ...) 
{ +static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ if ( ggml_metal_log_callback != NULL ) { - va_list arg; - va_start(arg, format); - char const* text = va_arg(arg, char const*); - ggml_metal_log_callback(level, text, ggml_metal_log_user_data); - va_end(arg); + va_list args; + va_start(args, format); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); + } else { + char* buffer2 = malloc(len+1); + vsnprintf(buffer2, len+1, format, args); + buffer2[len] = 0; + ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); + free(buffer2); + } + va_end(args); } } From 86abc77b214d07ac30db462367e058851edc1561 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Hallerb=C3=A4ck?= Date: Wed, 13 Sep 2023 16:06:16 +0200 Subject: [PATCH 05/11] metal : minor cleanup --- ggml-metal.m | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index c508e48534948..c6e3e6fd25601 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -16,7 +16,9 @@ #define ggml_metal_log_warn(...) #define ggml_metal_log_error(...) #else -#import "llama.h" +#define ggml_metal_log_info(...) ggml_metal_log(LLAMA_LOG_LEVEL_INFO, __VA_ARGS__) +#define ggml_metal_log_warn(...) ggml_metal_log(LLAMA_LOG_LEVEL_WARN, __VA_ARGS__) +#define ggml_metal_log_error(...) ggml_metal_log(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) #endif #define UNUSED(x) (void)(x) @@ -145,15 +147,6 @@ static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ } } -#ifdef GGML_METAL_NDEBU -#define ggml_metal_log_info(...) -#define ggml_metal_log_warn(...) -#define ggml_metal_log_error(...) -#else -#define ggml_metal_log_info(...) ggml_metal_log(LLAMA_LOG_LEVEL_INFO, __VA_ARGS__) -#define ggml_metal_log_warn(...) ggml_metal_log(LLAMA_LOG_LEVEL_WARN, __VA_ARGS__) -#define ggml_metal_log_error(...) 
ggml_metal_log(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) -#endif struct ggml_metal_context * ggml_metal_init(int n_cb) { From 8d5004b760c4829ddda1c07be1dd9e35a9c4adbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Hallerb=C3=A4ck?= Date: Wed, 13 Sep 2023 16:23:03 +0200 Subject: [PATCH 06/11] metal : setting function like logging macro to capital letters --- ggml-metal.m | 90 ++++++++++++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index c6e3e6fd25601..7ad59db339907 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -12,13 +12,13 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) #ifdef GGML_METAL_NDEBUG -#define ggml_metal_log_info(...) -#define ggml_metal_log_warn(...) -#define ggml_metal_log_error(...) +#define GGML_METAL_LOG_INFO(...) +#define GGML_METAL_LOG_WARN(...) +#define GGML_METAL_LOG_ERROR(...) #else -#define ggml_metal_log_info(...) ggml_metal_log(LLAMA_LOG_LEVEL_INFO, __VA_ARGS__) -#define ggml_metal_log_warn(...) ggml_metal_log(LLAMA_LOG_LEVEL_WARN, __VA_ARGS__) -#define ggml_metal_log_error(...) ggml_metal_log(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) +#define GGML_METAL_LOG_INFO(...) ggml_metal_log(LLAMA_LOG_LEVEL_INFO, __VA_ARGS__) +#define GGML_METAL_LOG_WARN(...) ggml_metal_log(LLAMA_LOG_LEVEL_WARN, __VA_ARGS__) +#define GGML_METAL_LOG_ERROR(...) 
ggml_metal_log(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) #endif #define UNUSED(x) (void)(x) @@ -150,7 +150,7 @@ static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ struct ggml_metal_context * ggml_metal_init(int n_cb) { - ggml_metal_log_info("%s: allocating\n", __func__); + GGML_METAL_LOG_INFO("%s: allocating\n", __func__); id device; NSString * s; @@ -160,14 +160,14 @@ static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ NSArray * devices = MTLCopyAllDevices(); for (device in devices) { s = [device name]; - ggml_metal_log_info("%s: found device: %s\n", __func__, [s UTF8String]); + GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]); } #endif // Pick and show default Metal device device = MTLCreateSystemDefaultDevice(); s = [device name]; - ggml_metal_log_info("%s: picking default device: %s\n", __func__, [s UTF8String]); + GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]); // Configure context struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); @@ -194,7 +194,7 @@ static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; if (error) { - ggml_metal_log_error("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -208,11 +208,11 @@ static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - ggml_metal_log_info("%s: loading '%s'\n", __func__, [path UTF8String]); + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString 
stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { - ggml_metal_log_error("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } @@ -224,7 +224,7 @@ static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; #endif if (error) { - ggml_metal_log_error("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -236,11 +236,11 @@ static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - ggml_metal_log_info("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ if (error) { \ - ggml_metal_log_error("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -296,13 +296,13 @@ static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ #undef GGML_METAL_ADD_KERNEL } - ggml_metal_log_info("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); #if TARGET_OS_OSX - ggml_metal_log_info("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.maxTransferRate != 0) { - ggml_metal_log_info("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { - ggml_metal_log_info("%s: maxTransferRate = built-in GPU\n", __func__); + GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); } #endif @@ -310,7 +310,7 @@ static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ } void ggml_metal_free(struct ggml_metal_context * ctx) { - ggml_metal_log_info("%s: deallocating\n", __func__); + GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); #define GGML_METAL_DEL_KERNEL(name) \ [ctx->function_##name release]; \ [ctx->pipeline_##name release]; @@ -382,7 +382,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { void * data = NULL; const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { - ggml_metal_log_error("%s: error: posix_memalign failed\n", __func__); + GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); return NULL; } @@ -410,7 +410,7 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { // Metal buffer based on the host memory pointer // static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { - //ggml_metal_log_info("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", 
__func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -421,13 +421,13 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; - //ggml_metal_log_info("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); return ctx->buffers[i].metal; } } - ggml_metal_log_error("%s: error: buffer is nil\n", __func__); + GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__); return nil; } @@ -439,7 +439,7 @@ bool ggml_metal_add_buffer( size_t size, size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - ggml_metal_log_error("%s: error: too many buffers\n", __func__); + GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__); return false; } @@ -449,7 +449,7 @@ bool ggml_metal_add_buffer( const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - ggml_metal_log_error("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); return false; } } @@ -470,11 +470,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - ggml_metal_log_error("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - ggml_metal_log_info("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, 
name, size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -494,13 +494,13 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - ggml_metal_log_error("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - ggml_metal_log_info("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { - ggml_metal_log_info("\n"); + GGML_METAL_LOG_INFO("\n"); } ++ctx->n_buffers; @@ -508,17 +508,17 @@ bool ggml_metal_add_buffer( } #if TARGET_OS_OSX - ggml_metal_log_info(", (%8.2f / %8.2f)", + GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - ggml_metal_log_warn("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); } else { - ggml_metal_log_info("\n"); + GGML_METAL_LOG_INFO("\n"); } #else - ggml_metal_log_info(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO(", (%8.2f)\n", 
ctx->device.currentAllocatedSize / 1024.0 / 1024.0); #endif } @@ -631,7 +631,7 @@ void ggml_metal_graph_find_concurrency( } if (ctx->concur_list_len > GGML_MAX_CONCUR) { - ggml_metal_log_warn("%s: too many elements for metal ctx->concur_list!\n", __func__); + GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__); } } @@ -685,7 +685,7 @@ void ggml_metal_graph_compute( continue; } - //ggml_metal_log_info("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -729,17 +729,17 @@ void ggml_metal_graph_compute( id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; - //ggml_metal_log_info("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //if (src0) { - // ggml_metal_log_info("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, // ggml_is_contiguous(src0), src0->name); //} //if (src1) { - // ggml_metal_log_info("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, // ggml_is_contiguous(src1), src1->name); //} //if (dst) { - // ggml_metal_log_info("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, // dst->name); //} @@ -845,7 +845,7 @@ void 
ggml_metal_graph_compute( } break; default: { - ggml_metal_log_warn("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } break; @@ -1025,7 +1025,7 @@ void ggml_metal_graph_compute( } break; default: { - ggml_metal_log_error("Asserting on type %d\n",(int)src0t); + GGML_METAL_LOG_ERROR("Asserting on type %d\n",(int)src0t); GGML_ASSERT(false && "not implemented"); } }; @@ -1266,7 +1266,7 @@ void ggml_metal_graph_compute( } break; default: { - ggml_metal_log_error("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } @@ -1291,7 +1291,7 @@ void ggml_metal_graph_compute( MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { - ggml_metal_log_info("%s: command buffer %d failed with status %lu\n", __func__, i, status); + GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_ASSERT(false); } } From 696bf0595ad2b31ff3049b390e5901e0fd638101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Hallerb=C3=A4ck?= Date: Wed, 13 Sep 2023 16:40:23 +0200 Subject: [PATCH 07/11] llama.cpp : trailing whitespace fix --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 9779dcb3e59c1..139074993cc4f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5535,7 +5535,7 @@ struct llama_context * llama_new_context_with_model( llama_free(ctx); return NULL; } - ggml_metal_log_set_callback(llama_log_callback_default, NULL); + ggml_metal_log_set_callback(llama_log_callback_default, NULL); ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); ggml_allocr_set_parse_seq(ctx->alloc, 
ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); } From d266e15c81b04759664593ef9fc093fb8f56c116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Hallerb=C3=A4ck?= Date: Sun, 17 Sep 2023 16:38:46 +0200 Subject: [PATCH 08/11] ggml : log level enum used by llama --- examples/llama-bench/llama-bench.cpp | 2 +- ggml-metal.h | 6 +++--- ggml-metal.m | 12 ++++++------ ggml.h | 6 ++++++ llama.cpp | 19 ++++++++----------- llama.h | 8 +------- 6 files changed, 25 insertions(+), 28 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index dedaa34fd84ba..8eccb8967d53f 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -911,7 +911,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) } } -static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) { +static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { (void) level; (void) text; (void) user_data; diff --git a/ggml-metal.h b/ggml-metal.h index cd45318024da1..97d859f2cb108 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -19,11 +19,11 @@ #pragma once +#include "ggml.h" + #include #include -#include "llama.h" - // max memory buffers that can be mapped to the device #define GGML_METAL_MAX_BUFFERS 16 #define GGML_METAL_MAX_COMMAND_BUFFERS 32 @@ -35,7 +35,7 @@ struct ggml_cgraph; extern "C" { #endif -void ggml_metal_log_set_callback(void (*log_callback)(enum llama_log_level level, const char * text, void * user_data), void * user_data); +void ggml_metal_log_set_callback(void (*log_callback)(enum ggml_log_level level, const char * text, void * user_data), void * user_data); struct ggml_metal_context; diff --git a/ggml-metal.m b/ggml-metal.m index 7ad59db339907..5291d9f811b77 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -16,9 +16,9 @@ #define GGML_METAL_LOG_WARN(...) 
#define GGML_METAL_LOG_ERROR(...) #else -#define GGML_METAL_LOG_INFO(...) ggml_metal_log(LLAMA_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_METAL_LOG_WARN(...) ggml_metal_log(LLAMA_LOG_LEVEL_WARN, __VA_ARGS__) -#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) +#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #endif #define UNUSED(x) (void)(x) @@ -120,15 +120,15 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end -void (*ggml_metal_log_callback)(enum llama_log_level level, const char * text, void * user_data) = NULL; +void (*ggml_metal_log_callback)(enum ggml_log_level level, const char * text, void * user_data) = NULL; void *ggml_metal_log_user_data = NULL; -void ggml_metal_log_set_callback(void (*log_callback)(enum llama_log_level level, const char * text, void * user_data), void * user_data) { +void ggml_metal_log_set_callback(void (*log_callback)(enum ggml_log_level level, const char * text, void * user_data), void * user_data) { ggml_metal_log_callback = log_callback; ggml_metal_log_user_data = user_data; } -static void ggml_metal_log(enum llama_log_level level, const char* format, ...){ +static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ if ( ggml_metal_log_callback != NULL ) { va_list args; va_start(args, format); diff --git a/ggml.h b/ggml.h index c936823d66140..230217664de82 100644 --- a/ggml.h +++ b/ggml.h @@ -437,6 +437,12 @@ extern "C" { GGML_OBJECT_WORK_BUFFER }; + enum ggml_log_level { + GGML_LOG_LEVEL_ERROR = 2, + GGML_LOG_LEVEL_WARN = 3, + GGML_LOG_LEVEL_INFO = 4 + }; + // ggml object struct ggml_object { size_t offs; diff --git a/llama.cpp b/llama.cpp index 139074993cc4f..223bbac73bced 100644 --- a/llama.cpp +++ b/llama.cpp @@ -91,15 +91,12 @@ // 
LLAMA_ATTRIBUTE_FORMAT(2, 3) -static void llama_log_internal (enum llama_log_level level, const char* format, ...); -static void llama_log_callback_default(enum llama_log_level level, const char * text, void * user_data); - -#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) -#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) -#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) - - +static void llama_log_internal (ggml_log_level level, const char* format, ...); +static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); +#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) +#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) +#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) // // helpers @@ -6370,7 +6367,7 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) { g_state.log_callback_user_data = user_data; } -static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) { +static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) { va_list args_copy; va_copy(args_copy, args); char buffer[128]; @@ -6387,14 +6384,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_ va_end(args_copy); } -static void llama_log_internal(llama_log_level level, const char * format, ...) { +static void llama_log_internal(ggml_log_level level, const char * format, ...) 
{ va_list args; va_start(args, format); llama_log_internal_v(level, format, args); va_end(args); } -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { +static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) { (void) level; (void) user_data; fputs(text, stderr); diff --git a/llama.h b/llama.h index 37975bebed22e..ca5ee3a871163 100644 --- a/llama.h +++ b/llama.h @@ -62,12 +62,6 @@ extern "C" { typedef int llama_token; - enum llama_log_level { - LLAMA_LOG_LEVEL_ERROR = 2, - LLAMA_LOG_LEVEL_WARN = 3, - LLAMA_LOG_LEVEL_INFO = 4 - }; - enum llama_vocab_type { LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding @@ -156,7 +150,7 @@ extern "C" { // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it // if it exists. // It might not exist for progress report where '.' is output repeatedly. - typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data); + typedef void (*llama_log_callback)(enum ggml_log_level level, const char * text, void * user_data); // model quantization parameters typedef struct llama_model_quantize_params { From 78de0dff085c2c9e31ae281fd776f4f931589536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Hallerb=C3=A4ck?= Date: Sun, 17 Sep 2023 16:42:24 +0200 Subject: [PATCH 09/11] Makefile : cleanup ggml-metal recipe --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 00d68ff2f7be4..7ab1b7a0958b0 100644 --- a/Makefile +++ b/Makefile @@ -418,7 +418,7 @@ endif endif # LLAMA_METAL ifdef LLAMA_METAL -ggml-metal.o: ggml-metal.m ggml-metal.h llama.h +ggml-metal.o: ggml-metal.m ggml-metal.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_METAL From e0eba91bea3c117e05cc846e4140c0d2e284553d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Hallerb=C3=A4ck?= Date: Sun, 17 Sep 2023 18:19:37 +0200 Subject: 
[PATCH 10/11] ggml : ggml_log_callback typedef --- ggml-metal.h | 2 +- ggml-metal.m | 38 +++++++++++++++++++------------------- ggml.h | 6 ++++++ llama.cpp | 4 ++-- llama.h | 9 +-------- 5 files changed, 29 insertions(+), 30 deletions(-) diff --git a/ggml-metal.h b/ggml-metal.h index 97d859f2cb108..790cf0bf7b963 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -35,7 +35,7 @@ struct ggml_cgraph; extern "C" { #endif -void ggml_metal_log_set_callback(void (*log_callback)(enum ggml_log_level level, const char * text, void * user_data), void * user_data); +void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); struct ggml_metal_context; diff --git a/ggml-metal.m b/ggml-metal.m index df1bed00055d0..919e51797b2d6 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -121,31 +121,31 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end -void (*ggml_metal_log_callback)(enum ggml_log_level level, const char * text, void * user_data) = NULL; +ggml_log_callback ggml_metal_log_callback = NULL; void *ggml_metal_log_user_data = NULL; -void ggml_metal_log_set_callback(void (*log_callback)(enum ggml_log_level level, const char * text, void * user_data), void * user_data) { - ggml_metal_log_callback = log_callback; - ggml_metal_log_user_data = user_data; +void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { + ggml_metal_log_callback = log_callback; + ggml_metal_log_user_data = user_data; } static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ - if ( ggml_metal_log_callback != NULL ) { - va_list args; - va_start(args, format); - char buffer[128]; - int len = vsnprintf(buffer, 128, format, args); - if (len < 128) { - ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); - } else { - char* buffer2 = malloc(len+1); - vsnprintf(buffer2, len+1, format, args); - buffer2[len] = 0; - ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); - free(buffer2); + if ( 
ggml_metal_log_callback != NULL ) { + va_list args; + va_start(args, format); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); + } else { + char* buffer2 = malloc(len+1); + vsnprintf(buffer2, len+1, format, args); + buffer2[len] = 0; + ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); + free(buffer2); + } + va_end(args); } - va_end(args); - } } diff --git a/ggml.h b/ggml.h index 5753c70cfc14d..266c91a845855 100644 --- a/ggml.h +++ b/ggml.h @@ -1688,6 +1688,12 @@ extern "C" { }; typedef void (*ggml_opt_callback)(void * data, float * sched); + // Signature for logging events + // Note that text includes the new line character at the end for most events. + // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it + // if it exists. + // It might not exist for progress report where '.' is output repeatedly. + typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); // optimization parameters // diff --git a/llama.cpp b/llama.cpp index ac1aa140d8b37..9b12173be4d3f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -886,7 +886,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to struct llama_state { // We save the log callback globally - llama_log_callback log_callback = llama_log_callback_default; + ggml_log_callback log_callback = llama_log_callback_default; void * log_callback_user_data = nullptr; }; @@ -6834,7 +6834,7 @@ const std::vector>& llama_internal_ return ctx->model.tensors_by_name; } -void llama_log_set(llama_log_callback log_callback, void * user_data) { +void llama_log_set(ggml_log_callback log_callback, void * user_data) { g_state.log_callback = log_callback ? 
log_callback : llama_log_callback_default; g_state.log_callback_user_data = user_data; } diff --git a/llama.h b/llama.h index ca5ee3a871163..eba6aabfbb073 100644 --- a/llama.h +++ b/llama.h @@ -145,13 +145,6 @@ extern "C" { bool embedding; // embedding mode only }; - // Signature for logging events - // Note that text includes the new line character at the end for most events. - // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it - // if it exists. - // It might not exist for progress report where '.' is output repeatedly. - typedef void (*llama_log_callback)(enum ggml_log_level level, const char * text, void * user_data); - // model quantization parameters typedef struct llama_model_quantize_params { int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() @@ -518,7 +511,7 @@ extern "C" { // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. 
- LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); + LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); From b6434356ed3cfb0ccf3ef67eae2a03a7c8639c85 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 27 Sep 2023 18:47:54 +0300 Subject: [PATCH 11/11] ggml : minor --- ggml-metal.m | 12 ++++++------ ggml.h | 5 ----- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 919e51797b2d6..5dfd722bda9c1 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -122,15 +122,15 @@ @implementation GGMLMetalClass @end ggml_log_callback ggml_metal_log_callback = NULL; -void *ggml_metal_log_user_data = NULL; +void * ggml_metal_log_user_data = NULL; void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { - ggml_metal_log_callback = log_callback; + ggml_metal_log_callback = log_callback; ggml_metal_log_user_data = user_data; } static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ - if ( ggml_metal_log_callback != NULL ) { + if (ggml_metal_log_callback != NULL) { va_list args; va_start(args, format); char buffer[128]; @@ -208,7 +208,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; - NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; @@ -518,7 +518,7 @@ bool ggml_metal_add_buffer( ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if 
(ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__); } else { GGML_METAL_LOG_INFO("\n"); } @@ -1039,7 +1039,7 @@ void ggml_metal_graph_compute( } break; default: { - GGML_METAL_LOG_ERROR("Asserting on type %d\n",(int)src0t); + GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); GGML_ASSERT(false && "not implemented"); } }; diff --git a/ggml.h b/ggml.h index 266c91a845855..f8070a33caf01 100644 --- a/ggml.h +++ b/ggml.h @@ -1688,11 +1688,6 @@ extern "C" { }; typedef void (*ggml_opt_callback)(void * data, float * sched); - // Signature for logging events - // Note that text includes the new line character at the end for most events. - // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it - // if it exists. - // It might not exist for progress report where '.' is output repeatedly. typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); // optimization parameters