From 75031d5c23a85f45dd98939be9dd92c8cf1c3c76 Mon Sep 17 00:00:00 2001
From: grahameth <->
Date: Mon, 26 Jun 2023 18:22:39 +0200
Subject: [PATCH] Add verbose flag to control console output about model information on load.

---
 examples/common.cpp |  4 +++
 examples/common.h   |  1 +
 llama.cpp           | 76 +++++++++++++++++++++++++++------------------
 llama.h             |  1 +
 4 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 6ac4845559172..04eef7401c887 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -395,6 +395,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.input_suffix = argv[i];
+        } else if (arg == "--no-verbose") {
+            params.verbose = false;
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, default_params);
@@ -501,6 +503,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stderr, "  --no-verbose          do not print model info on startup\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, "\n");
@@ -551,6 +554,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     lparams.use_mlock = params.use_mlock;
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;
+    lparams.verbose = params.verbose;
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
     if (model == NULL) {
diff --git a/examples/common.h b/examples/common.h
index 713320179e2be..b38625f7697ed 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -78,6 +78,7 @@ struct gpt_params {
     bool mem_test       = false; // compute maximum memory usage
     bool export_cgraph  = false; // export the computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
+    bool verbose        = true;  // print model info on load
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/llama.cpp b/llama.cpp
index ac22a48f8ab97..be9375035358d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -472,9 +472,11 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map, bool verbose)
         : file(fname, "rb") {
-        fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+        if (verbose) {
+            fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+        }
         read_magic();
         read_hparams();
         read_vocab();
@@ -662,13 +664,13 @@ struct llama_model_loader {
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only, bool verbose) {
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map, verbose);
         file_loaders.emplace_back(first_file);
         uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
         for (uint32_t i = 1; i < n_parts; i++) {
             std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map, verbose);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
                 throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
@@ -949,6 +951,7 @@ struct llama_context_params llama_context_default_params() {
         /*.use_mmap  =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
+        /*.verbose   =*/ true,
     };
 
     return result;
@@ -1055,11 +1058,12 @@ static void llama_model_load_internal(
         bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
+        void * progress_callback_user_data,
+        bool verbose) {
 
     model.t_start_us = ggml_time_us();
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only, verbose));
 
     vocab = std::move(ml->file_loaders.at(0)->vocab);
     model.hparams = ml->file_loaders.at(0)->hparams;
@@ -1087,7 +1091,7 @@ static void llama_model_load_internal(
 
     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
-    {
+    if (verbose) {
         fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
@@ -1127,7 +1131,9 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    if (verbose) {
+        fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    }
 
     // create the ggml context
     {
@@ -1151,12 +1157,16 @@ static void llama_model_load_internal(
 
     (void) main_gpu;
 #if defined(GGML_USE_CUBLAS)
-    fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    if (verbose) {
+        fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    }
     ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
-    fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+    if (verbose) {
+        fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+    }
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
@@ -1256,20 +1266,20 @@ static void llama_model_load_internal(
         const size_t mem_required_state =
             scale*MEM_REQ_KV_SELF().at(model.type);
 
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+        if (verbose) fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
         (void) vram_scratch;
         (void) n_batch;
 #ifdef GGML_USE_CUBLAS
         if (low_vram) {
-            fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+            if (verbose) fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
             ggml_cuda_set_scratch_size(0); // disable scratch
         } else {
             vram_scratch = n_batch * MB;
             ggml_cuda_set_scratch_size(vram_scratch);
             if (n_gpu_layers > 0) {
-                fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
+                if (verbose) fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
                        __func__, vram_scratch / MB);
             }
         }
@@ -1277,32 +1287,34 @@ static void llama_model_load_internal(
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-        fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
-        if (n_gpu_layers > (int) hparams.n_layer) {
+        if (verbose) fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        if (verbose && n_gpu_layers > (int) hparams.n_layer) {
             fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
         }
         size_t vram_kv_cache = 0;
         if (n_gpu_layers > (int) hparams.n_layer + 1) {
             if (low_vram) {
-                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+                if (verbose) fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
             } else {
-                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                if (verbose) fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
                 vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
             if (low_vram) {
-                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+                if (verbose) fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
             } else {
-                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                if (verbose) fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
                 vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
             }
         }
-        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
-        fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
-        fprintf(stderr, "%s: total VRAM used: %zu MB\n",
-                __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
+        if (verbose) {
+            const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+            fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+                    __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+            fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+                    __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
+        }
 #else
         (void) n_gpu_layers;
 #endif
@@ -1348,10 +1360,11 @@ static bool llama_model_load(
         bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
-        void *progress_callback_user_data) {
+        void *progress_callback_user_data,
+        bool verbose) {
     try {
         llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
-                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
+                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data, verbose);
         return true;
     } catch (const std::exception & err) {
         fprintf(stderr, "error loading model: %s\n", err.what());
@@ -2444,7 +2457,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
+                                                                            /*vocab_only*/ false,
+                                                                            /*verbose*/ true));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
@@ -2656,7 +2670,7 @@ struct llama_model * llama_load_model_from_file(
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data, params.verbose)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2713,7 +2727,7 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
 
-        {
+        if (params.verbose) {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
             fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
@@ -2872,7 +2886,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false, /*verbose*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
diff --git a/llama.h b/llama.h
index a833a7f4d66cc..85512e441c6c0 100644
--- a/llama.h
+++ b/llama.h
@@ -100,6 +100,7 @@ extern "C" {
        bool use_mmap;  // use mmap if possible
        bool use_mlock; // force system to keep model in RAM
        bool embedding; // embedding mode only
+       bool verbose;   // show information on stderr on model load. This doesn't affect error messages.
    };
    // model file types
    enum llama_ftype {
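
Usage example (a minimal sketch, not part of the diff): with this change applied, a program that loads a model through the llama.h API can suppress the informational output by clearing the new field before loading. The model path below is a placeholder, backend initialization is omitted for brevity, and error messages are unaffected by the flag, as noted in the llama.h comment.

#include <cstdio>

#include "llama.h"

int main() {
    // Start from the defaults; 'verbose' defaults to true, which keeps the old behaviour.
    llama_context_params lparams = llama_context_default_params();

    // Suppress "llama.cpp: loading model from ...", the hparams dump and the VRAM/offload report.
    lparams.verbose = false;

    llama_model * model = llama_load_model_from_file("ggml-model.bin", lparams);
    if (model == NULL) {
        // Errors are still printed regardless of 'verbose'.
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // The "kv self size" report in llama_new_context_with_model is gated by the same flag.
    llama_context * ctx = llama_new_context_with_model(model, lparams);

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}

The example programs get the same behaviour from the command line via the new --no-verbose flag parsed in gpt_params_parse().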