From 75031d5c23a85f45dd98939be9dd92c8cf1c3c76 Mon Sep 17 00:00:00 2001
From: grahameth <->
Date: Mon, 26 Jun 2023 18:22:39 +0200
Subject: [PATCH] Add verbose flag to control console output about model information on load.

---
 examples/common.cpp |  4 +++
 examples/common.h   |  1 +
 llama.cpp           | 76 +++++++++++++++++++++++++++------------------
 llama.h             |  1 +
 4 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 6ac4845559172..04eef7401c887 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -395,6 +395,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.input_suffix = argv[i];
+        } else if (arg == "--no-verbose") {
+            params.verbose = false;
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, default_params);
@@ -501,6 +503,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stderr, "  --no-verbose          do not print model info on startup\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, "\n");
@@ -551,6 +554,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     lparams.use_mlock = params.use_mlock;
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;
+    lparams.verbose = params.verbose;
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
     if (model == NULL) {
diff --git a/examples/common.h b/examples/common.h
index 713320179e2be..b38625f7697ed 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -78,6 +78,7 @@ struct gpt_params {
     bool mem_test       = false; // compute maximum memory usage
     bool export_cgraph  = false; // export the computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
+    bool verbose        = true;  // print model info on load
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/llama.cpp b/llama.cpp
index ac22a48f8ab97..be9375035358d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -472,9 +472,11 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map, bool verbose)
         : file(fname, "rb") {
-        fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+        if (verbose) {
+            fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+        }
         read_magic();
         read_hparams();
         read_vocab();
@@ -662,13 +664,13 @@ struct llama_model_loader {
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only, bool verbose) {
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map, verbose);
         file_loaders.emplace_back(first_file);
         uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
         for (uint32_t i = 1; i < n_parts; i++) {
             std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map, verbose);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
                 throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
@@ -949,6 +951,7 @@ struct llama_context_params llama_context_default_params() {
         /*.use_mmap  =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
+        /*.verbose   =*/ true,
     };
 
     return result;
@@ -1055,11 +1058,12 @@ static void llama_model_load_internal(
         bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
+        void * progress_callback_user_data,
+        bool verbose) {
 
     model.t_start_us = ggml_time_us();
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only, verbose));
 
     vocab = std::move(ml->file_loaders.at(0)->vocab);
     model.hparams = ml->file_loaders.at(0)->hparams;
@@ -1087,7 +1091,7 @@ static void llama_model_load_internal(
 
     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
-    {
+    if (verbose) {
         fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
@@ -1127,7 +1131,9 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    if (verbose) {
+        fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    }
 
     // create the ggml context
     {
@@ -1151,12 +1157,16 @@ static void llama_model_load_internal(
 
     (void) main_gpu;
 #if defined(GGML_USE_CUBLAS)
-    fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    if (verbose) {
+        fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    }
     ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
-    fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+    if (verbose) {
+        fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+    }
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
@@ -1256,20 +1266,20 @@ static void llama_model_load_internal(
         const size_t mem_required_state =
             scale*MEM_REQ_KV_SELF().at(model.type);
 
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+        if (verbose) fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
         (void) vram_scratch;
         (void) n_batch;
 #ifdef GGML_USE_CUBLAS
         if (low_vram) {
-            fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+            if (verbose) fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
             ggml_cuda_set_scratch_size(0); // disable scratch
         } else {
             vram_scratch = n_batch * MB;
             ggml_cuda_set_scratch_size(vram_scratch);
             if (n_gpu_layers > 0) {
-                fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
+                if (verbose) fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
                        __func__, vram_scratch / MB);
             }
         }
@@ -1277,32 +1287,34 @@ static void llama_model_load_internal(
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-        fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
-        if (n_gpu_layers > (int) hparams.n_layer) {
+        if (verbose) fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        if (verbose && n_gpu_layers > (int) hparams.n_layer) {
             fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
         }
         size_t vram_kv_cache = 0;
         if (n_gpu_layers > (int) hparams.n_layer + 1) {
             if (low_vram) {
-                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+                if (verbose) fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
             } else {
-                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                if (verbose) fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
                 vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
             if (low_vram) {
-                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+                if (verbose) fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
             } else {
-                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                if (verbose) fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
                 vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
             }
         }
-        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
-        fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
-        fprintf(stderr, "%s: total VRAM used: %zu MB\n",
-                __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
+        if (verbose) {
+            const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+            fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+                    __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+            fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+                    __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
+        }
 #else
         (void) n_gpu_layers;
 #endif
@@ -1348,10 +1360,11 @@ static bool llama_model_load(
         bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
-        void *progress_callback_user_data) {
+        void *progress_callback_user_data,
+        bool verbose) {
     try {
         llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
-                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
+                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data, verbose);
         return true;
     } catch (const std::exception & err) {
         fprintf(stderr, "error loading model: %s\n", err.what());
@@ -2444,7 +2457,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
+                                                                            /*vocab_only*/ false,
+                                                                            /*verbose*/ true));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
@@ -2656,7 +2670,7 @@ struct llama_model * llama_load_model_from_file(
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data, params.verbose)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2713,7 +2727,7 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
 
-        {
+        if (params.verbose) {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
             fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
@@ -2872,7 +2886,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false, /*verbose*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
diff --git a/llama.h b/llama.h
index a833a7f4d66cc..85512e441c6c0 100644
--- a/llama.h
+++ b/llama.h
@@ -100,6 +100,7 @@ extern "C" {
        bool use_mmap;  // use mmap if possible
        bool use_mlock; // force system to keep model in RAM
        bool embedding; // embedding mode only
+       bool verbose;   // show information on stderr on model load. This doesn't affect error messages.
    };
    // model file types
    enum llama_ftype {
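
Usage example (a minimal sketch, not part of the diff): with this change applied, a program that loads a model through the llama.h API can suppress the informational output by clearing the new field before loading. The model path below is a placeholder, backend initialization is omitted for brevity, and error messages are unaffected by the flag, as noted in the llama.h comment.

#include <cstdio>

#include "llama.h"

int main() {
    // Start from the defaults; 'verbose' defaults to true, which keeps the old behaviour.
    llama_context_params lparams = llama_context_default_params();

    // Suppress "llama.cpp: loading model from ...", the hparams dump and the VRAM/offload report.
    lparams.verbose = false;

    llama_model * model = llama_load_model_from_file("ggml-model.bin", lparams);
    if (model == NULL) {
        // Errors are still printed regardless of 'verbose'.
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // The "kv self size" report in llama_new_context_with_model is gated by the same flag.
    llama_context * ctx = llama_new_context_with_model(model, lparams);

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}

The example programs get the same behaviour from the command line via the new --no-verbose flag parsed in gpt_params_parse().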