
Commit e569a9f

KV cache quantized to q8_0
1 parent c9c3220 commit e569a9f

14 files changed: +801 −317 lines


common/common.cpp

+26-5
@@ -198,8 +198,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.rope_freq_scale = 1.0f/std::stof(argv[i]);
+        } else if (arg == "--kv-type" || arg == "-kvt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+
+            std::string type_name(argv[i]);
+            for (char & c : type_name) {
+                c = std::tolower(c);
+            }
+
+            if (type_name == "q8_0") {
+                params.kv_type = GGML_TYPE_Q8_0;
+            } else if (type_name == "f16") {
+                params.kv_type = GGML_TYPE_F16;
+            } else if (type_name == "f32") {
+                params.kv_type = GGML_TYPE_F32;
+            } else {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: Q8_0, F16, F32.\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
         } else if (arg == "--memory-f32") {
-            params.memory_f16 = false;
+            params.kv_type = GGML_TYPE_F32;
         } else if (arg == "--top-p") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -643,8 +665,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "  -kvt, --kv-type       the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n");
     fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
     fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
     fprintf(stdout, "  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
@@ -725,7 +746,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.low_vram   = params.low_vram;
     lparams.mul_mat_q  = params.mul_mat_q;
     lparams.seed       = params.seed;
-    lparams.f16_kv     = params.memory_f16;
+    lparams.kv_type    = params.kv_type;
     lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
     lparams.logits_all = params.perplexity;
@@ -1191,6 +1212,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+    fprintf(stream, "kv_type: %s # default: q8_0\n", ggml_type_name(params.kv_type));
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
 
     fprintf(stream, "logit_bias:\n");
@@ -1205,7 +1227,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);

common/common.h

+2-1
@@ -84,9 +84,10 @@ struct gpt_params {
     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
 
+    ggml_type kv_type = GGML_TYPE_Q8_0; // the type to use for the KV cache
+
     bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
     bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
-    bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
examples/llama-bench/llama-bench.cpp

+49-18
@@ -135,7 +135,7 @@ struct cmd_params {
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
     std::vector<int> n_batch;
-    std::vector<bool> f32_kv;
+    std::vector<ggml_type> kv_type;
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
     std::vector<int> main_gpu;
@@ -152,7 +152,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_prompt     */ {512},
     /* n_gen        */ {128},
     /* n_batch      */ {512},
-    /* f32_kv       */ {false},
+    /* kv_type      */ {GGML_TYPE_Q8_0},
     /* n_threads    */ {get_num_physical_cores()},
     /* n_gpu_layers */ {99},
     /* main_gpu     */ {0},
@@ -173,7 +173,16 @@ static void print_usage(int /* argc */, char ** argv) {
     fprintf(stdout, "  -p, --n-prompt <n>              (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     fprintf(stdout, "  -n, --n-gen <n>                 (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     fprintf(stdout, "  -b, --batch-size <n>            (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    fprintf(stdout, "  --memory-f32 <0|1>              (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+
+    std::string kv_type_default;
+    for (unsigned int i = 0; i < cmd_params_defaults.kv_type.size(); ++i) {
+        if (i > 0) {
+            kv_type_default += ",";
+        }
+        kv_type_default += ggml_type_name(cmd_params_defaults.kv_type[i]);
+    }
+    fprintf(stdout, "  -kvt, --kv-type <q8_0|f16|f32>  (default: %s)\n", kv_type_default.c_str());
+
     fprintf(stdout, "  -t, --threads <n>               (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     fprintf(stdout, "  -ngl N, --n-gpu-layers <n>      (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     fprintf(stdout, "  -mg i, --main-gpu <n>           (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@@ -236,13 +245,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "--memory-f32") {
+        } else if (arg == "-kvt" || arg == "--kv-type") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
-            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+            auto p = split<std::string>(argv[i], split_delim);
+
+            std::vector<ggml_type> kvt;
+            for (const std::string & type_name : p) {
+                if (type_name == "q8_0") {
+                    kvt.push_back(GGML_TYPE_Q8_0);
+                } else if (type_name == "f16") {
+                    kvt.push_back(GGML_TYPE_F16);
+                } else if (type_name == "f32") {
+                    kvt.push_back(GGML_TYPE_F32);
+                } else {
+                    invalid_param = true;
+                    break;
+                }
+            }
+            if (invalid_param) {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: q8_0, f16, f32.\n", argv[i]);
+                break;
+            }
+
+            params.kv_type.insert(params.kv_type.end(), kvt.begin(), kvt.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
@@ -340,7 +368,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
     if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.kv_type.empty())      { params.kv_type = cmd_params_defaults.kv_type; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@@ -356,7 +384,7 @@ struct cmd_params_instance {
     int n_prompt;
     int n_gen;
     int n_batch;
-    bool f32_kv;
+    ggml_type kv_type;
     int n_threads;
     int n_gpu_layers;
     int main_gpu;
@@ -368,7 +396,7 @@ struct cmd_params_instance {
         llama_context_params lparams = llama_context_default_params();
         lparams.n_ctx = n_prompt + n_gen;
         lparams.n_batch = n_batch;
-        lparams.f16_kv = !f32_kv;
+        lparams.kv_type = kv_type;
         lparams.n_gpu_layers = n_gpu_layers;
         lparams.main_gpu = main_gpu;
         lparams.mul_mat_q = mul_mat_q;
@@ -384,7 +412,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
 
     for (const auto & m : params.model)
     for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & kvt : params.kv_type)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & mg : params.main_gpu)
     for (const auto & mmq : params.mul_mat_q)
@@ -396,7 +424,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
             /* .n_prompt     = */ n_prompt,
             /* .n_gen        = */ n_gen,
             /* .n_batch      = */ nb,
-            /* .f32_kv       = */ fk,
+            /* .kv_type      = */ kvt,
             /* .n_threads    = */ nt,
             /* .n_gpu_layers = */ nl,
             /* .main_gpu     = */ mg,
@@ -447,7 +475,7 @@ struct test {
     uint64_t model_n_params;
     int n_batch;
     int n_threads;
-    bool f32_kv;
+    ggml_type kv_type;
     int n_gpu_layers;
     int main_gpu;
     bool mul_mat_q;
@@ -467,7 +495,7 @@ struct test {
         model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
         n_threads = inst.n_threads;
-        f32_kv = inst.f32_kv;
+        kv_type = inst.kv_type;
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
         mul_mat_q = inst.mul_mat_q;
@@ -531,7 +559,7 @@ struct test {
         "cuda", "opencl", "metal", "gpu_blas", "blas",
         "cpu_info", "gpu_info",
         "model_filename", "model_type", "model_size", "model_n_params",
-        "n_batch", "n_threads", "f16_kv",
+        "n_batch", "n_threads", "kv_type",
         "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
         "n_prompt", "n_gen", "test_time",
         "avg_ns", "stddev_ns",
@@ -551,7 +579,7 @@ struct test {
             return INT;
         }
         if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
+            field == "mul_mat_q" || field == "low_vram") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -581,7 +609,7 @@ struct test {
             std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_batch), std::to_string(n_threads), std::string(ggml_type_name(kv_type)),
             std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -765,8 +793,8 @@ struct markdown_printer : public printer {
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.push_back("n_batch");
         }
-        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
-            fields.push_back("f16_kv");
+        if (params.kv_type.size() > 1 || params.kv_type != cmd_params_defaults.kv_type) {
+            fields.push_back("kv_type");
         }
         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
             fields.push_back("main_gpu");
@@ -834,6 +862,9 @@ struct markdown_printer : public printer {
             } else if (field == "t/s") {
                 snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                 value = buf;
+            } else if (field == "kv_type") {
+                snprintf(buf, sizeof(buf), "%s", ggml_type_name(t.kv_type));
+                value = buf;
             } else if (vmap.find(field) != vmap.end()) {
                 value = vmap.at(field);
             } else {
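Because `kv_type` joins the other benchmark parameter vectors, a comma-separated list such as `-kvt q8_0,f16,f32` multiplies the test matrix. A standalone toy sketch of that cartesian expansion (illustrative values, not the tool's real code):

```cpp
// Toy illustration of llama-bench's nested-loop expansion: every value of
// every parameter vector is combined into one test instance.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<int>         n_batch = {256, 512};             // example values
    std::vector<std::string> kv_type = {"q8_0", "f16", "f32"}; // as parsed from -kvt
    for (int nb : n_batch) {
        for (const std::string & kvt : kv_type) {
            std::printf("instance: n_batch=%d kv_type=%s\n", nb, kvt.c_str());
        }
    }
    return 0; // 2 batch sizes x 3 KV types -> 6 benchmark instances
}
```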

examples/main/README.md

+2-2
@@ -276,9 +276,9 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 - `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
 
-### Memory Float 32
+### KV cache type
 
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
+- `-kvt, --kv-type`: The data type to use for the KV cache. Uses q8_0 by default. Alternatives are f16 and f32. The alternatives increase memory consumption for marginal quality differences.
 
 ### Batch Size
 
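To make the memory claim concrete: ggml stores q8_0 as blocks of 32 int8 quants plus one fp16 scale (34 bytes per 32 values, about 8.5 bits each), versus 16 bits per value for f16 and 32 for f32. A back-of-the-envelope sketch; the model dimensions are assumptions for a hypothetical 7B-class model, not values read from a GGUF file:

```cpp
// Rough KV cache sizes per type for an assumed 32-layer, 4096-wide model
// at 2048-token context. K and V each store n_embd values per token per layer.
#include <cstdio>

int main() {
    const long n_layer = 32, n_embd = 4096, n_ctx = 2048;  // hypothetical 7B-class
    const long n_elems = 2 * n_layer * n_ctx * n_embd;     // K and V together
    const double f32  = n_elems * 4.0;                     // 4 bytes per value
    const double f16  = n_elems * 2.0;                     // 2 bytes per value
    const double q8_0 = n_elems / 32.0 * 34.0;             // 34-byte blocks of 32
    std::printf("f32: %.0f MiB, f16: %.0f MiB, q8_0: %.0f MiB\n",
                f32 / (1024 * 1024), f16 / (1024 * 1024), q8_0 / (1024 * 1024));
    return 0; // roughly 2048, 1024, and 544 MiB respectively
}
```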

examples/quantize-stats/quantize-stats.cpp

+1-1
@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
 
     lparams.n_ctx     = 256;
     lparams.seed      = 1;
-    lparams.f16_kv    = false;
+    lparams.kv_type   = GGML_TYPE_F32;
     lparams.use_mlock = false;
 
     model = llama_load_model_from_file(params.model.c_str(), lparams);
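quantize-stats shows the pattern for choosing a KV cache type directly on `llama_context_params`; the same works anywhere a context is created. A minimal sketch assuming this branch's API (the model path is a placeholder):

```cpp
// Sketch: request an f16 KV cache explicitly instead of the q8_0 default.
// Assumes this branch's llama.h, where llama_context_params has a kv_type field.
#include "llama.h"

int main() {
    llama_context_params lparams = llama_context_default_params();
    lparams.kv_type = GGML_TYPE_F16; // override the q8_0 default

    llama_model * model = llama_load_model_from_file("model.gguf", lparams); // placeholder path
    if (model == nullptr) {
        return 1;
    }
    llama_context * ctx = llama_new_context_with_model(model, lparams);
    // ... run inference ...
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```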

examples/save-load-state/save-load-state.cpp

+1-1
@@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
 
     lparams.n_ctx     = params.n_ctx;
     lparams.seed      = params.seed;
-    lparams.f16_kv    = params.memory_f16;
+    lparams.kv_type   = params.kv_type;
     lparams.use_mmap  = params.use_mmap;
     lparams.use_mlock = params.use_mlock;
 

examples/server/README.md

+1-1
@@ -13,7 +13,7 @@ Command line options:
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
+- `-kvt, --kv-type`: The data type to use for the KV cache. Uses q8_0 by default. Alternatives are f16 and f32. The alternatives increase memory consumption for marginal quality differences.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
 - `--numa`: Attempt optimizations that help on some NUMA systems.

examples/server/server.cpp

+26-3
@@ -704,8 +704,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
     fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "  -kvt, --kv-type       the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n");
     if (llama_mlock_supported())
     {
         fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
@@ -838,9 +837,33 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.rope_freq_scale = std::stof(argv[i]);
         }
+        else if (arg == "--kv-type" || arg == "-kvt")
+        {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+
+            std::string type_name(argv[i]);
+            for (char & c : type_name) {
+                c = std::tolower(c);
+            }
+
+            if (type_name == "q8_0") {
+                params.kv_type = GGML_TYPE_Q8_0;
+            } else if (type_name == "f16") {
+                params.kv_type = GGML_TYPE_F16;
+            } else if (type_name == "f32") {
+                params.kv_type = GGML_TYPE_F32;
+            } else {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: q8_0, f16, f32.\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+        }
         else if (arg == "--memory-f32" || arg == "--memory_f32")
         {
-            params.memory_f16 = false;
+            params.kv_type = GGML_TYPE_F32;
         }
         else if (arg == "--threads" || arg == "-t")
         {
