
Commit e569a9f

KV cache quantized to q8_0
1 parent c9c3220 commit e569a9f

14 files changed: +801 −317 lines


common/common.cpp

+26-5
@@ -198,8 +198,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.rope_freq_scale = 1.0f/std::stof(argv[i]);
+        } else if (arg == "--kv-type" || arg == "-kvt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+
+            std::string type_name(argv[i]);
+            for (char & c : type_name) {
+                c = std::tolower(c);
+            }
+
+            if (type_name == "q8_0") {
+                params.kv_type = GGML_TYPE_Q8_0;
+            } else if (type_name == "f16") {
+                params.kv_type = GGML_TYPE_F16;
+            } else if (type_name == "f32") {
+                params.kv_type = GGML_TYPE_F32;
+            } else {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: Q8_0, F16, F32.\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
         } else if (arg == "--memory-f32") {
-            params.memory_f16 = false;
+            params.kv_type = GGML_TYPE_F32;
         } else if (arg == "--top-p") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -643,8 +665,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "  -kvt, --kv-type       the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n");
     fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
     fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
     fprintf(stdout, "  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
@@ -725,7 +746,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.low_vram   = params.low_vram;
     lparams.mul_mat_q  = params.mul_mat_q;
     lparams.seed       = params.seed;
-    lparams.f16_kv     = params.memory_f16;
+    lparams.kv_type    = params.kv_type;
     lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
     lparams.logits_all = params.perplexity;
@@ -1191,6 +1212,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+    fprintf(stream, "kv_type: %s # default: q8_0\n", ggml_type_name(params.kv_type));
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
 
     fprintf(stream, "logit_bias:\n");
@@ -1205,7 +1227,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);

common/common.h

+2-1
@@ -84,9 +84,10 @@ struct gpt_params {
     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
 
+    ggml_type kv_type = GGML_TYPE_Q8_0; // the type to use for the KV cache
+
     bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
     bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
-    bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
examples/llama-bench/llama-bench.cpp

+49-18
@@ -135,7 +135,7 @@ struct cmd_params {
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
     std::vector<int> n_batch;
-    std::vector<bool> f32_kv;
+    std::vector<ggml_type> kv_type;
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
     std::vector<int> main_gpu;
@@ -152,7 +152,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_prompt     */ {512},
     /* n_gen        */ {128},
     /* n_batch      */ {512},
-    /* f32_kv       */ {false},
+    /* kv_type      */ {GGML_TYPE_Q8_0},
     /* n_threads    */ {get_num_physical_cores()},
     /* n_gpu_layers */ {99},
     /* main_gpu     */ {0},
@@ -173,7 +173,16 @@ static void print_usage(int /* argc */, char ** argv) {
     fprintf(stdout, "  -p, --n-prompt <n>              (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     fprintf(stdout, "  -n, --n-gen <n>                 (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     fprintf(stdout, "  -b, --batch-size <n>            (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    fprintf(stdout, "  --memory-f32 <0|1>              (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+
+    std::string kv_type_default;
+    for (unsigned int i = 0; i < cmd_params_defaults.kv_type.size(); ++i) {
+        if (i > 0) {
+            kv_type_default += ",";
+        }
+        kv_type_default += ggml_type_name(cmd_params_defaults.kv_type[i]);
+    }
+    fprintf(stdout, "  -kvt, --kv-type <q8_0|f16|f32>  (default: %s)\n", kv_type_default.c_str());
+
     fprintf(stdout, "  -t, --threads <n>               (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     fprintf(stdout, "  -ngl N, --n-gpu-layers <n>      (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     fprintf(stdout, "  -mg i, --main-gpu <n>           (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@@ -236,13 +245,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "--memory-f32") {
+        } else if (arg == "-kvt" || arg == "--kv-type") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
-            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+            auto p = split<std::string>(argv[i], split_delim);
+
+            std::vector<ggml_type> kvt;
+            for (const std::string & type_name : p) {
+                if (type_name == "q8_0") {
+                    kvt.push_back(GGML_TYPE_Q8_0);
+                } else if (type_name == "f16") {
+                    kvt.push_back(GGML_TYPE_F16);
+                } else if (type_name == "f32") {
+                    kvt.push_back(GGML_TYPE_F32);
+                } else {
+                    invalid_param = true;
+                    break;
+                }
+            }
+            if (invalid_param) {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: q8_0, f16, f32.\n", argv[i]);
+                break;
+            }
+
+            params.kv_type.insert(params.kv_type.end(), kvt.begin(), kvt.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
@@ -340,7 +368,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
     if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.kv_type.empty())      { params.kv_type = cmd_params_defaults.kv_type; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@@ -356,7 +384,7 @@ struct cmd_params_instance {
     int n_prompt;
     int n_gen;
     int n_batch;
-    bool f32_kv;
+    ggml_type kv_type;
     int n_threads;
     int n_gpu_layers;
     int main_gpu;
@@ -368,7 +396,7 @@ struct cmd_params_instance {
         llama_context_params lparams = llama_context_default_params();
         lparams.n_ctx = n_prompt + n_gen;
         lparams.n_batch = n_batch;
-        lparams.f16_kv = !f32_kv;
+        lparams.kv_type = kv_type;
         lparams.n_gpu_layers = n_gpu_layers;
         lparams.main_gpu = main_gpu;
         lparams.mul_mat_q = mul_mat_q;
@@ -384,7 +412,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
 
     for (const auto & m : params.model)
     for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & kvt : params.kv_type)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & mg : params.main_gpu)
     for (const auto & mmq : params.mul_mat_q)
@@ -396,7 +424,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
             /* .n_prompt     = */ n_prompt,
             /* .n_gen        = */ n_gen,
             /* .n_batch      = */ nb,
-            /* .f32_kv       = */ fk,
+            /* .kv_type      = */ kvt,
             /* .n_threads    = */ nt,
             /* .n_gpu_layers = */ nl,
             /* .main_gpu     = */ mg,
@@ -447,7 +475,7 @@ struct test {
     uint64_t model_n_params;
     int n_batch;
     int n_threads;
-    bool f32_kv;
+    ggml_type kv_type;
     int n_gpu_layers;
     int main_gpu;
     bool mul_mat_q;
@@ -467,7 +495,7 @@ struct test {
         model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
         n_threads = inst.n_threads;
-        f32_kv = inst.f32_kv;
+        kv_type = inst.kv_type;
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
         mul_mat_q = inst.mul_mat_q;
@@ -531,7 +559,7 @@ struct test {
         "cuda", "opencl", "metal", "gpu_blas", "blas",
         "cpu_info", "gpu_info",
         "model_filename", "model_type", "model_size", "model_n_params",
-        "n_batch", "n_threads", "f16_kv",
+        "n_batch", "n_threads", "kv_type",
         "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
         "n_prompt", "n_gen", "test_time",
         "avg_ns", "stddev_ns",
@@ -551,7 +579,7 @@ struct test {
             return INT;
         }
         if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
+            field == "mul_mat_q" || field == "low_vram") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -581,7 +609,7 @@ struct test {
             std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_batch), std::to_string(n_threads), std::string(ggml_type_name(kv_type)),
             std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -765,8 +793,8 @@ struct markdown_printer : public printer {
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.push_back("n_batch");
         }
-        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
-            fields.push_back("f16_kv");
+        if (params.kv_type.size() > 1 || params.kv_type != cmd_params_defaults.kv_type) {
+            fields.push_back("kv_type");
         }
         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
             fields.push_back("main_gpu");
@@ -834,6 +862,9 @@ struct markdown_printer : public printer {
             } else if (field == "t/s") {
                 snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                 value = buf;
+            } else if (field == "kv_type") {
+                snprintf(buf, sizeof(buf), "%s", ggml_type_name(t.kv_type));
+                value = buf;
             } else if (vmap.find(field) != vmap.end()) {
                 value = vmap.at(field);
             } else {
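Because `kv_type` joins the other benchmark parameter vectors, a comma-separated list such as `-kvt q8_0,f16,f32` multiplies the test matrix. A standalone toy sketch of that cartesian expansion (illustrative values, not the tool's real code):

```cpp
// Toy illustration of llama-bench's nested-loop expansion: every value of
// every parameter vector is combined into one test instance.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<int>         n_batch = {256, 512};             // example values
    std::vector<std::string> kv_type = {"q8_0", "f16", "f32"}; // as parsed from -kvt
    for (int nb : n_batch) {
        for (const std::string & kvt : kv_type) {
            std::printf("instance: n_batch=%d kv_type=%s\n", nb, kvt.c_str());
        }
    }
    return 0; // 2 batch sizes x 3 KV types -> 6 benchmark instances
}
```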

examples/main/README.md

+2-2
@@ -276,9 +276,9 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 - `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
 
-### Memory Float 32
+### KV cache type
 
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
+- `-kvt, --kv-type`: The data type to use for the KV cache. Uses q8_0 by default. Alternatives are f16 and f32. The alternatives increase memory consumption for marginal quality differences.
 
 ### Batch Size
 
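To make the memory claim concrete: ggml stores q8_0 as blocks of 32 int8 quants plus one fp16 scale (34 bytes per 32 values, about 8.5 bits each), versus 16 bits per value for f16 and 32 for f32. A back-of-the-envelope sketch; the model dimensions are assumptions for a hypothetical 7B-class model, not values read from a GGUF file:

```cpp
// Rough KV cache sizes per type for an assumed 32-layer, 4096-wide model
// at 2048-token context. K and V each store n_embd values per token per layer.
#include <cstdio>

int main() {
    const long n_layer = 32, n_embd = 4096, n_ctx = 2048;  // hypothetical 7B-class
    const long n_elems = 2 * n_layer * n_ctx * n_embd;     // K and V together
    const double f32  = n_elems * 4.0;                     // 4 bytes per value
    const double f16  = n_elems * 2.0;                     // 2 bytes per value
    const double q8_0 = n_elems / 32.0 * 34.0;             // 34-byte blocks of 32
    std::printf("f32: %.0f MiB, f16: %.0f MiB, q8_0: %.0f MiB\n",
                f32 / (1024 * 1024), f16 / (1024 * 1024), q8_0 / (1024 * 1024));
    return 0; // roughly 2048, 1024, and 544 MiB respectively
}
```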

examples/quantize-stats/quantize-stats.cpp

+1-1
@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
 
     lparams.n_ctx     = 256;
     lparams.seed      = 1;
-    lparams.f16_kv    = false;
+    lparams.kv_type   = GGML_TYPE_F32;
     lparams.use_mlock = false;
 
     model = llama_load_model_from_file(params.model.c_str(), lparams);
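quantize-stats shows the pattern for choosing a KV cache type directly on `llama_context_params`; the same works anywhere a context is created. A minimal sketch assuming this branch's API (the model path is a placeholder):

```cpp
// Sketch: request an f16 KV cache explicitly instead of the q8_0 default.
// Assumes this branch's llama.h, where llama_context_params has a kv_type field.
#include "llama.h"

int main() {
    llama_context_params lparams = llama_context_default_params();
    lparams.kv_type = GGML_TYPE_F16; // override the q8_0 default

    llama_model * model = llama_load_model_from_file("model.gguf", lparams); // placeholder path
    if (model == nullptr) {
        return 1;
    }
    llama_context * ctx = llama_new_context_with_model(model, lparams);
    // ... run inference ...
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```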

examples/save-load-state/save-load-state.cpp

+1-1
@@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
 
     lparams.n_ctx     = params.n_ctx;
     lparams.seed      = params.seed;
-    lparams.f16_kv    = params.memory_f16;
+    lparams.kv_type   = params.kv_type;
     lparams.use_mmap  = params.use_mmap;
     lparams.use_mlock = params.use_mlock;
 

examples/server/README.md

+1-1
@@ -13,7 +13,7 @@ Command line options:
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
+- `-kvt, --kv-type`: The data type to use for the KV cache. Uses q8_0 by default. Alternatives are f16 and f32. The alternatives increase memory consumption for marginal quality differences.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
 - `--numa`: Attempt optimizations that help on some NUMA systems.

examples/server/server.cpp

+26-3
@@ -704,8 +704,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
     fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "  -kvt, --kv-type       the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n");
     if (llama_mlock_supported())
     {
         fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
@@ -838,9 +837,33 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.rope_freq_scale = std::stof(argv[i]);
         }
+        else if (arg == "--kv-type" || arg == "-kvt")
+        {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+
+            std::string type_name(argv[i]);
+            for (char & c : type_name) {
+                c = std::tolower(c);
+            }
+
+            if (type_name == "q8_0") {
+                params.kv_type = GGML_TYPE_Q8_0;
+            } else if (type_name == "f16") {
+                params.kv_type = GGML_TYPE_F16;
+            } else if (type_name == "f32") {
+                params.kv_type = GGML_TYPE_F32;
+            } else {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: q8_0, f16, f32.\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+        }
         else if (arg == "--memory-f32" || arg == "--memory_f32")
         {
-            params.memory_f16 = false;
+            params.kv_type = GGML_TYPE_F32;
         }
         else if (arg == "--threads" || arg == "-t")
         {
