
quantize: Handle user-defined quantization levels for additional tensors #12511


Merged
merged 35 commits on Apr 13, 2025
Commits (35)
09f716d
Add llama_model_quantize_params parameters
EAddario Mar 13, 2025
ac908af
Add new quantize parameters parsing and validation
EAddario Mar 13, 2025
337d979
Update usage
EAddario Mar 13, 2025
6f8d16d
Add new parameters defaults
EAddario Mar 13, 2025
71c9f93
Add new quantization parameters logic
EAddario Mar 13, 2025
8e18131
Add llama_model_quantize_params parameters
EAddario Mar 13, 2025
a77d947
Add new quantize parameters parsing and validation
EAddario Mar 13, 2025
2414eaa
Update usage
EAddario Mar 13, 2025
0dd66b8
Add new parameters defaults
EAddario Mar 13, 2025
1d841c6
Add new quantization parameters logic
EAddario Mar 13, 2025
120f71b
Merge main changes into branch
EAddario Mar 14, 2025
dbcc0b5
Merge branch 'master' into quantize
EAddario Mar 14, 2025
d86de03
Minor refactoring as per the contributors' coding guidelines
EAddario Mar 14, 2025
99bae5e
Update descriptions to match existing style
EAddario Mar 14, 2025
60b0a53
Merge branch 'master' into quantize
EAddario Mar 14, 2025
3e2063d
Merge branch 'master' into quantize
EAddario Mar 16, 2025
b99fa62
Merge branch 'master' into quantize
EAddario Mar 19, 2025
f97b693
Add llama_model_quantize_params parameters
EAddario Mar 19, 2025
f11e3da
Add new quantize parameters parsing and validation
EAddario Mar 19, 2025
ad1e352
Update usage
EAddario Mar 19, 2025
4e5c96a
Add new parameters defaults
EAddario Mar 19, 2025
9b3ccb5
Add new quantization parameters logic
EAddario Mar 19, 2025
35f45f1
Minor refactoring as per the contributors' guidelines
EAddario Mar 19, 2025
071e9ef
Merge branch 'master' into quantize
EAddario Mar 22, 2025
54e13cf
Implement general --tensor-type instead of tensor-specific command op…
EAddario Mar 29, 2025
31d642c
Merge branch 'master' into quantize
EAddario Mar 29, 2025
b3c7db5
Fix implied type bug
EAddario Mar 30, 2025
625f0ae
Restore missing #includes
EAddario Mar 31, 2025
2fd0b41
Add regex capability for tensor selection
EAddario Apr 1, 2025
3e9f565
Merge branch 'master' into quantize
EAddario Apr 2, 2025
054ede4
Refactor function name and update ALLOWED_TENSOR_TYPE
EAddario Apr 3, 2025
5a304b8
Add missing #include
EAddario Apr 3, 2025
1acb9f4
Handle edge case when tensor name is cls.output
EAddario Apr 3, 2025
04604a4
Minor logging improvement
EAddario Apr 7, 2025
30443a5
Merge branch 'master' into quantize
EAddario Apr 7, 2025
117 changes: 115 additions & 2 deletions examples/quantize/quantize.cpp
@@ -9,14 +9,15 @@
#include <fstream>
#include <cmath>
#include <cctype>
#include <algorithm>

struct quant_option {
std::string name;
llama_ftype ftype;
std::string desc;
};

static const std::vector<struct quant_option> QUANT_OPTIONS = {
static const std::vector<quant_option> QUANT_OPTIONS = {
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
@@ -105,7 +106,8 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable);
printf(" [--token-embedding-type] [--tensor-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@@ -114,6 +116,8 @@ static void usage(const char * executable) {
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -244,6 +248,107 @@ static ggml_type parse_ggml_type(const char * arg) {
return GGML_TYPE_COUNT;
}

// Allowed tensors for arbitrary quantization with --tensor-type option
static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
Collaborator:

I'm rethinking this: maybe we can simplify the functionality by adding just two flags:

  • --dump-mapping to get the list of tensors and the target quantized type; the user can then modify the target quant directly
  • --mapping FILE to specify the custom mapping file produced in the step above
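
A hypothetical sketch of such a mapping file (neither flag exists in this PR, so the format is assumed): one tensor name and target type per line, which the user would edit and pass back via --mapping:

output.weight=q8_0
blk.0.attn_v.weight=q6_k
blk.0.ffn_down.weight=q5_k
...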

Contributor:

I think it makes sense to only allow certain tensors to be quantized, otherwise users will lobotomize their model and then complain that llama.cpp is broken

Contributor Author:

Agree with @ddh0, although I can see how, down the line, something similar to what @ngxson is suggesting may be useful: I'm testing the layer-wise quant using the modified llama-imatrix for guidance, and whilst I'm getting some really encouraging results (I'll publish the full model in my HF repo over the weekend), the process is overly manual and the regexes can get unwieldy (e.g. --tensor-type "(1[3-9]|2[0-9]|30)\.attn_v=q6_k" --tensor-type "([0-9]|[1-2][0-9]|30|31)\.ffn_down=q3_k" --tensor-type "(10|1[3-9]|2[0-9]|30)\.attn_q=q5_k" ...).

I think it would be nice to have a way to AutoMagically generate optimum regexes to be fed into llama-quantize!
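
As a sketch of what that looks like today, an invocation combining several overrides might be (model and imatrix file names are hypothetical):

./llama-quantize --imatrix imatrix.dat \
    --tensor-type "(1[3-9]|2[0-9]|30)\.attn_v=q6_k" \
    --tensor-type "ffn_down=q5_k" \
    model-f32.gguf model-custom.gguf Q4_K_M

Each --tensor-type value is a TENSOR=TYPE pair; the tensor part is matched as a regex against tensor names, so a bare ffn_down overrides that tensor in every layer, while a block prefix such as (1[3-9]|2[0-9]|30) limits the override to specific layers.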

@joseph777111 (Apr 5, 2025):

> I think it makes sense to only allow certain tensors to be quantized, otherwise users will lobotomize their model and then complain that llama.cpp is broken

This is a case of full granular control vs. guided hand-holding to competence, and we can have the best of both worlds. What we need is a brief, informative how-to guide that introduces and explains the concepts of per-tensor and per-layer quantization, then gives concrete examples users can base their quantization decisions on. The guide should also cover which tensors/weights are good targets for quantization (embeddings, ATTN_K, ATTN_Q, ATTN_V, ATTN_Output, FFN_Down, FFN_Gate, FFN_Up, and Output), which are more likely not to be (FFN_NORM, etc.), and why. And then, for the more dense of mankind, a brief disclaimer stating that any and all modifications they make to their custom quantized model are their own business and responsibility, thereby waiving ggml-org or any of you from liability. 🤔

Just because some choose not to read and learn doesn't mean we should have to suffer a loss of "power-user" features because those who aren't paying attention will lobotomize their quantized models. This is all a fun game of trial, error, and experimentation. If users have made it this far, they will have to learn.

"attn_k",
"attn_kv_a_mqa",
"attn_kv_b",
"attn_o",
"attn_output",
"attn_q",
"attn_q_a",
"attn_q_b",
"attn_qkv",
"attn_v",
"channel_mix_key",
"channel_mix_receptance",
"channel_mix_value",
"cls",
"cls.output",
"cross_attn_k",
"cross_attn_o",
"cross_attn_q",
"cross_attn_v",
"ffn_act",
"ffn_down",
"ffn_down_exps",
"ffn_down_shexp",
"ffn_gate",
"ffn_gate_exps",
"ffn_gate_shexp",
"ffn_up",
"ffn_up_exps",
"ffn_up_shexp",
"ssm_in",
"ssm_out",
"time_mix_gate",
"time_mix_key",
"time_mix_output",
"time_mix_receptance",
"time_mix_value",
};

// changes to this struct must be replicated in llama-quant.cpp
struct tensor_quantization {
std::string name;
ggml_type quant = GGML_TYPE_COUNT;
};

static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
const char * sep = strchr(data, '=');
if (sep == nullptr) {
printf("\n%s: malformed tensor type '%s'\n\n", __func__, data);
return false;
}

const size_t tn_len = sep - data;
if (tn_len == 0) {
printf("\n%s: missing tensor name\n\n", __func__);
return false;
}

if (const size_t qt_len = strlen(sep); qt_len == 1) {
printf("\n%s: missing quantization type\n\n", __func__);
return false;
}

std::string tn(data, tn_len);
std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
sep++;
const std::string qt(sep);

bool found = false;
for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
std::string tensor;
tensor = tn.rfind('.') != std::string::npos ? tn.substr(tn.rfind('.') + 1) : tn;
// handle special case of cls.output
std::string cls_output = "cls.output";
if (tn.find(cls_output) != std::string::npos) {
tensor = "cls.output";
}
// check if an allowed tensor exists and it's at the end of the kv string
if (tensor == allowed) {
found = true;
break;
}
}
if (!found) {
printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
return false;
}

if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
return false;
}

tensor_quantization tqz;
tqz.name = tn;
tqz.quant = parse_ggml_type(qt.c_str());
tensor_type.emplace_back(std::move(tqz));
return true;
}

int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@@ -255,6 +360,7 @@ int main(int argc, char ** argv) {
std::string imatrix_file;
std::vector<std::string> included_weights, excluded_weights;
std::vector<llama_model_kv_override> kv_overrides;
std::vector<tensor_quantization> tensor_types;

for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -277,6 +383,10 @@ int main(int argc, char ** argv) {
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--tensor-type") == 0) {
if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]);
@@ -361,6 +471,9 @@ int main(int argc, char ** argv) {
kv_overrides.back().key[0] = 0;
params.kv_overrides = &kv_overrides;
}
if (!tensor_types.empty()) {
params.tensor_types = &tensor_types;
}

llama_backend_init();

23 changes: 12 additions & 11 deletions include/llama.h
@@ -366,17 +366,18 @@ extern "C" {

// model quantization parameters
typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
} llama_model_quantize_params;
Comment on lines +379 to 381
Member:

This changes the public interface, so add a comment in #9289.

Note that passing C++ objects here is not correct and we eventually have to fix this API to not do that. It hasn't become a problem yet because the quantization functions are likely not used frequently by 3rd party applications.

@EAddario If you are interested, you can give it a shot in another PR and fix these structs to become C compatible.

Contributor Author:

Thanks @ggerganov, happy to
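
A possible direction for that follow-up, sketched only as an illustration (the type and field names below are hypothetical and not part of this PR or the current API), is to expose the overrides as a plain C array instead of a pointer to a C++ vector:

// Hypothetical C-compatible override entry (illustration only)
typedef struct llama_tensor_type_override {
    const char *   pattern; // regex matched against tensor names
    enum ggml_type type;    // target quantization type for matching tensors
} llama_tensor_type_override;

// llama_model_quantize_params could then carry:
//     const llama_tensor_type_override * tensor_types;   // array of overrides
//     size_t                             n_tensor_types;  // number of entries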


typedef struct llama_logit_bias {
35 changes: 28 additions & 7 deletions src/llama-quant.cpp
@@ -10,6 +10,7 @@
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <regex>
#include <thread>
#include <unordered_map>

@@ -47,8 +48,14 @@ struct quantize_state_impl {
{}
};

// changes to this struct must be replicated in quantize.cpp
struct tensor_quantization {
std::string name;
ggml_type quant = GGML_TYPE_COUNT;
};

static void llama_tensor_dequantize_impl(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
) {
if (output.size() < nelements) {
@@ -536,7 +543,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
model.load_hparams(ml);
model.load_stats (ml);

struct quantize_state_impl qs(model, params);
quantize_state_impl qs(model, params);

if (params->only_copy) {
ftype = ml.ftype;
@@ -661,7 +668,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// populate the original tensors so we get an initial meta data
for (const auto * it : tensors) {
uint16_t i_split = params->keep_split ? it->idx : 0;
struct ggml_tensor * tensor = it->tensor;
ggml_tensor * tensor = it->tensor;
if (!ctx_outs[i_split]) {
ctx_outs[i_split].reset(gguf_init_empty());
}
@@ -710,7 +717,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
new_ofstream(0);
for (const auto * it : tensors) {
const auto & weight = *it;
struct ggml_tensor * tensor = weight.tensor;
ggml_tensor * tensor = weight.tensor;
if (weight.idx != cur_split && params->keep_split) {
close_ofstream();
new_ofstream(weight.idx);
@@ -776,7 +783,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;

enum ggml_type new_type;
ggml_type new_type;
void * new_data;
size_t new_size;

@@ -786,6 +793,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) {
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
// unless the user specifies a type
if (params->tensor_types) {
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
for (const auto & [tname, qtype] : tensor_types) {
if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
if (qtype != new_type) {
LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
}
new_type = qtype;
break;
}
}
}
}
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
new_type = params->token_embedding_type;
@@ -910,8 +930,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// interface implementation
//

struct llama_model_quantize_params llama_model_quantize_default_params() {
struct llama_model_quantize_params result = {
llama_model_quantize_params llama_model_quantize_default_params() {
llama_model_quantize_params result = {
/*.nthread =*/ 0,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
@@ -923,6 +943,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.keep_split =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.tensor_type =*/ nullptr,
};

return result;