Commit f477288

Author: ochafik (committed)
Merge remote-tracking branch 'origin/master' into tool-diffs

2 parents 4358d5d + add2a3a, commit f477288

81 files changed: +15561 / -12418 lines changed


.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -774,7 +774,7 @@ jobs:
    env:
      OPENBLAS_VERSION: 0.3.23
      SDE_VERSION: 9.33.0-2024-01-07
-     VULKAN_VERSION: 1.3.261.1
+     VULKAN_VERSION: 1.4.304.1

    strategy:
      matrix:

Makefile

Lines changed: 1 addition & 1 deletion
@@ -836,7 +836,7 @@ ifdef GGML_MUSA
else
    MUSA_PATH ?= /opt/musa
endif
-MUSA_ARCHITECTURES ?= 21;22
+MUSA_ARCHITECTURES ?= 21;22;31

MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib

README.md

Lines changed: 1 addition & 0 deletions
@@ -172,6 +172,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
+ - [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)

common/arg.cpp

Lines changed: 5 additions & 1 deletion
@@ -764,7 +764,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_ARG_CTX_SIZE"));
    add_opt(common_arg(
        {"-n", "--predict", "--n-predict"}, "N",
-       string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+       string_format(
+           ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+               ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
+               : "number of tokens to predict (default: %d, -1 = infinity)",
+           params.n_predict),
        [](common_params & params, int value) {
            params.n_predict = value;
        }

common/common.cpp

Lines changed: 3 additions & 3 deletions
@@ -975,8 +975,8 @@ struct common_init_result common_init_from_params(common_params & params) {
        return iparams;
    }

-   if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-       LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+   if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+       LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
    }

@@ -1080,7 +1080,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
-       llama_kv_cache_clear(lctx);
+       llama_kv_self_clear(lctx);
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
    }
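
A recurring change across this merge is the rename of the context-level KV-cache helpers from `llama_kv_cache_*` to `llama_kv_self_*`. As a rough orientation for callers, here is a minimal sketch using the two calls touched above; the wrapper function and the `ctx_shift` flag are assumed caller state, not part of this commit:

```cpp
#include "llama.h"

// Minimal sketch (assumed caller code, not from this commit): adopting the
// renamed KV-cache helpers seen in common/common.cpp above.
static void prepare_context(llama_context * ctx, bool & ctx_shift) {
    // formerly llama_kv_cache_can_shift(): disable shifting if this context cannot shift its cache
    if (ctx_shift && !llama_kv_self_can_shift(ctx)) {
        ctx_shift = false;
    }

    // formerly llama_kv_cache_clear(): drop all cached KV entries before a fresh evaluation
    llama_kv_self_clear(ctx);
}
```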

common/speculative.cpp

Lines changed: 4 additions & 4 deletions
@@ -173,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
    result.reserve(params.n_draft);

    if (reuse_n == 0) {
-       llama_kv_cache_clear(ctx);
+       llama_kv_self_clear(ctx);

        prompt.clear();
    } else {
@@ -192,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
    }

    if (reuse_i > 0) {
-       llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
-       llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+       llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
+       llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);

        prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
    }

    if (reuse_n < (int) prompt.size()) {
-       llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+       llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);

        prompt.erase(prompt.begin() + reuse_n, prompt.end());
    }
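
The sequence-level operations are renamed the same way. As a sketch of the prefix-reuse pattern shown above (the wrapper and the `prompt_size` parameter are illustrative; `reuse_i` and `reuse_n` mirror the variables in common/speculative.cpp):

```cpp
#include "llama.h"

// Sketch of the renamed sequence-level calls, mirroring the reuse logic in
// common/speculative.cpp above; the wrapper itself is illustrative only.
static void reuse_cached_prefix(llama_context * ctx, int reuse_i, int reuse_n, int prompt_size) {
    if (reuse_i > 0) {
        // drop the cells before the reused window, then shift the window back to position 0
        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
    }

    if (reuse_n < prompt_size) {
        // discard everything cached past the reused prefix
        llama_kv_self_seq_rm(ctx, 0, reuse_n, -1);
    }
}
```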

convert_hf_to_gguf.py

Lines changed: 80 additions & 0 deletions
@@ -861,6 +861,9 @@ def _create_vocab_sentencepiece(self):
        for token_id, token_data in added_tokens_decoder.items():
            token_id = int(token_id)
            token: str = token_data["content"]
+           if token_id >= vocab_size:
+               logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+               continue
            if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                if tokens[token_id] != token.encode("utf-8"):
                    logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
@@ -3322,6 +3325,83 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
        return [(self.map_tensor_name(name), data_torch)]


+@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
+class Gemma3Model(Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA3
+    has_vision: bool = False
+
+    # we need to merge the text_config into the root level of hparams
+    def __init__(self, *args, **kwargs):
+        hparams = Model.load_hparams(kwargs["dir_model"])
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+        kwargs["hparams"] = hparams
+        super().__init__(*args, **kwargs)
+        if "vision_config" in hparams:
+            logger.info("Has vision encoder, but it will be ignored")
+            self.has_vision = True
+
+    def write(self):
+        super().write()
+        if self.has_vision:
+            logger.info("NOTE: this script only convert the language model to GGUF")
+            logger.info("      for the vision model, please use gemma3_convert_encoder_to_gguf.py")
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        # some default values are not specified in the hparams
+        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
+        self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
+        self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0))  # for global layers
+        # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
+        assert hparams.get("attn_logit_softcapping") is None
+        assert hparams.get("final_logit_softcapping") is None
+        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+        self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
+        if hparams.get("rope_scaling") is not None:
+            assert hparams["rope_scaling"]["rope_type"] == "linear"
+            # important: this rope_scaling is only applied for global layers, and not used by 1B model
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+        elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
+                or name.startswith("multimodal_projector.") or name.startswith("vision_model."):  # this is for old HF model, should be removed later
+            # ignore vision tensors
+            return []
+
+        # remove OOV (out-of-vocabulary) rows in token_embd
+        if "embed_tokens.weight" in name:
+            vocab = self._create_vocab_sentencepiece()
+            tokens = vocab[0]
+            data_torch = data_torch[:len(tokens)]
+
+        # ref code in Gemma3RMSNorm
+        # output = output * (1.0 + self.weight.float())
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @Model.register("Starcoder2ForCausalLM")
 class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2

docs/build.md

Lines changed: 35 additions & 11 deletions
@@ -197,28 +197,52 @@ The following compilation options are also available to tweak performance:

 ## MUSA

-This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.

-- Using `CMake`:
+#### Download directly from Moore Threads

-  ```bash
-  cmake -B build -DGGML_MUSA=ON
-  cmake --build build --config Release
-  ```
+You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).

-  For static build:
+### Compilation

-  ```bash
+```bash
+cmake -B build -DGGML_MUSA=ON
+cmake --build build --config Release
+```
+
+#### Override Compute Capability Specifications
+
+By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
+
+```bash
+cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+```
+
+This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
+
+#### Compilation options
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
+  ```
   cmake -B build -DGGML_MUSA=ON \
     -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
   cmake --build build --config Release
   ```

-The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+### Runtime MUSA environmental variables

-The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.

-Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+```bash
+# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
+MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.

 ## HIP

examples/batched-bench/batched-bench.cpp

Lines changed: 2 additions & 2 deletions
@@ -132,7 +132,7 @@ int main(int argc, char ** argv) {

    const auto t_pp_start = ggml_time_us();

-   llama_kv_cache_clear(ctx);
+   llama_kv_self_clear(ctx);

    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
        LOG_ERR("%s: llama_decode() failed\n", __func__);
@@ -141,7 +141,7 @@

    if (is_pp_shared) {
        for (int32_t i = 1; i < pl; ++i) {
-           llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+           llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
        }
    }
examples/batched.swift/Sources/main.swift

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
}

for i in 1 ..< n_parallel {
-   llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
+   llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}

if n_parallel > 1 {

examples/cvector-generator/cvector-generator.cpp

Lines changed: 1 addition & 1 deletion
@@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
}

static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
-   llama_kv_cache_clear(ctx);
+   llama_kv_self_clear(ctx);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    const struct llama_model * model = llama_get_model(ctx);

    // clear previous kv_cache values (irrelevant for embeddings)
-   llama_kv_cache_clear(ctx);
+   llama_kv_self_clear(ctx);

    // run model
    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);

examples/gritlm/gritlm.cpp

Lines changed: 2 additions & 2 deletions
@@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
    }

    // clear previous kv_cache values (irrelevant for embeddings)
-   llama_kv_cache_clear(ctx);
+   llama_kv_self_clear(ctx);
    llama_set_embeddings(ctx, true);
    llama_set_causal_attn(ctx, false);

@@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

    llama_token eos_token = llama_vocab_eos(vocab);

-   llama_kv_cache_clear(ctx);
+   llama_kv_self_clear(ctx);
    llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);

examples/imatrix/imatrix.cpp

Lines changed: 1 addition & 1 deletion
@@ -495,7 +495,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
    const auto t_start = std::chrono::high_resolution_clock::now();

    // clear the KV cache
-   llama_kv_cache_clear(ctx);
+   llama_kv_self_clear(ctx);

    llama_batch batch = llama_batch_init(n_batch, 0, 1);

examples/infill/infill.cpp

Lines changed: 2 additions & 2 deletions
@@ -332,8 +332,8 @@ int main(int argc, char ** argv) {
            LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);

-           llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
-           llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+           llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
+           llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

            n_past -= n_discard;

examples/llama-bench/llama-bench.cpp

Lines changed: 2 additions & 2 deletions
@@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) {

        test t(inst, lmodel, ctx);

-       llama_kv_cache_clear(ctx);
+       llama_kv_self_clear(ctx);

        // cool off before the test
        if (params.delay) {
@@ -1618,7 +1618,7 @@
        }

        for (int i = 0; i < params.reps; i++) {
-           llama_kv_cache_clear(ctx);
+           llama_kv_self_clear(ctx);

            uint64_t t_start = get_time_ns();

examples/llama.android/llama/src/main/cpp/llama-android.cpp

Lines changed: 4 additions & 4 deletions
@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
    }

    batch->logits[batch->n_tokens - 1] = true;
-   llama_kv_cache_clear(context);
+   llama_kv_self_clear(context);

    const auto t_pp_start = ggml_time_us();
    if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(

    LOGi("Benchmark text generation (tg)");

-   llama_kv_cache_clear(context);
+   llama_kv_self_clear(context);
    const auto t_tg_start = ggml_time_us();
    for (i = 0; i < tg; i++) {

@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(

    const auto t_tg_end = ggml_time_us();

-   llama_kv_cache_clear(context);
+   llama_kv_self_clear(context);

    const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
    const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-   llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
+   llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
}

examples/llama.swiftui/README.md

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ Open `llama.swiftui.xcodeproj` project in Xcode and you should be able to build
a simulator or a real device.

To use the framework with a different project, the XCFramework can be added to the project by
-adding `build-ios/llama.xcframework` by dragging and dropping it into the project navigator, or
+adding `build-apple/llama.xcframework` by dragging and dropping it into the project navigator, or
by manually selecting the framework in the "Frameworks, Libraries, and Embedded Content" section
of the project settings.
