[pull] master from ggml-org:master #191

Open · wants to merge 21 commits into base: master

Commits (21), showing changes from all commits:

b44890d
model : disable SWA for Phi models (#13676)
ggerganov May 21, 2025
797f2ac
kv-cache : simplify the interface (#13660)
ggerganov May 21, 2025
42158ae
server : fix first message identification (#13634)
doringeman May 21, 2025
0d5c742
server : Add the endpoints /api/tags and /api/chat (#13659)
R-Dson May 21, 2025
cf4cb59
ggml : add ggml_gelu_erf() (#13667)
ngxson May 21, 2025
eb0f5c2
gguf-py : display the invalid gguf type (#13687)
emmanuel-ferdman May 21, 2025
2aa777d
examples : switch retrieval to llama_encode (#13685)
CISC May 21, 2025
c76532e
convert : add qwen2vl support for unsloth merges (#13686)
antichristHater May 21, 2025
5fbfe38
server : improve error reporting (#13680)
ggerganov May 21, 2025
8e186ef
hparams : support models for which all layers use SWA (#13682)
ggerganov May 21, 2025
d643bb2
releases : build CPU backend separately (windows) (#13642)
slaren May 21, 2025
edbf42e
opencl: fix couple crashes (#12795)
linehill May 21, 2025
a4e8912
opencl: Add support for multiple devices (#12622)
linehill May 21, 2025
6b56a64
SYCL: Avoid using with SYCL-Graph for unsupported nodes (#13587)
EwanC May 22, 2025
d394a9a
sycl : Remove waits from function calls (#13702)
s-Nick May 22, 2025
5be24af
gguf-py : correct charsmap parameter typing (#13701)
CISC May 22, 2025
cc74d5b
server : pad small embedding batches (#13692)
ggerganov May 22, 2025
ab86335
common: Include torch package for s390x (#13699)
taronaeo May 22, 2025
797990c
mtmd : add ultravox audio input (#13623)
ngxson May 22, 2025
8a1d206
tts : fix n_ubatch + make WavTokenizer cache-less (#13713)
ggerganov May 22, 2025
3079e9a
release : fix windows hip release (#13707)
slaren May 22, 2025

4 changes: 4 additions & 0 deletions .editorconfig
@@ -48,3 +48,7 @@ end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset

[tools/mtmd/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset
279 changes: 151 additions & 128 deletions .github/workflows/release.yml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions README.md
@@ -580,3 +580,4 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
12 changes: 6 additions & 6 deletions common/arg.cpp
@@ -39,7 +39,7 @@
using json = nlohmann::ordered_json;

std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_SERVER,
};

@@ -1678,7 +1678,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.warmup = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@@ -2233,12 +2233,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image"}, "FILE",
"path to an image file. use with multimodal models. Specify multiple times for batching",
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples({LLAMA_EXAMPLE_MTMD}));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
@@ -2868,7 +2868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
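
The `common/arg.cpp` hunks above reuse the option-registration pattern used throughout that file: a `common_arg` bundles the accepted flag spellings, the help text, and a handler lambda, and `set_examples(...)` controls which tools expose the option. A minimal sketch of that pattern is shown below; the `--my-flag` option and the `params.my_flag` field are hypothetical and only illustrate the shape of the API.

```cpp
// Sketch of the registration pattern visible in the hunks above.
// The flag name and the params.my_flag field are invented for illustration; not part of this PR.
add_opt(common_arg(
    {"--my-flag"},                      // flag spelling(s) accepted on the command line
    "enable a hypothetical feature",    // help text printed by --help
    [](common_params & params) {        // handler invoked when the flag is parsed
        params.my_flag = true;
    }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); // tools that list the flag
```
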
2 changes: 1 addition & 1 deletion common/common.h
@@ -76,7 +76,7 @@ enum llama_example {
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
164 changes: 120 additions & 44 deletions convert_hf_to_gguf.py

Large diffs are not rendered by default.

14 changes: 13 additions & 1 deletion docs/multimodal.md
@@ -4,7 +4,9 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
- [llama-mtmd-cli](../tools/mtmd/README.md)
- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API

To enable it, can use use one of the 2 methods below:
Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.

To enable it, you can use one of the 2 methods below:

- Use `-hf` option with a supported model (see a list of pre-quantized model below)
- To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
@@ -37,6 +39,8 @@ Replaces the `(tool_name)` with the name of binary you want to use. For example,

NOTE: some models may require large context window, for example: `-c 8192`

**Vision models**:

```sh
# Gemma 3
(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
@@ -78,3 +82,11 @@ NOTE: some models may require large context window, for example: `-c 8192`
# Llama 4 Scout
(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
```

**Audio models**:

```sh
# Ultravox 0.5
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
```
12 changes: 6 additions & 6 deletions examples/retrieval/retrieval.cpp
@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
}
}

static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_self_clear(ctx);

// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_decode(ctx, batch) < 0) {
LOG_ERR("%s : failed to decode\n", __func__);
if (llama_encode(ctx, batch) < 0) {
LOG_ERR("%s : failed to encode\n", __func__);
}

for (int i = 0; i < batch.n_tokens; i++) {
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd);
batch_encode(ctx, batch, out, s, n_embd);
common_batch_clear(batch);
p += s;
s = 0;
Expand All @@ -246,7 +246,7 @@ int main(int argc, char ** argv) {

// final batch
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd);
batch_encode(ctx, batch, out, s, n_embd);

// save embeddings to chunks
for (int i = 0; i < n_chunks; i++) {
Expand All @@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
batch_add_seq(query_batch, query_tokens, 0);

std::vector<float> query_emb(n_embd, 0);
batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd);

common_batch_clear(query_batch);

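
The retrieval example above now runs its embedding batches through `llama_encode()` instead of `llama_decode()`, and the helper was renamed from `batch_decode` to `batch_encode` to match. A rough sketch of that calling pattern follows; it assumes a context created with embeddings enabled and a pooling type set, and it simplifies the batch handling and error reporting of the real `batch_encode()`.

```cpp
// Sketch of the encode-then-read-embeddings flow used by batch_encode() above.
// Assumes ctx was initialized with embeddings enabled and pooling != NONE;
// batch construction and error handling are intentionally simplified.
#include "llama.h"

#include <cstring>

static void encode_and_copy(llama_context * ctx, llama_batch & batch,
                            float * output, int n_seq, int n_embd) {
    if (llama_encode(ctx, batch) < 0) {
        return; // encoding failed
    }
    for (int s = 0; s < n_seq; ++s) {
        // pooled embedding for sequence s
        const float * emb = llama_get_embeddings_seq(ctx, s);
        if (emb != nullptr) {
            std::memcpy(output + (size_t) s * n_embd, emb, n_embd * sizeof(float));
        }
    }
}
```
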
4 changes: 2 additions & 2 deletions examples/simple-chat/simple-chat.cpp
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
auto generate = [&](const std::string & prompt) {
std::string response;

const bool is_first = llama_kv_self_used_cells(ctx) == 0;
const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;

// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
Expand All @@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
int n_ctx_used = llama_kv_self_used_cells(ctx);
int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
13 changes: 12 additions & 1 deletion ggml/include/ggml.h
@@ -528,14 +528,15 @@ extern "C" {
GGML_UNARY_OP_STEP,
GGML_UNARY_OP_TANH,
GGML_UNARY_OP_ELU,
GGML_UNARY_OP_RELU,
GGML_UNARY_OP_SIGMOID,
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_ERF,
GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_EXP,
GGML_UNARY_OP_RELU,

GGML_UNARY_OP_COUNT,
};
@@ -1024,6 +1025,16 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

// GELU using erf (error function) when possible
// some backends may fallback to approximation based on Abramowitz and Stegun formula
GGML_API struct ggml_tensor * ggml_gelu_erf(
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_gelu_quick(
struct ggml_context * ctx,
struct ggml_tensor * a);
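
For reference, the new operator computes the exact (erf-based) GELU, while the existing `ggml_gelu()` uses the common tanh approximation; both formulas appear in the `ggml-cpu/vec.h` hunks further down in this diff. Written out, under the definitions used there:

```latex
% Exact GELU, as in ggml_vec_gelu_erf_f32 / ggml_vec_gelu_erf_f16:
\mathrm{GELU}(x) = x\,\Phi(x) = \tfrac{1}{2}\,x\left(1 + \operatorname{erf}\!\left(\tfrac{x}{\sqrt{2}}\right)\right)

% Tanh approximation, as in the existing ggml_gelu_f32:
\mathrm{GELU}_{\mathrm{tanh}}(x) \approx \tfrac{1}{2}\,x\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\left(x + 0.044715\,x^{3}\right)\right)\right)
```

The two agree closely (at x = 1 they give roughly 0.8413 and 0.8412), but they are distinct operators, which is presumably why the erf variant is exposed separately rather than replacing the approximation.
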
1 change: 1 addition & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -2202,6 +2202,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;

case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
{
107 changes: 107 additions & 0 deletions ggml/src/ggml-cpu/ops.cpp
@@ -2691,6 +2691,109 @@ static void ggml_compute_forward_gelu(
}
}

// ggml_compute_forward_gelu_erf

static void ggml_compute_forward_gelu_erf_f32(
const ggml_compute_params * params,
ggml_tensor * dst) {

const ggml_tensor * src0 = dst->src[0];

assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));

const int ith = params->ith;
const int nth = params->nth;

const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);

// rows per thread
const int dr = (nr + nth - 1)/nth;

// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);

for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_gelu_erf_f32(nc,
(float *) ((char *) dst->data + i1*( dst->nb[1])),
(float *) ((char *) src0->data + i1*(src0->nb[1])));

#ifndef NDEBUG
for (int k = 0; k < nc; k++) {
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
GGML_UNUSED(x);
assert(!isnan(x));
assert(!isinf(x));
}
#endif
}
}

static void ggml_compute_forward_gelu_erf_f16(
const ggml_compute_params * params,
ggml_tensor * dst) {

const ggml_tensor * src0 = dst->src[0];

assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));

const int ith = params->ith;
const int nth = params->nth;

const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);

// rows per thread
const int dr = (nr + nth - 1)/nth;

// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);

for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_gelu_erf_f16(nc,
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
(ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));

#ifndef NDEBUG
for (int k = 0; k < nc; k++) {
const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
const float v = GGML_FP16_TO_FP32(x);
GGML_UNUSED(v);
assert(!isnan(v));
assert(!isinf(v));
}
#endif
}
}

static void ggml_compute_forward_gelu_erf(
const ggml_compute_params * params,
ggml_tensor * dst) {

const ggml_tensor * src0 = dst->src[0];

switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_gelu_erf_f32(params, dst);
} break;
case GGML_TYPE_F16:
{
ggml_compute_forward_gelu_erf_f16(params, dst);
} break;
default:
{
GGML_ABORT("fatal error");
}
}
}

// ggml_compute_forward_gelu_quick

static void ggml_compute_forward_gelu_quick_f32(
@@ -7749,6 +7852,10 @@ void ggml_compute_forward_unary(
{
ggml_compute_forward_gelu(params, dst);
} break;
case GGML_UNARY_OP_GELU_ERF:
{
ggml_compute_forward_gelu_erf(params, dst);
} break;
case GGML_UNARY_OP_GELU_QUICK:
{
ggml_compute_forward_gelu_quick(params, dst);
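
The new `ggml_compute_forward_gelu_erf_*` kernels above follow the standard ggml row-partitioning scheme: the row count is divided across threads with a ceiling division, and each thread handles a contiguous `[ir0, ir1)` slice. A small self-contained illustration of that arithmetic, with example values chosen arbitrarily:

```cpp
// Standalone illustration of the dr / ir0 / ir1 partitioning used in the kernels above.
// The nr and nth values are arbitrary examples, not taken from the PR.
#include <algorithm>
#include <cstdio>

int main() {
    const int nr  = 10;                   // total rows to process
    const int nth = 4;                    // number of worker threads
    const int dr  = (nr + nth - 1) / nth; // rows per thread, rounded up -> 3
    for (int ith = 0; ith < nth; ++ith) {
        const int ir0 = dr * ith;
        const int ir1 = std::min(ir0 + dr, nr);
        // prints [0,3) [3,6) [6,9) [9,10); the last thread picks up the remainder
        std::printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
    }
    return 0;
}
```
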
16 changes: 16 additions & 0 deletions ggml/src/ggml-cpu/vec.h
@@ -428,6 +428,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp
static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
static const float SQRT_2_INV = 0.70710678118654752440084436210484f;

inline static float ggml_gelu_f32(float x) {
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
Expand All @@ -440,6 +441,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
}
}

inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float xi = GGML_FP16_TO_FP32(x[i]);
float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
y[i] = GGML_FP32_TO_FP16(res);
}
}

#ifdef GGML_GELU_FP16
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
uint16_t t;
Expand All @@ -463,6 +472,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
}
#endif

inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
for (int i = 0; i < n; ++i) {
float xi = x[i];
y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
}
}

inline static float ggml_gelu_quick_f32(float x) {
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}