Merged
23 commits
791e60a
CUDA: faster tile FA, add oob checks, more HSs (llama/16492)
JohannesGaessler Oct 11, 2025
7847625
ggml: Correct SVE implementation in ggml_vec_dot_f16_unroll (llama/16…
sirus20x6 Oct 12, 2025
45e26a5
ggml : Fix FP16 ELU positive branch (llama/16519)
sirus20x6 Oct 12, 2025
99d0741
fix UT fault cases: count-equal, argsort, pad OPs (llama/16521)
NeoZhangJianyu Oct 12, 2025
1a08f1d
metal : add opt_step_adamw and op_sum (llama/16529)
cern1710 Oct 12, 2025
f9de4e2
CANN: Update several operators to support FP16 data format (llama/16251)
hipudding Oct 13, 2025
6dd8608
ggml : fix scalar path for computing norm (llama/16558)
ggerganov Oct 13, 2025
5ef1117
metal: add support for opt_step_sgd (llama/16539)
cern1710 Oct 13, 2025
b7c7d0c
CANN: fix CPU memory leak in CANN backend (llama/16549)
noemotiovon Oct 13, 2025
7ce6c53
ggml : fix build broken with -march=armv9-a on MacOS (llama/16520)
DamonFool Oct 13, 2025
e2b9c20
CUDA: fix numerical issues in tile FA kernel (llama/16540)
JohannesGaessler Oct 13, 2025
6839554
opencl: fix build targeting CL 2 (llama/16554)
lhez Oct 13, 2025
d98a164
metal : FA support F32 K and V and head size = 32 (llama/16531)
ggerganov Oct 13, 2025
d541d24
cuda : remove legacy copy-op pointer indirection code (llama/16485)
anavp-nvidia Oct 14, 2025
17d67ca
CUDA: add fp kernel for larger batch size MoE (llama/16512)
am17an Oct 14, 2025
360acc7
CUDA: use fastdiv + ggml_cuda_mad for mmvf (llama/16557)
am17an Oct 14, 2025
395008b
CUDA: enable FA for FP32 KV cache (llama/16546)
JohannesGaessler Oct 14, 2025
0f82a3c
vulkan: Improve build time for MSVC (llama/16545)
jeffbolznv Oct 14, 2025
c03c434
vulkan: Support FA with K/V in F32 (llama/16543)
jeffbolznv Oct 14, 2025
296f8c0
CUDA + openCL: fix bug in accessing rms_norm->src while doing fusion …
am17an Oct 14, 2025
8452144
vulkan: Add ACC_TYPE_VEC2 implementation (llama/16203)
SavicStefan Oct 14, 2025
c5d5a80
sync : ggml
ggerganov Oct 14, 2025
2eb25b1
talk-llama : sync llama.cpp
ggerganov Oct 14, 2025
117 changes: 74 additions & 43 deletions examples/talk-llama/llama-graph.cpp
@@ -261,12 +261,17 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
}
}

static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
(swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
(swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
(swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
const char * swa_type_str = "unknown";

switch (swa_type) {
case LLAMA_SWA_TYPE_NONE: swa_type_str = "LLAMA_SWA_TYPE_NONE"; break;
case LLAMA_SWA_TYPE_STANDARD: swa_type_str = "LLAMA_SWA_TYPE_STANDARD"; break;
case LLAMA_SWA_TYPE_CHUNKED: swa_type_str = "LLAMA_SWA_TYPE_CHUNKED"; break;
case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
};

LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
@@ -295,50 +300,67 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
const int64_t n_kv = ubatch->n_tokens;
const int64_t n_tokens = ubatch->n_tokens;

GGML_ASSERT(kq_mask);
GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));

float * data = (float *) kq_mask->data;

// [TAG_NO_CACHE_ISWA]
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
for (int h = 0; h < 1; ++h) {
for (int i1 = 0; i1 < n_tokens; ++i1) {
const llama_seq_id s1 = ubatch->seq_id[i1][0];
const llama_pos p1 = ubatch->pos[i1];

for (int h = 0; h < 1; ++h) {
for (int i1 = 0; i1 < n_tokens; ++i1) {
const llama_seq_id s1 = ubatch->seq_id[i1][0];
const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;

for (int i0 = 0; i0 < n_tokens; ++i0) {
float f = -INFINITY;

for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
for (int i0 = 0; i0 < n_tokens; ++i0) {
const llama_seq_id s0 = ubatch->seq_id[i0][0];
const llama_pos p0 = ubatch->pos[i0];

// mask different sequences
if (s0 != s1) {
continue; // skip different sequences
continue;
}

if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
continue; // skip future tokens for causal attention
// mask future tokens
if (cparams.causal_attn && p0 > p1) {
continue;
}

// TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
//if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
// continue; // skip masked tokens for SWA
//}

// TODO: reimplement this like in llama_kv_cache_unified
if (hparams.use_alibi) {
f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
} else {
f = 0.0f;
// apply SWA if any
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
continue;
}

data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
}
data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
}
}
};

{
GGML_ASSERT(self_kq_mask);
GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));

float * data = (float *) self_kq_mask->data;

std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);

fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);

if (debug) {
print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
}
}
if (debug) {
print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);

if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
GGML_ASSERT(self_kq_mask_swa);
GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));

float * data = (float *) self_kq_mask_swa->data;

std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);

fill_mask(data, hparams.n_swa, hparams.swa_type);

if (debug) {
print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
}
}
}

@@ -1299,12 +1321,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
k = ggml_permute(ctx0, k, 0, 2, 1, 3);
v = ggml_permute(ctx0, v, 0, 2, 1, 3);

const auto n_kv = k->ne[1];

ggml_tensor * cur;

// TODO: replace hardcoded padding with ggml-provided padding
if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
if (cparams.flash_attn && kq_b == nullptr) {
GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");

if (v_trans) {
@@ -1419,10 +1438,20 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);

// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
ggml_set_input(inp->kq_mask);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
ggml_set_input(inp->self_kq_mask);

inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
ggml_set_input(inp->self_kq_mask_swa);

inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
} else {
inp->self_kq_mask_swa = nullptr;
inp->self_kq_mask_swa_cnv = nullptr;
}

return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
}
@@ -1447,7 +1476,9 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_build_forward_expand(gf, k_cur);
ggml_build_forward_expand(gf, v_cur);

const auto & kq_mask = inp->get_kq_mask();
const bool is_swa = hparams.is_swa(il);

const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();

// [TAG_NO_CACHE_PAD]
// TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
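Note: the llama-graph.cpp change above refactors the cacheless attention-mask construction into a reusable fill routine and, when the model uses sliding-window attention (SWA), fills a second mask for the SWA layers. Below is a minimal standalone sketch of that pattern; the types and the standard-window check are simplified assumptions for illustration, not the actual llama.cpp API (the chunked and symmetric SWA variants use different window rules).

#include <cmath>
#include <cstdint>
#include <vector>

struct Token {
    int32_t seq_id; // sequence the token belongs to
    int32_t pos;    // position within that sequence
};

// Standard sliding-window check (assumption): a key is masked once it falls
// n_swa or more positions behind the query; n_swa == 0 disables the window.
static bool is_masked_swa_std(int32_t n_swa, int32_t p_key, int32_t p_query) {
    return n_swa > 0 && p_query - p_key >= n_swa;
}

// Fill an n x n additive attention mask: 0 (or an ALiBi bias) where attention
// is allowed, -INFINITY where it is masked - mirroring the fill_mask lambda.
static void fill_mask(std::vector<float> & mask, const std::vector<Token> & tok,
                      bool causal, bool use_alibi, int32_t n_swa) {
    const size_t n = tok.size();
    mask.assign(n * n, -INFINITY);

    for (size_t i1 = 0; i1 < n; ++i1) {     // query token
        for (size_t i0 = 0; i0 < n; ++i0) { // key token
            if (tok[i0].seq_id != tok[i1].seq_id) {
                continue; // tokens from different sequences never attend to each other
            }
            if (causal && tok[i0].pos > tok[i1].pos) {
                continue; // causal attention: no future tokens
            }
            if (is_masked_swa_std(n_swa, tok[i0].pos, tok[i1].pos)) {
                continue; // key is outside the sliding window
            }
            // ALiBi uses a distance-based bias in place of a plain 0
            const float bias = use_alibi ? -std::fabs(float(tok[i0].pos - tok[i1].pos)) : 0.0f;
            mask[i1 * n + i0] = bias;
        }
    }
}

In the diff, the equivalent routine runs once with n_swa = 0 to fill self_kq_mask and, if hparams.swa_type != LLAMA_SWA_TYPE_NONE, a second time with the model's window to fill self_kq_mask_swa.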
10 changes: 7 additions & 3 deletions examples/talk-llama/llama-graph.h
@@ -257,10 +257,14 @@ class llm_graph_input_attn_no_cache : public llm_graph_input_i {

void set_input(const llama_ubatch * ubatch) override;

ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }
ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch, 1, 1]
ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch, 1, 1]
// n_tokens == n_batch
ggml_tensor * self_kq_mask = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]

const llama_hparams hparams;
const llama_cparams cparams;
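The header now exposes the regular mask and the optional SWA mask through separate getters; which one a given layer reads depends on whether that layer uses sliding-window attention, which is what lets interleaved-SWA models run without a KV cache. A rough sketch of that selection, using hypothetical simplified types rather than the real ggml/llm_graph interfaces:

// Hypothetical stand-in for the two mask tensors held by the input node.
struct NoCacheAttnInput {
    const float * self_kq_mask     = nullptr; // full (per-sequence, causal) mask
    const float * self_kq_mask_swa = nullptr; // sliding-window mask, only set when SWA is used

    const float * get_kq_mask()     const { return self_kq_mask; }
    const float * get_kq_mask_swa() const { return self_kq_mask_swa; }
};

// Per-layer selection, analogous to the build_attn() change above:
// SWA layers read the SWA mask, all other layers read the regular one.
inline const float * mask_for_layer(const NoCacheAttnInput & inp, bool layer_is_swa) {
    return layer_is_swa ? inp.get_kq_mask_swa() : inp.get_kq_mask();
}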
11 changes: 5 additions & 6 deletions examples/talk-llama/llama-model.cpp
@@ -11358,8 +11358,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
}
};

struct llm_build_gemma_embedding_iswa : public llm_graph_context {
llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
struct llm_build_gemma_embedding : public llm_graph_context {
llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;

ggml_tensor * cur;
@@ -11376,8 +11376,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

// TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
auto * inp_attn = build_attn_inp_kv_iswa();
auto * inp_attn = build_attn_inp_no_cache();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -19378,7 +19377,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_WAVTOKENIZER_DEC:
//case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
case LLM_ARCH_GEMMA_EMBEDDING:
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA_MOE:
@@ -19671,7 +19670,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
} break;
case LLM_ARCH_GEMMA_EMBEDDING:
{
llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
} break;
case LLM_ARCH_STARCODER2:
{
1 change: 1 addition & 0 deletions examples/talk-llama/llama.cpp
@@ -312,6 +312,7 @@ struct llama_model * llama_model_load_from_splits(
LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
return nullptr;
}
splits.reserve(n_paths);
for (size_t i = 0; i < n_paths; ++i) {
splits.push_back(paths[i]);
}
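The one-line llama.cpp change reserves the vector's capacity up front so the push_back loop that follows allocates at most once instead of reallocating as the vector grows. A generic sketch of the same pattern (standalone illustration, not the surrounding llama.cpp code):

#include <string>
#include <vector>

// Copy n_paths C strings into a vector with a single up-front allocation.
std::vector<std::string> collect_paths(const char ** paths, size_t n_paths) {
    std::vector<std::string> splits;
    splits.reserve(n_paths);        // one allocation instead of O(log n) geometric regrowths
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]); // no reallocation inside the loop
    }
    return splits;
}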