
Commit 3bc786b

Version that exactly reproduces the outputs of the Python implementation for Falcon-7B and Falcon-40B mini-models; tested with three different n_head_kv configs of the latter. It still includes the same poor man's implementation of extracting query heads from fused_qkv, which probably won't scale to a real 40B model.
1 parent 3352043 commit 3bc786b
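The "poor man's" query-head extraction mentioned in the message works against the fused QKV layout that the main.cpp changes below take apart with views: per token, the output of the query_key_value matmul is n_head_kv groups, each holding n_head / n_head_kv query heads followed by one key head and one value head (the "+ 2" in the strides). As a rough sketch of that offset arithmetic only (the helper function and the sample numbers are illustrative, not code from this commit):

    #include <cstdio>
    #include <cstddef>

    // Element offset of query head h inside one token's fused QKV row,
    // assuming the layout [ q0 .. q(n_head/n_head_kv - 1) | k | v ] per kv group.
    static size_t query_head_offset(size_t head_dim, size_t n_head, size_t n_head_kv, size_t h) {
        const size_t q_per_group = n_head / n_head_kv;
        const size_t group_width = head_dim * (q_per_group + 2); // query heads + k + v
        return (h / q_per_group) * group_width + (h % q_per_group) * head_dim;
    }

    int main() {
        // Falcon-7B-style numbers (n_head = 71, n_head_kv = 1, head_dim assumed to be 64):
        printf("q3 starts at element %zu\n", query_head_offset(64, 71, 1, 3)); // 192
        // Within each group, k sits at head_dim * (n_head / n_head_kv) and v one head later,
        // matching the offsets used by the Kcur / Vcur views in the diff below.
        return 0;
    }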

2 files changed: 60 additions & 38 deletions

examples/falcon/convert-hf-to-ggml.py

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ def bytes_to_unicode():
 fout.write(struct.pack("i", hparams["n_head"]))
 fout.write(struct.pack("i", hparams["n_head_kv"] if "n_head_kv" in hparams else 1))
 fout.write(struct.pack("i", hparams["n_layer"]))
+fout.write(struct.pack("i", 40 if "n_head_kv" in hparams else 7))
 fout.write(struct.pack("i", ftype))

 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

examples/falcon/main.cpp

Lines changed: 59 additions & 38 deletions
@@ -21,6 +21,7 @@ struct falcon_hparams {
     int32_t n_head = 71;
     int32_t n_head_kv = 1;
     int32_t n_layer = 32;
+    int32_t version = 7; // 7 for Falcon-7B, 40 for Falcon-40B
     int32_t ftype = 1;
 };

@@ -87,8 +88,14 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
         fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_head_kv, sizeof(hparams.n_head_kv));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+        fin.read((char *) &hparams.version, sizeof(hparams.version));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

+        if (hparams.version != 7 && hparams.version != 40) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad Falcon version: %d)\n", __func__, fname.c_str(), hparams.version);
+            return false;
+        }
+
         const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
@@ -162,7 +169,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             n_layer *
             (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // input_layernorm_b

-        if (n_head_kv > 1) { // Falcon-40B
+        if (hparams.version == 40) { // Falcon-40B
             ctx_size +=
                 n_layer *
                 (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
@@ -245,7 +252,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             layer.input_layernorm_b =
                 ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-            if (n_head_kv > 1) { // Falcon-40B
+            if (hparams.version == 40) { // for Falcon-40B only
                 layer.attention_norm =
                     ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
                 layer.attention_norm_b =
@@ -261,21 +268,23 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             layer.ffn_down = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);

             // map by name
-            // Falcon-7B:
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".input_layernorm.weight"] = layer.input_layernorm;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".input_layernorm.bias"] = layer.input_layernorm_b;
-
-            // Falcon-40B:
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_mlp.weight"] = layer.input_layernorm;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_mlp.bias"] = layer.input_layernorm_b;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_attn.weight"] = layer.attention_norm;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_attn.bias"] = layer.attention_norm_b;
+            if (hparams.version == 40) {
+                // Falcon-40B:
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_mlp.weight"] = layer.input_layernorm;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_mlp.bias"] = layer.input_layernorm_b;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_attn.weight"] = layer.attention_norm;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_attn.bias"] = layer.attention_norm_b;
+            } else {
+                // Falcon-7B:
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".input_layernorm.weight"] = layer.input_layernorm;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".input_layernorm.bias"] = layer.input_layernorm_b;
+            }

             model.tensors["transformer.h." + std::to_string(i) +
                           ".self_attention.query_key_value.weight"] =
@@ -346,6 +355,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             }

             auto tensor = model.tensors[name.data()];
+            fprintf(stderr, "LOOKING AT %s\n", name.data());
             if (ggml_nelements(tensor) != nelements) {
                 fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                 return false;
@@ -415,6 +425,7 @@ bool falcon_eval(
     const int n_head = hparams.n_head;
     const int n_head_kv = hparams.n_head_kv;
     const int n_vocab = hparams.n_vocab;
+    const int version = hparams.version;
     const size_t head_dim = n_embd / n_head;

     static size_t buf_size = 256u*1024*1024;
@@ -477,7 +488,7 @@ bool falcon_eval(
                 layernorm_output),
             ggml_repeat(ctx0, model.layers[il].input_layernorm_b, layernorm_output));

-        if (n_head_kv > 1) { // Falcon-40B only
+        if (version == 40) { // Falcon-40B only
            cur = ggml_norm(ctx0, inpL);

            cur = ggml_add(ctx0,
@@ -493,25 +504,40 @@ bool falcon_eval(
        // compute QKV
        cur = ggml_mul_mat(ctx0, model.layers[il].query_key_value, cur);

-       struct ggml_tensor* Qcur = ggml_view_4d(
-           ctx0, cur, head_dim, n_head / n_head_kv, n_head_kv, N,
+       // Below is the "qkv" view which splits up QKV into kv groups,
+       // each group containing n_head / n_head_kv query heads,
+       // one key head and one value head (hence + 2). We don't really
+       // need this view as we access Q,K,V through cur directly by
+       // applying offsets and strides.
+
+       /*struct ggml_tensor* qkv = ggml_view_4d(
+           ctx0, cur, head_dim, n_head / n_head_kv + 2, n_head_kv, N,
            head_dim * sizeof_wtype,
            head_dim * (n_head / n_head_kv + 2) * sizeof_wtype,
            head_dim * (n_head / n_head_kv + 2) * n_head_kv * sizeof_wtype,
-           0);
+           0);*/
+
+       // Note that the strides for Kcur, Vcur are set up so that the
+       // resulting views are misaligned with the tensor's storage
+       // (by applying the K/V offset we shift the tensor's original
+       // view to stick out behind the viewed QKV tensor's allocated
+       // memory, so to say). This is ok because no actual accesses
+       // happen to that out-of-range memory, but it can require some
+       // trickery when trying to accurately dump these views for
+       // debugging.

        struct ggml_tensor* Kcur = ggml_view_4d(
-           ctx0, cur, head_dim, 1, N, n_head_kv,
+           ctx0, cur, head_dim, 1, n_head_kv, N,
            head_dim * sizeof_wtype,
            head_dim * (n_head / n_head_kv + 2) * sizeof_wtype,
-           head_dim * (n_head / n_head_kv + 2) * N * sizeof_wtype,
+           head_dim * (n_head / n_head_kv + 2) * n_head_kv * sizeof_wtype,
            head_dim * (n_head / n_head_kv) * sizeof_wtype);

        struct ggml_tensor* Vcur = ggml_view_4d(
-           ctx0, cur, head_dim, 1, N, n_head_kv,
+           ctx0, cur, head_dim, 1, n_head_kv, N,
            head_dim * sizeof_wtype,
            head_dim * (n_head / n_head_kv + 2) * sizeof_wtype,
-           head_dim * (n_head / n_head_kv + 2) * N * sizeof_wtype,
+           head_dim * (n_head / n_head_kv + 2) * n_head_kv * sizeof_wtype,
            head_dim * (n_head / n_head_kv + 1) * sizeof_wtype);

        // TODO: The crazy piecewise copying below works (well, until GGML_MAX_NODES is hit),
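To make the "misaligned view" comment in the hunk above concrete, here is a small worked example with made-up toy dimensions (head_dim = 64, n_head = 8, n_head_kv = 2, N = 1; these numbers and names are illustrative, not from any Falcon config or from this commit):

    #include <cstdio>
    #include <cstddef>

    int main() {
        // toy dimensions, purely illustrative
        const size_t head_dim = 64, n_head = 8, n_head_kv = 2, N = 1;

        const size_t group_width = (n_head / n_head_kv + 2) * head_dim; // q heads + k + v = 384
        const size_t row_width   = group_width * n_head_kv;             // elements of cur per token = 768

        // The Kcur view starts at the key head of the first group ...
        const size_t k_offset = (n_head / n_head_kv) * head_dim;        // 256
        // ... so "offset + full row stride x N", the footprint a naive dump would assume,
        // ends past cur's 768-element buffer:
        const size_t nominal_end = k_offset + N * row_width;            // 1024
        // but one past the last element the graph actually reads through the view stays inside:
        const size_t read_end = (N - 1) * row_width + (n_head_kv - 1) * group_width
                              + k_offset + head_dim;                    // 704
        printf("cur: %zu elems, nominal Kcur end: %zu, actual read end: %zu\n",
               N * row_width, nominal_end, read_end);
        return 0;
    }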
@@ -540,7 +566,7 @@ bool falcon_eval(
                    Q->nb[2] * i);

                struct ggml_tensor* src = ggml_view_1d(
-                   ctx0, Qcur, head_dim, src_offset);
+                   ctx0, cur, head_dim, src_offset);

                struct ggml_tensor* dst = ggml_view_1d(
                    ctx0, Q, head_dim, dst_offset);
@@ -552,7 +578,9 @@ bool falcon_eval(

        // using mode = 2 for neox mode
        Q = ggml_rope_inplace(ctx0, Q, n_past, head_dim, 2);
+       Kcur = ggml_permute(ctx0, Kcur, 0, 1, 3, 2);
        Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2);
+       Kcur = ggml_permute(ctx0, Kcur, 0, 1, 3, 2);

        // store key and value to memory
        {
@@ -583,14 +611,7 @@ bool falcon_eval(

        // K * Q

-       // TODO Unfortunately this ggml_repeat does not do what we need it to do:
-       // [ K1, K2 ] will be broadcast into [ [K1, K2], [K1, K2] ], while we actually
-       // need them to become [ [K1, K1], [K2, K2] ] ... And I suppose there will be same
-       // problem with V below as well.
-       // Here too perhaps GGML conversion could do some preprocessing to obtain
-       // a more GGML-friendly memory format.
-
-       K = ggml_cont(ctx0, ggml_repeat(ctx0, K, repeat_dummy));
+       K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy));
        Q = ggml_permute(ctx0, Q, 0, 2, 1, 3);

        struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
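The ggml_repeat to ggml_repeat2 switch here (and for V in the next hunk) addresses the broadcast-order problem described in the comment this hunk removes: tiling the kv heads as K1 K2 K1 K2 versus repeating each one in place as K1 K1 K2 K2, so that every query head in a group meets its own kv head. A minimal sketch of the two orderings with plain loops (illustrative only; this is not how ggml_repeat2 itself is implemented):

    #include <cstdio>

    int main() {
        // two kv heads (scalars stand in for whole heads) expanded to match
        // four query heads, i.e. two query heads per kv group
        const int kv[2] = {1, 2};
        int tiled[4], blocked[4];

        for (int i = 0; i < 4; ++i) {
            tiled[i]   = kv[i % 2]; // ggml_repeat-style broadcast: K1 K2 K1 K2
            blocked[i] = kv[i / 2]; // what the attention needs:    K1 K1 K2 K2
        }

        printf("tiled:   %d %d %d %d\n", tiled[0], tiled[1], tiled[2], tiled[3]);
        printf("blocked: %d %d %d %d\n", blocked[0], blocked[1], blocked[2], blocked[3]);
        return 0;
    }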
@@ -611,17 +632,17 @@ bool falcon_eval(
        // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
        struct ggml_tensor* V = ggml_permute(
            ctx0,
-           ggml_reshape_4d(
+           ggml_reshape_3d(
                ctx0,
                ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim,
                             il * n_ctx *
                                 ggml_element_size(model.memory_v) *
                                 n_head_kv *
                                 head_dim),
-               head_dim, 1, n_head_kv, n_past + N),
-           0, 3, 2, 1);
+               head_dim, n_head_kv, n_past + N),
+           0, 2, 1, 3);

-       V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat(ctx0, V, repeat_dummy)));
+       V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy)));

        // KQV = transpose(V) * KQ_soft_max
        struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
