@@ -21,6 +21,7 @@ struct falcon_hparams {
     int32_t n_head    = 71;
     int32_t n_head_kv = 1;
     int32_t n_layer   = 32;
+    int32_t version   = 7; // 7 for Falcon-7B, 40 for Falcon-40B
     int32_t ftype     = 1;
 };

@@ -87,8 +88,14 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
         fin.read((char *) &hparams.n_head,    sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_head_kv, sizeof(hparams.n_head_kv));
         fin.read((char *) &hparams.n_layer,   sizeof(hparams.n_layer));
+        fin.read((char *) &hparams.version,   sizeof(hparams.version));
         fin.read((char *) &hparams.ftype,     sizeof(hparams.ftype));

+        if (hparams.version != 7 && hparams.version != 40) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad Falcon version: %d)\n", __func__, fname.c_str(), hparams.version);
+            return false;
+        }
+
         const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
@@ -162,7 +169,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             n_layer *
                 (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // input_layernorm_b

-        if (n_head_kv > 1) { // Falcon-40B
+        if (hparams.version == 40) { // Falcon-40B
             ctx_size +=
                 n_layer *
                     (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
@@ -245,7 +252,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             layer.input_layernorm_b =
                 ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-            if (n_head_kv > 1) { // Falcon-40B
+            if (hparams.version == 40) { // for Falcon-40B only
                 layer.attention_norm =
                     ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
                 layer.attention_norm_b =
@@ -261,21 +268,23 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             layer.ffn_down = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);

             // map by name
-            // Falcon-7B:
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".input_layernorm.weight"] = layer.input_layernorm;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".input_layernorm.bias"] = layer.input_layernorm_b;
-
-            // Falcon-40B:
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_mlp.weight"] = layer.input_layernorm;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_mlp.bias"] = layer.input_layernorm_b;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_attn.weight"] = layer.attention_norm;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_attn.bias"] = layer.attention_norm_b;
+            if (hparams.version == 40) {
+                // Falcon-40B:
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_mlp.weight"] = layer.input_layernorm;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_mlp.bias"] = layer.input_layernorm_b;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_attn.weight"] = layer.attention_norm;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_attn.bias"] = layer.attention_norm_b;
+            } else {
+                // Falcon-7B:
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".input_layernorm.weight"] = layer.input_layernorm;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".input_layernorm.bias"] = layer.input_layernorm_b;
+            }

             model.tensors["transformer.h." + std::to_string(i) +
                           ".self_attention.query_key_value.weight"] =
@@ -346,6 +355,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             }

             auto tensor = model.tensors[name.data()];
+            fprintf(stderr, "LOOKING AT %s\n", name.data());
             if (ggml_nelements(tensor) != nelements) {
                 fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                 return false;
@@ -415,6 +425,7 @@ bool falcon_eval(
     const int n_head    = hparams.n_head;
     const int n_head_kv = hparams.n_head_kv;
     const int n_vocab   = hparams.n_vocab;
+    const int version   = hparams.version;
     const size_t head_dim = n_embd / n_head;

     static size_t buf_size = 256u*1024*1024;
@@ -477,7 +488,7 @@ bool falcon_eval(
                     layernorm_output),
                 ggml_repeat(ctx0, model.layers[il].input_layernorm_b, layernorm_output));

-        if (n_head_kv > 1) { // Falcon-40B only
+        if (version == 40) { // Falcon-40B only
            cur = ggml_norm(ctx0, inpL);

            cur = ggml_add(ctx0,
@@ -493,25 +504,40 @@ bool falcon_eval(
         // compute QKV
         cur = ggml_mul_mat(ctx0, model.layers[il].query_key_value, cur);

-        struct ggml_tensor * Qcur = ggml_view_4d(
-            ctx0, cur, head_dim, n_head / n_head_kv, n_head_kv, N,
+        // Below is the "qkv" view which splits up QKV into kv groups,
+        // each group containing n_head / n_head_kv query heads,
+        // one key head and one value head (hence + 2). We don't really
+        // need this view as we access Q,K,V through cur directly by
+        // applying offsets and strides.
+
+        /* struct ggml_tensor* qkv = ggml_view_4d(
+            ctx0, cur, head_dim, n_head / n_head_kv + 2, n_head_kv, N,
             head_dim * sizeof_wtype,
             head_dim * (n_head / n_head_kv + 2) * sizeof_wtype,
             head_dim * (n_head / n_head_kv + 2) * n_head_kv * sizeof_wtype,
-            0);
+            0);*/
+
+        // Note that the strides for Kcur, Vcur are set up so that the
+        // resulting views are misaligned with the tensor's storage
+        // (by applying the K/V offset we shift the tensor's original
+        // view to stick out behind the viewed QKV tensor's allocated
+        // memory, so to say). This is ok because no actual accesses
+        // happen to that out-of-range memory, but it can require some
+        // trickery when trying to accurately dump these views for
+        // debugging.

         struct ggml_tensor * Kcur = ggml_view_4d(
-            ctx0, cur, head_dim, 1, N, n_head_kv,
+            ctx0, cur, head_dim, 1, n_head_kv, N,
             head_dim * sizeof_wtype,
             head_dim * (n_head / n_head_kv + 2) * sizeof_wtype,
-            head_dim * (n_head / n_head_kv + 2) * N * sizeof_wtype,
+            head_dim * (n_head / n_head_kv + 2) * n_head_kv * sizeof_wtype,
             head_dim * (n_head / n_head_kv) * sizeof_wtype);

         struct ggml_tensor * Vcur = ggml_view_4d(
-            ctx0, cur, head_dim, 1, N, n_head_kv,
+            ctx0, cur, head_dim, 1, n_head_kv, N,
             head_dim * sizeof_wtype,
             head_dim * (n_head / n_head_kv + 2) * sizeof_wtype,
-            head_dim * (n_head / n_head_kv + 2) * N * sizeof_wtype,
+            head_dim * (n_head / n_head_kv + 2) * n_head_kv * sizeof_wtype,
             head_dim * (n_head / n_head_kv + 1) * sizeof_wtype);

         // TODO: The crazy piecewise copying below works (well, until GGML_MAX_NODES is hit),
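Note (illustration, not part of the patch): the offsets and strides above index into a fused QKV row laid out as n_head_kv groups of (n_head / n_head_kv + 2) heads of head_dim elements each, where the last two head slots of every group hold that group's key and value. A standalone sketch that prints the resulting offsets for the published Falcon shapes (71 query heads / 1 KV head for 7B, 128 query heads / 8 KV heads for 40B, head_dim 64); the print_layout helper and the F32 element size are assumptions made only for this example:

// qkv_layout.cpp - standalone illustration of the fused QKV layout sliced above.
// Build: g++ -std=c++17 qkv_layout.cpp -o qkv_layout
#include <cstddef>
#include <cstdio>

static void print_layout(const char * name, int n_head, int n_head_kv, int head_dim,
                         size_t sizeof_wtype) {
    const int q_per_group = n_head / n_head_kv;          // query heads per kv group
    const int group_heads = q_per_group + 2;             // + 1 key head + 1 value head
    const size_t group_bytes = (size_t) head_dim * group_heads * sizeof_wtype;
    const size_t row_bytes   = group_bytes * n_head_kv;  // one token's fused QKV row

    printf("%s: %d kv group(s) x (%d q + 1 k + 1 v) heads x %d dims\n",
           name, n_head_kv, q_per_group, head_dim);
    printf("  per-token row size: %zu bytes\n", row_bytes);
    // Offsets of K and V inside each group; these correspond to the last
    // view offsets passed to ggml_view_4d for Kcur and Vcur in the patch.
    printf("  K offset in group : %zu bytes\n", (size_t) head_dim * q_per_group * sizeof_wtype);
    printf("  V offset in group : %zu bytes\n", (size_t) head_dim * (q_per_group + 1) * sizeof_wtype);
}

int main() {
    const size_t sizeof_wtype = sizeof(float); // assuming F32 activations
    print_layout("Falcon-7B ", 71, 1, 64, sizeof_wtype);
    print_layout("Falcon-40B", 128, 8, 64, sizeof_wtype);
    return 0;
}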
@@ -540,7 +566,7 @@ bool falcon_eval(
                     Q->nb[2] * i);

                 struct ggml_tensor * src = ggml_view_1d(
-                    ctx0, Qcur, head_dim, src_offset);
+                    ctx0, cur, head_dim, src_offset);

                 struct ggml_tensor * dst = ggml_view_1d(
                     ctx0, Q, head_dim, dst_offset);
@@ -552,7 +578,9 @@ bool falcon_eval(

         // using mode = 2 for neox mode
         Q = ggml_rope_inplace(ctx0, Q, n_past, head_dim, 2);
+        Kcur = ggml_permute(ctx0, Kcur, 0, 1, 3, 2);
         Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2);
+        Kcur = ggml_permute(ctx0, Kcur, 0, 1, 3, 2);

         // store key and value to memory
         {
@@ -583,14 +611,7 @@ bool falcon_eval(

         // K * Q

-        // TODO Unfortunately this ggml_repeat does not do what we need it to do:
-        // [ K1, K2 ] will be broadcast into [ [K1, K2], [K1, K2] ], while we actually
-        // need them to become [ [K1, K1], [K2, K2] ] ... And I suppose there will be same
-        // problem with V below as well.
-        // Here too perhaps GGML conversion could do some preprocessing to obtain
-        // a more GGML-friendly memory format.
-
-        K = ggml_cont(ctx0, ggml_repeat(ctx0, K, repeat_dummy));
+        K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy));
         Q = ggml_permute(ctx0, Q, 0, 2, 1, 3);

         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
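Note (illustration, not part of the patch): ggml_repeat2 replaces ggml_repeat here because of the broadcast order described in the comment this hunk removes. With KV heads K1, K2 and two query heads per group, a plain tiled repeat yields K1, K2, K1, K2, while the attention needs each KV head duplicated in place, K1, K1, K2, K2, so it lines up with its own group of query heads. A toy sketch of the two orderings (plain C++, not the ggml implementation):

// repeat_order.cpp - toy illustration of tiled vs per-group repetition of KV heads.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::vector<std::string> kv_heads = {"K1", "K2"}; // n_head_kv = 2
    const int q_per_group = 2;                              // n_head / n_head_kv

    std::vector<std::string> tiled;      // what a plain tiled repeat produces: K1 K2 K1 K2
    std::vector<std::string> per_group;  // what the attention needs:           K1 K1 K2 K2

    for (int r = 0; r < q_per_group; ++r) {
        for (const auto & k : kv_heads) tiled.push_back(k);
    }
    for (const auto & k : kv_heads) {
        for (int r = 0; r < q_per_group; ++r) per_group.push_back(k);
    }

    printf("tiled    :"); for (const auto & s : tiled)     printf(" %s", s.c_str()); printf("\n");
    printf("per group:"); for (const auto & s : per_group) printf(" %s", s.c_str()); printf("\n");
    return 0;
}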
@@ -611,17 +632,17 @@ bool falcon_eval(
         // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
         struct ggml_tensor * V = ggml_permute(
             ctx0,
-            ggml_reshape_4d(
+            ggml_reshape_3d(
                 ctx0,
                 ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim,
                              il * n_ctx *
                                  ggml_element_size(model.memory_v) *
                                  n_head_kv *
                                  head_dim),
-                head_dim, 1, n_head_kv, n_past + N),
-            0, 3, 2, 1);
+                head_dim, n_head_kv, n_past + N),
+            0, 2, 1, 3);

-        V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat(ctx0, V, repeat_dummy)));
+        V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy)));

         // KQV = transpose(V) * KQ_soft_max
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);