@@ -30,13 +30,14 @@ llama_kv_cache_unified::llama_kv_cache_unified(
                 bool   v_trans,
                 bool   offload,
             uint32_t   kv_size,
-            uint32_t   padding,
+            uint32_t   n_seq_max,
+            uint32_t   n_pad,
             uint32_t   n_swa,
-      llama_swa_type   swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding), n_swa(n_swa), swa_type(swa_type) {
-    GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
+      llama_swa_type   swa_type) :
+    model(model), hparams(model.hparams), v_trans(v_trans),
+    n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
 
-    this->type_k = type_k;
-    this->type_v = type_v;
+    GGML_ASSERT(kv_size % n_pad == 0);
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -129,8 +130,8 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     const size_t memory_size_k = size_k_bytes();
     const size_t memory_size_v = size_v_bytes();
 
-    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6d cells, %3d layers), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
-            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(),
+    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max,
             ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
             ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
 }
@@ -442,7 +443,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
 void llama_kv_cache_unified::defrag_sched(float thold) {
     // - do not defrag small contexts (i.e. < 2048 tokens)
     // - count the padding towards the number of used tokens
-    const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f;
+    const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f;
 
     // queue defragmentation for next llama_kv_cache_update
     if (fragmentation > thold) {
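
The fragmentation estimate above decides when a defrag pass is queued. A minimal standalone sketch of how it behaves, using made-up values for n, used and n_pad (none of these numbers come from the patch):

    // hypothetical values, for illustration only
    #include <algorithm>
    #include <cstdio>

    int main() {
        const unsigned n     = 4096; // cells currently visible to the graph
        const unsigned used  = 3000; // cells actually occupied
        const unsigned n_pad = 32;   // padding credited as "used" by the heuristic

        // same expression as in defrag_sched(): 0 for small contexts, otherwise
        // the fraction of the visible window that is neither used nor padding
        const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f;

        printf("fragmentation = %.2f\n", fragmentation); // ~0.26, so any thold below that queues a defrag
        return 0;
    }
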
@@ -558,7 +559,7 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
-    n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));
+    n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad)));
 
 #ifdef FIND_SLOT_DEBUG
     LLAMA_LOG_WARN("end:   n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
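
For reference, GGML_PAD rounds its first argument up to a multiple of the second, so the visible window n becomes cell_max() rounded up to a multiple of n_pad, clamped to [n_pad, size]. A small self-contained sketch with invented values; pad_to below only imitates GGML_PAD (which in ggml is a macro that expects a power-of-two alignment):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    static uint32_t pad_to(uint32_t x, uint32_t n) {
        return ((x + n - 1) / n) * n; // round x up to the next multiple of n
    }

    int main() {
        const uint32_t size     = 4096; // total number of cache cells
        const uint32_t n_pad    = 32;   // padding granularity
        const uint32_t cell_max = 100;  // upper bound of occupied cells (made up)

        // attend only to the first n cells instead of the whole cache,
        // never fewer than n_pad and never more than size
        const uint32_t n = std::min(size, std::max(n_pad, pad_to(cell_max, n_pad)));

        printf("n = %u\n", n); // 128
        return 0;
    }
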
@@ -567,20 +568,6 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
     return true;
 }
 
-int32_t llama_kv_cache_unified::get_n_tokens() const {
-    int32_t result = 0;
-
-    for (uint32_t i = 0; i < size; i++) {
-        result += cells[i].seq_id.size();
-    }
-
-    return result;
-}
-
-int32_t llama_kv_cache_unified::get_used_cells() const {
-    return used;
-}
-
 bool llama_kv_cache_unified::get_can_shift() const {
     return true;
 }
@@ -802,16 +789,6 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama
     }
 }
 
-llama_pos llama_kv_cache_unified::get_pos_max() const {
-    llama_pos pos_max = -1;
-
-    for (const auto & cell : cells) {
-        pos_max = std::max(pos_max, cell.pos);
-    }
-
-    return pos_max;
-}
-
 size_t llama_kv_cache_unified::total_size() const {
     size_t size = 0;
 
@@ -1501,11 +1478,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
             llama_seq_id seq_id;
             io.read_to(&seq_id, sizeof(seq_id));
 
-            // TODO: llama_kv_cache_unified should have a notion of max sequences
-            //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
-            if (seq_id < 0) {
-                //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
-                LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
+            if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
+                LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
                 return false;
             }
 
@@ -1655,17 +1629,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
            ggml_type   type_v,
                 bool   v_trans,
                 bool   offload,
-            uint32_t   kv_size,
                 bool   swa_full,
+            uint32_t   kv_size,
             uint32_t   n_seq_max,
             uint32_t   n_batch,
-            uint32_t   padding) : hparams(model.hparams) {
+            uint32_t   n_pad) : hparams(model.hparams) {
     llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
     llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
 
     const uint32_t size_base = kv_size;
 
-    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad));
 
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning
     if (swa_full) {
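
For reference, the SWA cache is sized so that each sequence can hold one full sliding window plus room for one batch in flight, rounded up to n_pad and capped at the base cache size. A rough, self-contained sketch of that arithmetic with invented parameter values (pad_to stands in for GGML_PAD):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    static uint32_t pad_to(uint32_t x, uint32_t n) {
        return ((x + n - 1) / n) * n; // round x up to the next multiple of n
    }

    int main() {
        const uint32_t size_base = 8192; // base (non-SWA) cache size, i.e. kv_size
        const uint32_t n_swa     = 1024; // sliding-window width from the model hparams
        const uint32_t n_seq_max = 4;    // maximum number of parallel sequences
        const uint32_t n_batch   = 512;  // logical batch size
        const uint32_t n_pad     = 256;  // padding granularity

        // n_swa cells per sequence plus one batch, padded and capped at size_base
        const uint32_t size_swa = std::min(size_base, pad_to(n_swa*n_seq_max + n_batch, n_pad));

        printf("size_swa = %u\n", size_swa); // min(8192, 4608) = 4608
        return 0;
    }
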
@@ -1680,14 +1654,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
 
     kv_base = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_base), type_k, type_v,
-            v_trans, offload, size_base, padding,
+            v_trans, offload, size_base, n_seq_max, n_pad,
             0, LLAMA_SWA_TYPE_NONE);
 
     LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_swa), type_k, type_v,
-            v_trans, offload, size_swa, padding,
+            v_trans, offload, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type);
 }
 
@@ -1810,18 +1784,6 @@ bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
     return res;
 }
 
-int32_t llama_kv_cache_unified_iswa::get_n_tokens() const {
-    return kv_base->get_n_tokens();
-}
-
-int32_t llama_kv_cache_unified_iswa::get_used_cells() const {
-    return kv_base->get_used_cells();
-}
-
-llama_pos llama_kv_cache_unified_iswa::get_pos_max() const {
-    return kv_base->get_pos_max();
-}
-
 bool llama_kv_cache_unified_iswa::get_can_shift() const {
     return kv_base->get_size() == kv_swa->get_size();
 }
@@ -1853,19 +1815,17 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
            ggml_type   type_k,
            ggml_type   type_v,
                 bool   offload,
-            uint32_t   kv_size) : hparams(model.hparams) {
+            uint32_t   kv_size,
+            uint32_t   n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
     const int32_t n_layer = hparams.n_layer;
 
-    LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
-            __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
+    LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n",
+            __func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
 
     head = 0;
     size = kv_size;
     used = 0;
 
-    this->type_k = type_k;
-    this->type_v = type_v;
-
     cells.clear();
     cells.resize(kv_size);
 
@@ -2203,8 +2163,8 @@ void llama_kv_cache_recurrent::commit() {
     pending.ranges.clear();
 }
 
-bool llama_kv_cache_recurrent::update(llama_context & lctx) {
-    GGML_UNUSED(lctx);
+bool llama_kv_cache_recurrent::update(llama_context & ctx) {
+    GGML_UNUSED(ctx);
     return false;
 }
 
@@ -2265,7 +2225,7 @@ bool llama_kv_cache_recurrent::find_slot(
         if (seq_id < 0 || (uint32_t) seq_id >= size) {
             // too big seq_id
             // TODO: would it be possible to resize the cache instead?
-            LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
+            LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
             return false;
         }
         if (j > 0) {
@@ -2408,29 +2368,6 @@ bool llama_kv_cache_recurrent::find_slot(
     return n >= n_seqs;
 }
 
-int32_t llama_kv_cache_recurrent::get_n_tokens() const {
-    int32_t result = 0;
-
-    for (uint32_t i = 0; i < size; i++) {
-        result += cells[i].seq_id.size();
-    }
-
-    return result;
-}
-
-int32_t llama_kv_cache_recurrent::get_used_cells() const {
-    return used;
-}
-
-llama_pos llama_kv_cache_recurrent::get_pos_max() const {
-    llama_pos pos_max = -1;
-    for (const auto & cell : cells) {
-        pos_max = std::max(pos_max, cell.pos);
-    }
-
-    return pos_max;
-}
-
 bool llama_kv_cache_recurrent::get_can_shift() const {
     return false;
 }