Skip to content

Commit 30b040a

Browse files
author
Pento
committed
finalized clean C++ smart cache code migration and simplification
1 parent 4fd06bb commit 30b040a

File tree

4 files changed

+255
-108
lines changed

4 files changed

+255
-108
lines changed

expose.cpp

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -520,26 +520,28 @@ extern "C"
520520
int smart_cache_find_best_match(
521521
const int* prompt_tokens,
522522
size_t prompt_len,
523-
int min_tokens
523+
int min_tokens,
524+
int genamt,
525+
int nctx
524526
)
525527
{
526528
if (g_smart_cache_manager == nullptr || prompt_tokens == nullptr) {
527529
return -1;
528530
}
529531

530532
std::vector<int> prompt_vec(prompt_tokens, prompt_tokens + prompt_len);
531-
return g_smart_cache_manager->find_best_match(prompt_vec, min_tokens);
533+
return g_smart_cache_manager->find_best_match(prompt_vec, min_tokens, genamt, nctx);
532534
}
533535

534536
// Metrics
535537
void smart_cache_record_hit(float similarity, size_t tokens_saved)
536538
{
537-
g_smart_cache_metrics.record_hit(similarity, tokens_saved);
539+
g_smart_cache_metrics.record_ram_hit(similarity, tokens_saved);
538540
}
539541

540542
void smart_cache_record_miss(float similarity)
541543
{
542-
g_smart_cache_metrics.record_miss(similarity);
544+
g_smart_cache_metrics.record_ram_miss(similarity);
543545
}
544546

545547
void smart_cache_record_context_switch()
@@ -561,9 +563,11 @@ extern "C"
561563
snprintf(buffer, sizeof(buffer),
562564
"{"
563565
"\"total_requests\":%llu,"
564-
"\"cache_hits\":%llu,"
565-
"\"cache_misses\":%llu,"
566-
"\"hit_rate\":%.3f,"
566+
"\"requests_skipped\":%llu,"
567+
"\"vram_reuse\":%llu,"
568+
"\"ram_hits\":%llu,"
569+
"\"ram_misses\":%llu,"
570+
"\"ram_hit_rate\":%.3f,"
567571
"\"context_switches\":%llu,"
568572
"\"saves_to_ram\":%llu,"
569573
"\"tokens_saved\":%llu,"
@@ -573,9 +577,11 @@ extern "C"
573577
"\"total_slots\":%zu"
574578
"}",
575579
(unsigned long long)g_smart_cache_metrics.total_requests,
576-
(unsigned long long)g_smart_cache_metrics.cache_hits,
577-
(unsigned long long)g_smart_cache_metrics.cache_misses,
578-
g_smart_cache_metrics.get_hit_rate(),
580+
(unsigned long long)g_smart_cache_metrics.requests_skipped,
581+
(unsigned long long)g_smart_cache_metrics.vram_reuse,
582+
(unsigned long long)g_smart_cache_metrics.ram_hits,
583+
(unsigned long long)g_smart_cache_metrics.ram_misses,
584+
g_smart_cache_metrics.get_ram_hit_rate(),
579585
(unsigned long long)g_smart_cache_metrics.context_switches,
580586
(unsigned long long)g_smart_cache_metrics.saves_to_ram,
581587
(unsigned long long)g_smart_cache_metrics.total_saved_prefill_tokens,

gpttype_adapter.cpp

Lines changed: 139 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1823,7 +1823,8 @@ static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num
18231823
}
18241824

18251825
//given an old GGUF context and a new context that has some middle portion removed,
1826-
// Compute prefix match token count (for Smart Cache)
1826+
// Helper: Compute exact prefix match (tokens identical from start until first mismatch)
1827+
// Used for statistics and debugging - does NOT consider LCS
18271828
// Returns absolute number of matching prefix tokens
18281829
int ComputePrefixTokens(
18291830
const std::vector<int>& a,
@@ -1838,30 +1839,36 @@ int ComputePrefixTokens(
18381839
if (a[i] == b[i]) {
18391840
common++;
18401841
} else {
1841-
break;
1842+
break; // Stop at first mismatch
18421843
}
18431844
}
18441845

1845-
return common; // Return count, not percentage
1846+
return common;
18461847
}
18471848

1848-
//find and remove the middle portion from the old context from the KV. Does not fast forward after this destructive action
1849-
void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
1849+
// Helper: compute purge parameters without modifying anything
1850+
// Factorized from PurgeMissingTokens - EXACT logic from concedo_experimental
1851+
// Returns true if purge is possible and worthwhile
1852+
// Outputs: trimstart (prefix length), purge_offset, purge_length
1853+
bool compute_purge_parameters(
1854+
const std::vector<int>& current_context_tokens,
1855+
const std::vector<int>& new_context_tokens,
1856+
const int genamt,
1857+
const int nctx,
1858+
int* out_trimstart,
1859+
int* out_purge_offset,
1860+
int* out_purge_length)
18501861
{
1851-
//scan from start old and new ctx, until first mismatch found, save as p0
1852-
//check remaining old and new ctx for longest common subseq, which needs to be at 256 tokens
1853-
//test: longest common subseq (LCQ) MUST start within 0 tokens from end of memory, otherwise purge fails
1854-
//if passed, save beginning of LCQ from old ctx as p1
1855-
//remove all tokens from old ctx between p0 and p1, updating both arrays and kv, then continue as normal
1856-
1857-
const int ShortfallThreshold = 200 + std::min((nctx/30),140); //dont trigger shifting if the distance between trimstart and currhead < this
1858-
const int SlackAllowance = 60 + std::min((nctx/60),70); //in case the end text is slightly modified, be forgiving
1862+
const int ShortfallThreshold = 200 + std::min((nctx/30),140);
1863+
const int SlackAllowance = 60 + std::min((nctx/60),70);
18591864

18601865
int trimstart = 0;
18611866
int new_tokens_len = new_context_tokens.size();
1867+
int curr_tokens_len = current_context_tokens.size();
18621868
bool purgeneeded = true;
18631869

1864-
for (int i = 0; i < current_context_tokens.size(); ++i)
1870+
// Calculate prefix match
1871+
for (int i = 0; i < curr_tokens_len; ++i)
18651872
{
18661873
if (current_context_tokens[i] == new_context_tokens[i])
18671874
{
@@ -1878,49 +1885,69 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec
18781885
}
18791886
}
18801887

1881-
//printf("\nPN: %d, NTL: %d, CCT: %d,TS:%d, diff:%d, sft:%d\n",purgeneeded,new_tokens_len,current_context_tokens.size(),trimstart,(new_tokens_len - trimstart),ShortfallThreshold);
1888+
*out_trimstart = trimstart;
1889+
*out_purge_offset = 0;
1890+
*out_purge_length = 0;
18821891

1883-
if(!purgeneeded || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < ShortfallThreshold)
1892+
// Early exit conditions (EXACT logic from concedo_experimental)
1893+
if(!purgeneeded || new_tokens_len < 6 || curr_tokens_len < 6 || new_tokens_len - trimstart < ShortfallThreshold)
18841894
{
1885-
return; //no purge is needed
1895+
return false; //no purge is needed
18861896
}
18871897

1888-
//at least this many tokens need to match, otherwise don't bother trimming
1898+
// Calculate LCS threshold (EXACT formula from concedo_experimental)
18891899
const int LCSTokThreshold = std::max(std::min((new_tokens_len - trimstart) - (genamt+SlackAllowance), (int)(nctx*0.45)), ShortfallThreshold-SlackAllowance);
18901900

18911901
auto curr_ctx_without_memory = std::vector<int>(current_context_tokens.begin() + trimstart, current_context_tokens.end());
18921902
auto new_ctx_without_memory = std::vector<int>(new_context_tokens.begin() + trimstart, new_context_tokens.end());
18931903

18941904
auto shared = LongestCommonSubseq(curr_ctx_without_memory, new_ctx_without_memory);
18951905

1896-
//printf("\nSharedSize: %d, LCSTokThreshold: %d, ArrPass: %d\n",shared.size(),LCSTokThreshold,ArrStartWith(new_ctx_without_memory, shared));
1897-
if (shared.size() > LCSTokThreshold && ArrStartWith(new_ctx_without_memory, shared)) // enough tokens in common
1906+
// Check if LCS is sufficient and starts at beginning (EXACT logic from concedo_experimental)
1907+
if (shared.size() > LCSTokThreshold && ArrStartWith(new_ctx_without_memory, shared))
18981908
{
1899-
int found = ArrFindIndexOf(current_context_tokens,shared);
1900-
if(found>=0 && found > trimstart)
1909+
int found = ArrFindIndexOf(current_context_tokens, shared);
1910+
if(found >= 0 && found > trimstart)
19011911
{
1912+
*out_purge_offset = trimstart;
1913+
*out_purge_length = found - trimstart;
1914+
return true; // Purge is possible
1915+
}
1916+
}
19021917

1903-
//extract the unwanted tokens out from context and KV
1904-
int diff = found - trimstart;
1905-
llama_memory_seq_rm(llama_get_memory(ctx), 0, trimstart, trimstart + diff);
1906-
llama_memory_seq_add(llama_get_memory(ctx), 0, trimstart + diff, -1, -diff);
1907-
if(draft_ctx)
1908-
{
1909-
llama_memory_seq_rm(llama_get_memory(draft_ctx), 0, trimstart, trimstart + diff);
1910-
llama_memory_seq_add(llama_get_memory(draft_ctx), 0, trimstart + diff, -1, -diff);
1911-
}
1918+
return false; // No purge possible
1919+
}
19121920

1913-
for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
1914-
{
1915-
current_context_tokens[i - diff] = current_context_tokens[i];
1916-
}
1921+
//find and remove the middle portion from the old context from the KV. Does not fast forward after this destructive action
1922+
void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
1923+
{
1924+
int trimstart, purge_offset, purge_length;
19171925

1918-
printf("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
1926+
// Use factorized helper to compute purge parameters
1927+
if (!compute_purge_parameters(current_context_tokens, new_context_tokens, genamt, nctx,
1928+
&trimstart, &purge_offset, &purge_length))
1929+
{
1930+
return; // No purge needed or possible
1931+
}
19191932

1920-
current_context_tokens.resize(current_context_tokens.size() - diff);
1921-
}
1933+
// Execute the purge (remove tokens from KV cache and array)
1934+
int diff = purge_length;
1935+
llama_memory_seq_rm(llama_get_memory(ctx), 0, purge_offset, purge_offset + diff);
1936+
llama_memory_seq_add(llama_get_memory(ctx), 0, purge_offset + diff, -1, -diff);
1937+
if(draft_ctx)
1938+
{
1939+
llama_memory_seq_rm(llama_get_memory(draft_ctx), 0, purge_offset, purge_offset + diff);
1940+
llama_memory_seq_add(llama_get_memory(draft_ctx), 0, purge_offset + diff, -1, -diff);
19221941
}
19231942

1943+
for (size_t i = purge_offset + diff; i < current_context_tokens.size() - 1; i++)
1944+
{
1945+
current_context_tokens[i - diff] = current_context_tokens[i];
1946+
}
1947+
1948+
printf("\n[Context Shifting: Erased %d tokens at position %d]", diff, purge_offset + 1);
1949+
1950+
current_context_tokens.resize(current_context_tokens.size() - diff);
19241951
}
19251952

19261953
static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
@@ -3851,32 +3878,77 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
38513878
const size_t vram_token_count = current_context_tokens.size();
38523879
const size_t prompt_token_count = embd_inp.size();
38533880

3854-
g_smart_cache_metrics.total_requests++;
3855-
3856-
// Skip if prompt too small
3881+
// Skip if prompt too small (but save VRAM if it's large enough)
38573882
if (prompt_token_count < static_cast<size_t>(MIN_TOKENS))
38583883
{
3884+
g_smart_cache_metrics.record_skip();
3885+
38593886
if (debugmode == 1) {
3860-
printf("\n[Smart Cache] Skip (prompt %zu < min %d tokens)", prompt_token_count, MIN_TOKENS);
3887+
printf("\n[Smart Cache] Skip search (prompt %zu < min %d tokens)", prompt_token_count, MIN_TOKENS);
3888+
}
3889+
3890+
// CRITICAL: Save current VRAM context before it gets overwritten
3891+
// Even if new prompt is small, preserve large VRAM context for future reuse
3892+
if (vram_token_count >= static_cast<size_t>(MIN_TOKENS))
3893+
{
3894+
int vram_slot_id = g_smart_cache_manager->get_vram_slot_id();
3895+
if (vram_slot_id == -1) {
3896+
vram_slot_id = g_smart_cache_manager->allocate_slot();
3897+
g_smart_cache_manager->set_active_slot(vram_slot_id);
3898+
}
3899+
3900+
size_t kv_size = llama_state_get_size(llama_ctx_v4);
3901+
g_smart_cache_manager->evict_lru_slots_to_fit(kv_size);
3902+
3903+
size_t saved_bytes = gpttype_save_state_kv(vram_slot_id);
3904+
if (saved_bytes > 0) {
3905+
g_smart_cache_manager->save_to_slot(
3906+
vram_slot_id,
3907+
current_context_tokens,
3908+
saved_bytes
3909+
);
3910+
g_smart_cache_metrics.record_save_to_ram();
3911+
3912+
// CRITICAL: Release this slot from VRAM so it can be found in RAM searches
3913+
// The slot is now saved to RAM and no longer represents active VRAM context
3914+
g_smart_cache_manager->set_active_slot(-1);
3915+
3916+
if (debugmode == 1) {
3917+
printf(", saved to RAM slot %d (%zu MB), released from VRAM", vram_slot_id, saved_bytes / (1024*1024));
3918+
}
3919+
}
38613920
}
38623921
}
38633922
else
38643923
{
3865-
// 1. Check VRAM context prefix match
3866-
int vram_prefix_count = ComputePrefixTokens(current_context_tokens, embd_inp);
3924+
// 1. Check if VRAM context can be reused (prefix + LCS check)
3925+
int trimstart, purge_offset, purge_length;
3926+
bool can_reuse_vram = compute_purge_parameters(
3927+
current_context_tokens, embd_inp,
3928+
inputs.max_length, nctx,
3929+
&trimstart, &purge_offset, &purge_length
3930+
);
3931+
3932+
// Calculate total reusable tokens (prefix + LCS)
3933+
int reusable_tokens = trimstart;
3934+
if (can_reuse_vram && purge_length > 0) {
3935+
// Can purge gap → reusable = prefix + (total - gap)
3936+
reusable_tokens = current_context_tokens.size() - purge_length;
3937+
}
38673938

38683939
if (debugmode == 1) {
3869-
printf("\n[Smart Cache] VRAM prefix=%d/%zu, min=%d",
3870-
vram_prefix_count, vram_token_count, MIN_TOKENS);
3940+
printf("\n[Smart Cache] VRAM reusable=%d/%zu (prefix=%d, purge=%s), min=%d",
3941+
reusable_tokens, vram_token_count, trimstart,
3942+
can_reuse_vram ? "YES" : "NO", MIN_TOKENS);
38713943
}
38723944

3873-
if (vram_prefix_count >= MIN_TOKENS)
3945+
if (reusable_tokens >= MIN_TOKENS)
38743946
{
3875-
// ========== VRAM HIT ==========
3876-
// Let PurgeMissingTokens + ContextFastForward handle reuse
3877-
g_smart_cache_metrics.record_hit(1.0f, vram_prefix_count);
3947+
// ========== VRAM REUSE ==========
3948+
// Normal KoboldCpp behavior - PurgeMissingTokens + ContextFastForward handle reuse
3949+
g_smart_cache_metrics.record_vram_reuse(reusable_tokens);
38783950
if (debugmode == 1) {
3879-
printf(" → VRAM HIT");
3951+
printf(" → VRAM reuse (normal behavior)");
38803952
}
38813953
}
38823954
else
@@ -3910,17 +3982,20 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
39103982
g_smart_cache_metrics.record_save_to_ram();
39113983

39123984
if (debugmode == 1) {
3913-
printf(", saved slot %d (%zu MB)", vram_slot_id, saved_bytes / (1024*1024));
3985+
printf(", saved to RAM slot %d (%zu MB)", vram_slot_id, saved_bytes / (1024*1024));
39143986
}
39153987

3988+
// Release slot from VRAM tracking (now in RAM only)
3989+
g_smart_cache_manager->set_active_slot(-1);
3990+
39163991
// Clear VRAM KV cache AND token array
39173992
gpttype_clear_state_kv(true);
39183993
current_context_tokens.clear();
39193994
}
39203995
}
39213996

3922-
// 3. Search RAM slots for best match
3923-
int best_slot = g_smart_cache_manager->find_best_match(embd_inp, MIN_TOKENS);
3997+
// 3. Search RAM slots for best match (using prefix + LCS like VRAM check)
3998+
int best_slot = g_smart_cache_manager->find_best_match(embd_inp, MIN_TOKENS, inputs.max_length, nctx);
39243999

39254000
if (best_slot >= 0)
39264001
{
@@ -3929,13 +4004,19 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
39294004
if (load_success) {
39304005
const std::vector<int>* slot_tokens = g_smart_cache_manager->get_slot_tokens(best_slot);
39314006
if (slot_tokens) {
4007+
// Calculate prefix for statistics (exact match from start)
39324008
int prefix_count = ComputePrefixTokens(*slot_tokens, embd_inp);
39334009
current_context_tokens = *slot_tokens;
39344010

3935-
g_smart_cache_metrics.record_hit(1.0f, prefix_count);
4011+
// Calculate similarity percentage based on prefix only
4012+
size_t min_len = std::min(slot_tokens->size(), embd_inp.size());
4013+
float similarity = min_len > 0 ? (float)prefix_count / min_len : 0.0f;
4014+
4015+
g_smart_cache_metrics.record_ram_hit(similarity, prefix_count);
39364016

39374017
if (debugmode == 1) {
3938-
printf("\n[Smart Cache] → RAM HIT slot %d (prefix %d tokens)", best_slot, prefix_count);
4018+
printf("\n[Smart Cache] → RAM HIT slot %d (loaded %zu tokens, prefix match %d, sim %.3f)",
4019+
best_slot, slot_tokens->size(), prefix_count, similarity);
39394020
}
39404021

39414022
g_smart_cache_manager->set_active_slot(best_slot);
@@ -3946,9 +4027,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
39464027
else
39474028
{
39484029
// ========== RAM MISS ==========
3949-
g_smart_cache_metrics.record_miss(0.0f);
4030+
g_smart_cache_metrics.record_ram_miss(0.0f);
39504031
if (debugmode == 1) {
3951-
printf("\n[Smart Cache] → RAM MISS (no slot with >= %d prefix)", MIN_TOKENS);
4032+
printf("\n[Smart Cache] → RAM MISS (no slot with >= %d reusable tokens)", MIN_TOKENS);
39524033
}
39534034
// Proceed with cold prefill
39544035
}

0 commit comments

Comments
 (0)