@@ -1823,7 +1823,8 @@ static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num
 }
 
 // given an old GGUF context and a new context that has some middle portion removed,
-// Compute prefix match token count (for Smart Cache)
+// Helper: Compute exact prefix match (tokens identical from start until first mismatch)
+// Used for statistics and debugging - does NOT consider LCS
 // Returns absolute number of matching prefix tokens
 int ComputePrefixTokens(
     const std::vector<int>& a,
@@ -1838,30 +1839,36 @@ int ComputePrefixTokens(
         if (a[i] == b[i]) {
             common++;
         } else {
-            break;
+            break; // Stop at first mismatch
         }
     }
 
-    return common; // Return count, not percentage
+    return common;
 }
 
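Editor's note: to make the distinction in the comments above concrete, here is a minimal standalone sketch (token values invented for illustration; `prefix_len` restates the loop above) showing why an exact prefix count undercounts reusable tokens when only the middle of the context changed:

// --- illustrative sketch, not part of the patch ---
#include <algorithm>
#include <cstdio>
#include <vector>

// Restates the ComputePrefixTokens loop above for a self-contained demo.
static int prefix_len(const std::vector<int>& a, const std::vector<int>& b) {
    int common = 0;
    size_t n = std::min(a.size(), b.size());
    for (size_t i = 0; i < n; ++i) {
        if (a[i] != b[i]) break;
        common++;
    }
    return common;
}

int main() {
    // Hypothetical token ids: the new prompt edits only the middle.
    std::vector<int> old_ctx = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<int> new_ctx = {1, 2, 3, 9, 9, 6, 7, 8};
    printf("prefix = %d\n", prefix_len(old_ctx, new_ctx)); // prints "prefix = 3"
    // An LCS-based check, as in compute_purge_parameters below, would also
    // credit the shared tail {6, 7, 8} that prefix matching ignores.
    return 0;
}
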
-// find and remove the middle portion from the old context from the KV. Does not fast forward after this destructive action
-void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
+// Helper: compute purge parameters without modifying anything
+// Factorized from PurgeMissingTokens - EXACT logic from concedo_experimental
+// Returns true if purge is possible and worthwhile
+// Outputs: trimstart (prefix length), purge_offset, purge_length
+bool compute_purge_parameters(
+    const std::vector<int>& current_context_tokens,
+    const std::vector<int>& new_context_tokens,
+    const int genamt,
+    const int nctx,
+    int * out_trimstart,
+    int * out_purge_offset,
+    int * out_purge_length)
 {
-    // scan from start old and new ctx, until first mismatch found, save as p0
-    // check remaining old and new ctx for longest common subseq, which needs to be at 256 tokens
-    // test: longest common subseq (LCQ) MUST start within 0 tokens from end of memory, otherwise purge fails
-    // if passed, save beginning of LCQ from old ctx as p1
-    // remove all tokens from old ctx between p0 and p1, updating both arrays and kv, then continue as normal
-
-    const int ShortfallThreshold = 200 + std::min((nctx/30),140); // dont trigger shifting if the distance between trimstart and currhead < this
-    const int SlackAllowance = 60 + std::min((nctx/60),70); // in case the end text is slightly modified, be forgiving
+    const int ShortfallThreshold = 200 + std::min((nctx/30),140);
+    const int SlackAllowance = 60 + std::min((nctx/60),70);
 
     int trimstart = 0;
     int new_tokens_len = new_context_tokens.size();
+    int curr_tokens_len = current_context_tokens.size();
     bool purgeneeded = true;
 
-    for (int i = 0; i < current_context_tokens.size(); ++i)
+    // Calculate prefix match
+    for (int i = 0; i < curr_tokens_len; ++i)
     {
         if (current_context_tokens[i] == new_context_tokens[i])
         {
@@ -1878,49 +1885,69 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec
         }
     }
 
-    // printf("\nPN: %d, NTL: %d, CCT: %d,TS:%d, diff:%d, sft:%d\n",purgeneeded,new_tokens_len,current_context_tokens.size(),trimstart,(new_tokens_len - trimstart),ShortfallThreshold);
+    *out_trimstart = trimstart;
+    *out_purge_offset = 0;
+    *out_purge_length = 0;
 
-    if (!purgeneeded || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < ShortfallThreshold)
+    // Early exit conditions (EXACT logic from concedo_experimental)
+    if (!purgeneeded || new_tokens_len < 6 || curr_tokens_len < 6 || new_tokens_len - trimstart < ShortfallThreshold)
     {
-        return; // no purge is needed
+        return false; // no purge is needed
     }
 
-    // at least this many tokens need to match, otherwise don't bother trimming
+    // Calculate LCS threshold (EXACT formula from concedo_experimental)
     const int LCSTokThreshold = std::max(std::min((new_tokens_len - trimstart) - (genamt+SlackAllowance), (int)(nctx*0.45)), ShortfallThreshold-SlackAllowance);
 
     auto curr_ctx_without_memory = std::vector<int>(current_context_tokens.begin() + trimstart, current_context_tokens.end());
     auto new_ctx_without_memory = std::vector<int>(new_context_tokens.begin() + trimstart, new_context_tokens.end());
 
     auto shared = LongestCommonSubseq(curr_ctx_without_memory, new_ctx_without_memory);
 
-    // printf("\nSharedSize: %d, LCSTokThreshold: %d, ArrPass: %d\n",shared.size(),LCSTokThreshold,ArrStartWith(new_ctx_without_memory, shared));
-    if (shared.size() > LCSTokThreshold && ArrStartWith(new_ctx_without_memory, shared)) // enough tokens in common
+    // Check if LCS is sufficient and starts at beginning (EXACT logic from concedo_experimental)
+    if (shared.size() > LCSTokThreshold && ArrStartWith(new_ctx_without_memory, shared))
     {
-        int found = ArrFindIndexOf(current_context_tokens,shared);
-        if (found>=0 && found > trimstart)
+        int found = ArrFindIndexOf(current_context_tokens, shared);
+        if (found >= 0 && found > trimstart)
         {
+            *out_purge_offset = trimstart;
+            *out_purge_length = found - trimstart;
+            return true; // Purge is possible
+        }
+    }
 
-            // extract the unwanted tokens out from context and KV
-            int diff = found - trimstart;
-            llama_memory_seq_rm(llama_get_memory(ctx), 0, trimstart, trimstart + diff);
-            llama_memory_seq_add(llama_get_memory(ctx), 0, trimstart + diff, -1, -diff);
-            if (draft_ctx)
-            {
-                llama_memory_seq_rm(llama_get_memory(draft_ctx), 0, trimstart, trimstart + diff);
-                llama_memory_seq_add(llama_get_memory(draft_ctx), 0, trimstart + diff, -1, -diff);
-            }
+    return false; // No purge possible
+}
 
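Editor's note: a worked example of the thresholds in `compute_purge_parameters`, using arbitrary but typical values (the nctx and genamt figures are assumptions; the arithmetic follows the formulas above):

// --- worked example, not part of the patch ---
// Assume nctx = 8192 and genamt = 512:
//   ShortfallThreshold = 200 + min(8192/30, 140) = 200 + 140 = 340
//   SlackAllowance     = 60  + min(8192/60, 70)  = 60  + 70  = 130
// With (new_tokens_len - trimstart) = 2000 non-prefix tokens:
//   LCSTokThreshold = max(min(2000 - (512 + 130), (int)(8192 * 0.45)), 340 - 130)
//                   = max(min(1358, 3686), 210) = 1358
// So at least 1358 of the 2000 non-prefix tokens must survive in the LCS
// before the function reports a purge as worthwhile.
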
-            for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
-            {
-                current_context_tokens[i - diff] = current_context_tokens[i];
-            }
+// find and remove the middle portion from the old context from the KV. Does not fast forward after this destructive action
+void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
+{
+    int trimstart, purge_offset, purge_length;
 
-            printf("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
+    // Use factorized helper to compute purge parameters
+    if (!compute_purge_parameters(current_context_tokens, new_context_tokens, genamt, nctx,
+                                  &trimstart, &purge_offset, &purge_length))
+    {
+        return; // No purge needed or possible
+    }
 
-            current_context_tokens.resize(current_context_tokens.size() - diff);
-        }
+    // Execute the purge (remove tokens from KV cache and array)
+    int diff = purge_length;
+    llama_memory_seq_rm(llama_get_memory(ctx), 0, purge_offset, purge_offset + diff);
+    llama_memory_seq_add(llama_get_memory(ctx), 0, purge_offset + diff, -1, -diff);
+    if (draft_ctx)
+    {
+        llama_memory_seq_rm(llama_get_memory(draft_ctx), 0, purge_offset, purge_offset + diff);
+        llama_memory_seq_add(llama_get_memory(draft_ctx), 0, purge_offset + diff, -1, -diff);
     }
 
+    for (size_t i = purge_offset + diff; i < current_context_tokens.size() - 1; i++)
+    {
+        current_context_tokens[i - diff] = current_context_tokens[i];
+    }
+
+    printf("\n[Context Shifting: Erased %d tokens at position %d]", diff, purge_offset + 1);
+
+    current_context_tokens.resize(current_context_tokens.size() - diff);
 }
 
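Editor's note: a minimal sketch of how the check/execute split composes after this refactor. The surrounding variables are placeholders; only `compute_purge_parameters` and `PurgeMissingTokens` come from the patch:

// --- illustrative sketch, not part of the patch ---
int trimstart = 0, purge_offset = 0, purge_length = 0;
if (compute_purge_parameters(current_context_tokens, new_context_tokens,
                             genamt, nctx,
                             &trimstart, &purge_offset, &purge_length))
{
    // Read-only result: purge_length tokens starting at purge_offset could
    // be dropped. The Smart Cache reuse check below uses exactly this to
    // probe reusability without touching the KV cache.
}
// PurgeMissingTokens(...) runs the same computation and then performs the
// destructive KV-cache edit, so both callers share one source of truth.
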
 static int GetBatchSize(int desiredBlasBatchSize, FileFormat in_file_format)
@@ -3851,32 +3878,77 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             const size_t vram_token_count = current_context_tokens.size();
             const size_t prompt_token_count = embd_inp.size();
 
-            g_smart_cache_metrics.total_requests++;
-
-            // Skip if prompt too small
+            // Skip if prompt too small (but save VRAM if it's large enough)
             if (prompt_token_count < static_cast<size_t>(MIN_TOKENS))
             {
+                g_smart_cache_metrics.record_skip();
+
                 if (debugmode == 1) {
-                    printf("\n[Smart Cache] Skip (prompt %zu < min %d tokens)", prompt_token_count, MIN_TOKENS);
+                    printf("\n[Smart Cache] Skip search (prompt %zu < min %d tokens)", prompt_token_count, MIN_TOKENS);
+                }
+
+                // CRITICAL: Save current VRAM context before it gets overwritten
+                // Even if new prompt is small, preserve large VRAM context for future reuse
+                if (vram_token_count >= static_cast<size_t>(MIN_TOKENS))
+                {
+                    int vram_slot_id = g_smart_cache_manager->get_vram_slot_id();
+                    if (vram_slot_id == -1) {
+                        vram_slot_id = g_smart_cache_manager->allocate_slot();
+                        g_smart_cache_manager->set_active_slot(vram_slot_id);
+                    }
+
+                    size_t kv_size = llama_state_get_size(llama_ctx_v4);
+                    g_smart_cache_manager->evict_lru_slots_to_fit(kv_size);
+
+                    size_t saved_bytes = gpttype_save_state_kv(vram_slot_id);
+                    if (saved_bytes > 0) {
+                        g_smart_cache_manager->save_to_slot(
+                            vram_slot_id,
+                            current_context_tokens,
+                            saved_bytes
+                        );
+                        g_smart_cache_metrics.record_save_to_ram();
+
+                        // CRITICAL: Release this slot from VRAM so it can be found in RAM searches
+                        // The slot is now saved to RAM and no longer represents active VRAM context
+                        g_smart_cache_manager->set_active_slot(-1);
+
+                        if (debugmode == 1) {
+                            printf(", saved to RAM slot %d (%zu MB), released from VRAM", vram_slot_id, saved_bytes / (1024*1024));
+                        }
+                    }
                 }
             }
             else
             {
-                // 1. Check VRAM context prefix match
-                int vram_prefix_count = ComputePrefixTokens(current_context_tokens, embd_inp);
+                // 1. Check if VRAM context can be reused (prefix + LCS check)
+                int trimstart, purge_offset, purge_length;
+                bool can_reuse_vram = compute_purge_parameters(
+                    current_context_tokens, embd_inp,
+                    inputs.max_length, nctx,
+                    &trimstart, &purge_offset, &purge_length
+                );
+
+                // Calculate total reusable tokens (prefix + LCS)
+                int reusable_tokens = trimstart;
+                if (can_reuse_vram && purge_length > 0) {
+                    // Can purge gap → reusable = prefix + (total - gap)
+                    reusable_tokens = current_context_tokens.size() - purge_length;
+                }
 
                 if (debugmode == 1) {
-                    printf("\n[Smart Cache] VRAM prefix=%d/%zu, min=%d",
-                        vram_prefix_count, vram_token_count, MIN_TOKENS);
+                    printf("\n[Smart Cache] VRAM reusable=%d/%zu (prefix=%d, purge=%s), min=%d",
+                        reusable_tokens, vram_token_count, trimstart,
+                        can_reuse_vram ? "YES" : "NO", MIN_TOKENS);
                 }
 
-                if (vram_prefix_count >= MIN_TOKENS)
+                if (reusable_tokens >= MIN_TOKENS)
                 {
-                    // ========== VRAM HIT ==========
-                    // Let PurgeMissingTokens + ContextFastForward handle reuse
-                    g_smart_cache_metrics.record_hit(1.0f, vram_prefix_count);
+                    // ========== VRAM REUSE ==========
+                    // Normal KoboldCpp behavior - PurgeMissingTokens + ContextFastForward handle reuse
+                    g_smart_cache_metrics.record_vram_reuse(reusable_tokens);
                     if (debugmode == 1) {
-                        printf(" → VRAM HIT");
+                        printf(" → VRAM reuse (normal behavior)");
                     }
                 }
                 else
@@ -3910,17 +3982,20 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                         g_smart_cache_metrics.record_save_to_ram();
 
                         if (debugmode == 1) {
-                            printf(", saved slot %d (%zu MB)", vram_slot_id, saved_bytes / (1024*1024));
+                            printf(", saved to RAM slot %d (%zu MB)", vram_slot_id, saved_bytes / (1024*1024));
                         }
 
+                        // Release slot from VRAM tracking (now in RAM only)
+                        g_smart_cache_manager->set_active_slot(-1);
+
                         // Clear VRAM KV cache AND token array
                         gpttype_clear_state_kv(true);
                         current_context_tokens.clear();
                     }
                 }
 
-                // 3. Search RAM slots for best match
-                int best_slot = g_smart_cache_manager->find_best_match(embd_inp, MIN_TOKENS);
+                // 3. Search RAM slots for best match (using prefix + LCS like VRAM check)
+                int best_slot = g_smart_cache_manager->find_best_match(embd_inp, MIN_TOKENS, inputs.max_length, nctx);
 
                 if (best_slot >= 0)
                 {
@@ -3929,13 +4004,19 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                     if (load_success) {
                         const std::vector<int>* slot_tokens = g_smart_cache_manager->get_slot_tokens(best_slot);
                         if (slot_tokens) {
+                            // Calculate prefix for statistics (exact match from start)
                             int prefix_count = ComputePrefixTokens(*slot_tokens, embd_inp);
                             current_context_tokens = *slot_tokens;
 
-                            g_smart_cache_metrics.record_hit(1.0f, prefix_count);
+                            // Calculate similarity percentage based on prefix only
+                            size_t min_len = std::min(slot_tokens->size(), embd_inp.size());
+                            float similarity = min_len > 0 ? (float)prefix_count / min_len : 0.0f;
+
+                            g_smart_cache_metrics.record_ram_hit(similarity, prefix_count);
 
                             if (debugmode == 1) {
-                                printf("\n[Smart Cache] → RAM HIT slot %d (prefix %d tokens)", best_slot, prefix_count);
+                                printf("\n[Smart Cache] → RAM HIT slot %d (loaded %zu tokens, prefix match %d, sim %.3f)",
+                                    best_slot, slot_tokens->size(), prefix_count, similarity);
                             }
 
                             g_smart_cache_manager->set_active_slot(best_slot);
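Editor's note: a worked example of the new similarity figure (numbers invented for illustration):

// --- worked example, not part of the patch ---
// Suppose the slot holds 5000 tokens, the incoming prompt has 6000 tokens,
// and the first 4500 tokens are identical:
//   min_len    = min(5000, 6000)    = 5000
//   similarity = 4500 / (float)5000 = 0.900
// The metric is prefix-based only: tokens that match again after a
// divergence (which the LCS-aware find_best_match may still reuse) do not
// raise it.
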
@@ -3946,9 +4027,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 else
                 {
                     // ========== RAM MISS ==========
-                    g_smart_cache_metrics.record_miss(0.0f);
+                    g_smart_cache_metrics.record_ram_miss(0.0f);
                     if (debugmode == 1) {
-                        printf("\n[Smart Cache] → RAM MISS (no slot with >= %d prefix)", MIN_TOKENS);
+                        printf("\n[Smart Cache] → RAM MISS (no slot with >= %d reusable tokens)", MIN_TOKENS);
                     }
                     // Proceed with cold prefill
                 }