@@ -386,7 +386,7 @@ struct server_task {
                     trigger.type  = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
                     trigger.value = word;
                     trigger.token = token;
-                    params.sampling.grammar_triggers.push_back(trigger);
+                    params.sampling.grammar_triggers.push_back(std::move(trigger));
                 } else {
                     SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
                     params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
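
Note on the std::move above: the trigger carries a std::string payload, so
moving it into the vector transfers the string's buffer instead of copying it.
A minimal sketch of the pattern, with a hypothetical trigger_t standing in for
the real common_grammar_trigger struct:

    #include <string>
    #include <utility>
    #include <vector>

    struct trigger_t {
        int         type;
        std::string value; // the expensive-to-copy member
        int         token;
    };

    int main() {
        std::vector<trigger_t> triggers;
        trigger_t t{0, "<tool_call>", 42};
        triggers.push_back(std::move(t)); // steals t's buffer; no string copy
    }
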
@@ -751,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
                    {"name",      tc.name},
                    {"arguments", tc.arguments},
                }},
-               {"id", tc.id},
+               // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+               // We only generate a random id for the ones that don't generate one by themselves
+               // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+               {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
            });
        }
        message["tool_calls"] = tool_calls;
@@ -2037,6 +2040,18 @@ struct server_context {
        return ret;
    }

+    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+        for (const auto & token : tokens) {
+            if (token < 0 || token >= n_vocab) {
+                return false;
+            }
+        }
+        return true;
+    }
+
    bool launch_slot_with_task(server_slot & slot, const server_task & task) {
        slot.reset();
        slot.id_task = task.id;
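
can_be_detokenized() is a plain range check: any id outside [0, n_vocab)
would send the detokenizer out of bounds later. A hedged usage sketch (ctx
assumed to be a valid llama_context pointer):

    // Illustrative only: the last id is far beyond any real vocab size.
    std::vector<llama_token> tokens = { 1, 2, 999999999 };
    if (!can_be_detokenized(ctx, tokens)) {
        // reject the request before the tokens ever reach a batch
    }
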
@@ -2051,6 +2066,11 @@ struct server_context {
            slot.lora = task.params.lora;
        }

+        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
+        if (!can_detokenize) {
+            send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
+            return false;
+        }
        SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

        if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -2093,7 +2113,7 @@ struct server_context {
        SRV_DBG("%s", "clearing KV cache\n");

        // clear the entire KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        clean_kv_cache = false;
    }

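This and the following hunks track the upstream rename of the
llama_kv_cache_* / llama_get_kv_cache_* entry points to the llama_kv_self_*
family; behavior is unchanged. If code has to build against both API
generations, a thin shim keeps call sites uniform. A hedged sketch, where
LLAMA_KV_SELF_API is a hypothetical feature macro, not something llama.h
actually defines:

    // Compatibility shim sketch: pick the new or old clear function at
    // compile time behind one local name.
    #ifdef LLAMA_KV_SELF_API
    static inline void kv_clear(llama_context * ctx) { llama_kv_self_clear(ctx); }
    #else
    static inline void kv_clear(llama_context * ctx) { llama_kv_cache_clear(ctx); }
    #endif
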
@@ -2635,8 +2655,8 @@ struct server_context {
            res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
            res->t_start          = metrics.t_start;

-           res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
-           res->kv_cache_used_cells   = llama_get_kv_cache_used_cells(ctx);
+           res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
+           res->kv_cache_used_cells   = llama_kv_self_used_cells(ctx);

            res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
            res->t_prompt_processing_total       = metrics.t_prompt_processing_total;
@@ -2752,7 +2772,7 @@ struct server_context {

            // Erase token cache
            const size_t n_erased = slot->cache_tokens.size();
-           llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
+           llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
            slot->cache_tokens.clear();

            auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2820,8 +2840,8 @@ struct server_context {

        SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

-       llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-       llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
+       llama_kv_self_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+       llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

        if (slot.params.cache_prompt) {
            for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
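
A worked example of the context shift above, with illustrative numbers
(slot_id assumed): for n_keep = 4, n_discard = 2, n_past = 10, seq_rm frees
cells [4, 6) and seq_add slides [6, 10) left by 2, so positions 6..9 become
4..7 and generation continues without re-evaluating the kept prefix.

    llama_kv_self_seq_rm (ctx, slot_id, /*p0=*/4, /*p1=*/6);                 // drop [4, 6)
    llama_kv_self_seq_add(ctx, slot_id, /*p0=*/6, /*p1=*/10, /*delta=*/-2);  // shift left by 2
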
@@ -3012,8 +3032,8 @@ struct server_context {

                        const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

-                       llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
-                       llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
+                       llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
+                       llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

                        for (size_t i = 0; i < n_match; i++) {
                            slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
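
Illustrative numbers for the cache-reuse shift above (slot_id assumed): with
head_p = 100 (where the reused tokens should sit), head_c = 130 (where they
currently sit) and n_match = 20, kv_shift = 100 - 130 = -30; seq_rm frees the
stale gap [100, 130) and seq_add moves the matching cells [130, 150) back by 30.

    llama_kv_self_seq_rm (ctx, slot_id, /*p0=*/100, /*p1=*/130);                // free the gap
    llama_kv_self_seq_add(ctx, slot_id, /*p0=*/130, /*p1=*/150, /*delta=*/-30); // slide back
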
@@ -3051,9 +3071,9 @@ struct server_context {
                }

                // keep only the common part
-               if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+               if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                    // could not partially delete (likely using a non-Transformer model)
-                   llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+                   llama_kv_self_seq_rm(ctx, slot.id, -1, -1);

                    // there is no common part left
                    slot.n_past = 0;
@@ -3293,7 +3313,7 @@ struct server_context {
                    slot.cache_tokens.push_back(id);
                    slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);

-                   llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+                   llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);

                    for (size_t i = 0; i < ids.size(); ++i) {
                        completion_token_output result;