Commit 32d32ef
Revert some bits
I made a mess while merging in Olivier's work, so it all ended up squashed into one commit in this branch. In this commit, I undo the changes that weren't intended to be part of it (e.g. in server.cpp).
1 parent eae5d97 commit 32d32ef

File tree: 1 file changed (+33, -13 lines)


examples/server/server.cpp

Lines changed: 33 additions & 13 deletions
@@ -386,7 +386,7 @@ struct server_task {
                     trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
                     trigger.value = word;
                     trigger.token = token;
-                    params.sampling.grammar_triggers.push_back(trigger);
+                    params.sampling.grammar_triggers.push_back(std::move(trigger));
                 } else {
                     SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
                     params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
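Note: the only change in this hunk is the added std::move. push_back(trigger) copies the trigger, including its std::string value, while push_back(std::move(trigger)) lets the vector steal the string's buffer. A minimal standalone illustration (not code from this patch):

#include <string>
#include <utility>
#include <vector>

struct trigger_t {
    std::string value;
};

int main() {
    std::vector<trigger_t> triggers;
    trigger_t t{"<tool_call>"};
    triggers.push_back(std::move(t)); // moves t.value's buffer instead of copying it
    return 0;
}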
@@ -751,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
                         {"name", tc.name},
                         {"arguments", tc.arguments},
                     }},
-                    {"id", tc.id},
+                    // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+                    // We only generate a random id for the ones that don't generate one by themselves
+                    // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+                    {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
                 });
             }
             message["tool_calls"] = tool_calls;
@@ -2037,6 +2040,18 @@ struct server_context {
         return ret;
     }

+    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+        for (const auto & token : tokens) {
+            if (token < 0 || token >= n_vocab) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot.reset();
         slot.id_task = task.id;
@@ -2051,6 +2066,11 @@ struct server_context {
             slot.lora = task.params.lora;
         }

+        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
+        if (!can_detokenize) {
+            send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
+            return false;
+        }
         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
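Taken together, these two hunks add a guard so a request whose prompt is supplied as raw token ids can't smuggle in an id outside the model's vocabulary (which would make detokenization read out of bounds); the request is rejected as invalid instead. A standalone illustration of the range check, with plain int32_t standing in for llama_token:

#include <cstdint>
#include <vector>

// Same range check as can_be_detokenized() above, decoupled from llama.h:
// every id must lie in [0, n_vocab).
static bool tokens_in_range(const std::vector<int32_t> & tokens, int32_t n_vocab) {
    for (const int32_t token : tokens) {
        if (token < 0 || token >= n_vocab) {
            return false;
        }
    }
    return true;
}

int main() {
    const int32_t n_vocab = 32000;                 // e.g. a LLaMA-style vocab size
    const std::vector<int32_t> ok  = {1, 15043, 31999};
    const std::vector<int32_t> bad = {1, 999999}; // out of range -> rejected
    return tokens_in_range(ok, n_vocab) && !tokens_in_range(bad, n_vocab) ? 0 : 1;
}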
@@ -2093,7 +2113,7 @@ struct server_context {
         SRV_DBG("%s", "clearing KV cache\n");

         // clear the entire KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
         clean_kv_cache = false;
     }

@@ -2635,8 +2655,8 @@ struct server_context {
             res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
             res->t_start = metrics.t_start;

-            res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
-            res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx);
+            res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
+            res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);

             res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
             res->t_prompt_processing_total = metrics.t_prompt_processing_total;
@@ -2752,7 +2772,7 @@ struct server_context {

             // Erase token cache
             const size_t n_erased = slot->cache_tokens.size();
-            llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
+            llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
             slot->cache_tokens.clear();

             auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2820,8 +2840,8 @@ struct server_context {

             SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

-            llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-            llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
+            llama_kv_self_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+            llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

             if (slot.params.cache_prompt) {
                 for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
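Beyond the rename, the two calls implement the context shift: drop n_discard cells after the first n_keep, then slide the remainder left so generation can continue. A numeric walk-through with plain integers (no llama calls; the concrete values are made up for illustration):

#include <cassert>

int main() {
    const int n_keep = 4, n_discard = 2, n_past = 10;
    // seq_rm removes cache positions [n_keep, n_keep + n_discard), i.e. [4, 6);
    // seq_add then shifts positions [n_keep + n_discard, n_past), i.e. [6, 10), by -n_discard.
    const int new_n_past = n_past - n_discard;
    assert(new_n_past == 8); // the cache now occupies positions [0, 8)
    return 0;
}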
@@ -3012,8 +3032,8 @@ struct server_context {

                 const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

-                llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
-                llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
+                llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
+                llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

                 for (size_t i = 0; i < n_match; i++) {
                     slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3051,9 +3071,9 @@ struct server_context {
             }

             // keep only the common part
-            if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+            if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                 // could not partially delete (likely using a non-Transformer model)
-                llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+                llama_kv_self_seq_rm(ctx, slot.id, -1, -1);

                 // there is no common part left
                 slot.n_past = 0;
@@ -3293,7 +3313,7 @@ struct server_context {
             slot.cache_tokens.push_back(id);
             slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);

-            llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+            llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);

             for (size_t i = 0; i < ids.size(); ++i) {
                 completion_token_output result;
