Commit cde3833

tool-call: allow --chat-template chatml w/ --jinja, default to chatml upon parsing issue, avoid double bos (#11616)
* tool-call: allow `--jinja --chat-template chatml`
* fix double bos issue (drop bos/eos tokens from jinja template)
* add missing try catch around jinja parsing to default to chatml
* Simplify default chatml logic
1 parent b345178 commit cde3833

3 files changed: +124 -39 lines changed
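
The behavioural core of the change is in the second and third bullets above: BOS/EOS handling moves out of the Jinja template, and a template that fails to parse as Jinja now makes the loader log the error and fall back to a plain ChatML template instead of throwing out of common_chat_templates_from_model. A minimal sketch of that fallback pattern follows; toy_template, CHATML_FALLBACK, and load_template are stand-ins invented for this illustration, not the real minja/llama.cpp API, so the snippet compiles and runs on its own with any C++14-or-later compiler.

    // Sketch only: toy_template stands in for minja::chat_template.
    #include <iostream>
    #include <memory>
    #include <stdexcept>
    #include <string>

    struct toy_template {
        explicit toy_template(std::string src) : source(std::move(src)) {
            // pretend-parser: reject anything without a Jinja block marker
            if (source.find("{%") == std::string::npos) {
                throw std::runtime_error("does not look like a Jinja template");
            }
        }
        std::string source;
    };

    // Stand-in for CHATML_TEMPLATE_SRC (abridged).
    static const char * CHATML_FALLBACK =
        "{%- for message in messages -%}"
        "{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}"
        "{%- endfor -%}";

    std::unique_ptr<toy_template> load_template(const std::string & requested) {
        try {
            return std::make_unique<toy_template>(requested);        // normal path
        } catch (const std::exception & e) {
            std::cerr << "failed to parse chat template: " << e.what()
                      << ", defaulting to chatml\n";
            return std::make_unique<toy_template>(CHATML_FALLBACK);  // fallback path
        }
    }

    int main() {
        // Same bogus template string the server test suite feeds in.
        auto tmpl = load_template("This is not a chat template, it is");
        std::cout << tmpl->source << "\n";  // prints the ChatML fallback source
    }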

common/common.cpp

Lines changed: 38 additions & 20 deletions
@@ -1869,11 +1869,19 @@ std::string common_chat_format_example(const common_chat_template & tmpl, bool u
     return common_chat_apply_template(tmpl, msgs, true, use_jinja);
 }
 
+#define CHATML_TEMPLATE_SRC \
+    "{%- for message in messages -%}\n" \
+    "  {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
+    "{%- endfor -%}\n" \
+    "{%- if add_generation_prompt -%}\n" \
+    "  {{- '<|im_start|>assistant\n' -}}\n" \
+    "{%- endif -%}"
+
 common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
 {
-    auto vocab = llama_model_get_vocab(model);
-    std::string default_template_src = chat_template_override;
-    std::string template_tool_use_src = chat_template_override;
+    std::string default_template_src;
+    std::string template_tool_use_src;
+
     bool has_explicit_template = !chat_template_override.empty();
     if (chat_template_override.empty()) {
         auto str = llama_model_chat_template(model, /* name */ nullptr);
@@ -1886,21 +1894,21 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model
             template_tool_use_src = str;
             has_explicit_template = true;
         }
+    } else {
+        default_template_src = chat_template_override;
     }
     if (default_template_src.empty() || default_template_src == "chatml") {
         if (!template_tool_use_src.empty()) {
             default_template_src = template_tool_use_src;
         } else {
-            default_template_src = R"(
-                {%- for message in messages -%}
-                    {{- "<|im_start|>" + message.role + "\n" + message.content + "<|im_end|>\n" -}}
-                {%- endfor -%}
-                {%- if add_generation_prompt -%}
-                    {{- "<|im_start|>assistant\n" -}}
-                {%- endif -%}
-            )";
+            default_template_src = CHATML_TEMPLATE_SRC;
         }
     }
+    std::string token_bos;
+    std::string token_eos;
+    // TODO: update logic that adds BOS and EOS tokens to the tokenized prompt, in favour of the template.
+#if 0
+    auto vocab = llama_model_get_vocab(model);
     const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
         if (token == LLAMA_TOKEN_NULL) {
             if (default_template_src.find(jinja_variable_name) != std::string::npos
@@ -1912,15 +1920,25 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model
             return common_token_to_piece(vocab, token, true);
         }
     };
-    auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
-    auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
-    return {
-        has_explicit_template,
-        std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
-        template_tool_use_src.empty()
-            ? nullptr
-            : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos)
-    };
+    token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
+    token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+#endif
+    try {
+        return {
+            has_explicit_template,
+            std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
+            template_tool_use_src.empty()
+                ? nullptr
+                : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
+        };
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
+        return {
+            has_explicit_template,
+            std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
+            nullptr,
+        };
+    }
 }
 
 //
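
For reference, the new CHATML_TEMPLATE_SRC wraps each message in <|im_start|>role ... <|im_end|> markers and, when add_generation_prompt is set, appends an assistant header. Below is a hand-built illustration of the string it should produce for a single user turn; the prompt is assembled manually here for clarity rather than rendered through minja.

    #include <iostream>
    #include <string>

    int main() {
        // Expected rendering for messages = [{"role": "user", "content": "Hey"}]
        // with add_generation_prompt = true.
        std::string role    = "user";
        std::string content = "Hey";
        std::string prompt  = "<|im_start|>" + role + "\n" + content + "<|im_end|>\n"
                              "<|im_start|>assistant\n";
        std::cout << prompt;
        // Note: no BOS/EOS markers appear here. Per the commit message, BOS/EOS are
        // dropped from the Jinja side so they are only added once, at tokenization time.
    }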

examples/server/tests/unit/test_chat_completion.py

Lines changed: 5 additions & 2 deletions
@@ -13,9 +13,12 @@ def create_server():
 @pytest.mark.parametrize(
     "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
     [
+        (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", False, None),
+        (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", True, None),
         (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", False, None),
-        (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None),
-        (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
+        (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None),
+        (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, 'chatml'),
+        (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
         ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
         ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
     ]

examples/server/tests/unit/test_tool_call.py

Lines changed: 81 additions & 17 deletions
@@ -67,8 +67,8 @@ def create_server():
 
 
 def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
-    n_predict = 512
     global server
+    n_predict = 512
     # server = ServerPreset.stories15m_moe()
     server.jinja = True
     server.n_predict = n_predict
@@ -139,40 +139,62 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
 @pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [
     (TEST_TOOL, "success", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
     (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+
+    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
     (TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
     (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
+
     (TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
     (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
+    (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
+
     (TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
     (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
+    (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
+
     (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
     (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
+
     (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
     (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
+
     (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
     (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
-    (TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
-    (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
+    (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+
+    (TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)),
+    (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)),
+    (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", "chatml"),
+
     (TEST_TOOL, "success", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
     (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
+
     (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
     (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
     # TODO: fix these
     # (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
     # (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 ])
-def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None):
+def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
+    global server
     n_predict = 512
     server.n_slots = 1
     server.jinja = True
     server.n_ctx = 8192
     server.n_predict = n_predict
     server.model_hf_repo = hf_repo
     server.model_hf_file = None
-    if template_override:
+    if isinstance(template_override, tuple):
         (template_hf_repo, template_variant) = template_override
         server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
         assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
+    elif isinstance(template_override, str):
+        server.chat_template = template_override
     server.start(timeout_seconds=TIMEOUT_SERVER_START)
     res = server.make_request("POST", "/chat/completions", data={
         "max_tokens": n_predict,
@@ -252,29 +274,49 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
 @pytest.mark.slow
 @pytest.mark.parametrize("hf_repo,template_override", [
     ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
-    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
+    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+
     ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
+    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
+
     ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
-    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
+
+    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
+
+    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
+
    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+
     ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
+    ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
+
     ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
+
+    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
+    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
+
     # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
     # ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 ])
-def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | None] | None):
+def test_weather_tool_call(hf_repo: str, template_override: str | Tuple[str, str | None] | None):
     global server
     server.n_slots = 1
     server.jinja = True
     server.n_ctx = 8192
     server.n_predict = 512
     server.model_hf_repo = hf_repo
     server.model_hf_file = None
-    if template_override:
+    if isinstance(template_override, tuple):
         (template_hf_repo, template_variant) = template_override
         server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
         assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
+    elif isinstance(template_override, str):
+        server.chat_template = template_override
     server.start(timeout_seconds=TIMEOUT_SERVER_START)
     res = server.make_request("POST", "/chat/completions", data={
         "max_tokens": 256,
@@ -298,30 +340,52 @@ def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | Non
 
 @pytest.mark.slow
 @pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
-    (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
     (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
+    (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
+
     (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
-    ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
-    (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
+
+    (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+
+    ('{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
+
     ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
+
     (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
-    (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
+    (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
+
+    (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
+
+    (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
+    (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
+
     (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+
+    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
+    (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
+
     # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 ])
-def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None):
+def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
     global server
     server.n_slots = 1
     server.jinja = True
     server.n_ctx = 8192
     server.n_predict = 128
     server.model_hf_repo = hf_repo
     server.model_hf_file = None
-    if template_override:
+    if isinstance(template_override, tuple):
         (template_hf_repo, template_variant) = template_override
         server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
         assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
+    elif isinstance(template_override, str):
+        server.chat_template = template_override
     server.start(timeout_seconds=TIMEOUT_SERVER_START)
     res = server.make_request("POST", "/chat/completions", data={
         "max_tokens": 256,
