Skip to content

Commit f5cd27b

Browse files
ochafikochafik
and
ochafik
authored
server: streaming of tool calls and thoughts when --jinja is on (#12379)
* add common_json w/ support for truncated json healing * add common_chat_msg_diff * partial common_chat_parse * refactor parser w/ optionals * server: wire chat diffs in stream mode * fix trigger of thinking models (must happen after thoughts are closed) * fix functionary v3.2 raw python! * rename: common_chat_syntax (now contains format) * rm common_regex.at_start * don't return empty <think></think> * accommodate yet another deepseek r1 distill fantasy syntax (`<|tool▁calls|>`) * fix QwQ 32B tool call parsing after thoughts (hermes2) * better logs for grammar triggers * consume spaces after parse_json_tool_calls * fix required tool calls w/ thinking models that have pre-opened thinking tags * fix thinking model's initial trigger + test qwq's template * run most test_tool_call tests in stream + non-stream modes * make functionary v3.2 parsing more strict (differentiate first match from others) * send final diff from server, to close off raw python arguments * support partial content streaming in Generic mode * tool-call: allow content prelude before hermes2 tool calls (for Qwen2.5) * Update function-calling.md * Update tool_bench.py * chat-parser: remove input from exception (llm output may contain PII) --------- Co-authored-by: ochafik <[email protected]> Co-authored-by: Olivier Chafik <[email protected]>
1 parent a2d02d5 commit f5cd27b

23 files changed

+3231
-1077
lines changed

common/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,16 @@ add_library(${TARGET} STATIC
6060
base64.hpp
6161
chat.cpp
6262
chat.h
63+
chat-parser.cpp
64+
chat-parser.h
6365
common.cpp
6466
common.h
6567
console.cpp
6668
console.h
6769
json-schema-to-grammar.cpp
6870
json.hpp
71+
json-partial.h
72+
json-partial.cpp
6973
llguidance.cpp
7074
log.cpp
7175
log.h

common/chat-parser.cpp

Lines changed: 376 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,376 @@
1+
#include "chat-parser.h"
2+
#include "common.h"
3+
#include "log.h"
4+
#include "regex-partial.h"
5+
6+
#include <optional>
7+
#include <stdexcept>
8+
#include <string>
9+
#include <vector>
10+
11+
using json = nlohmann::ordered_json;
12+
13+
// Builds a parser over `input`.
//
// The result message role is set to "assistant", and a healing marker is
// chosen: the decimal rendering of a std::rand() value that does not occur
// anywhere in `input`. Candidates are drawn until such a value is found.
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
    : input_(input), is_partial_(is_partial), syntax_(syntax)
{
    result_.role = "assistant";

    std::string candidate;
    do {
        candidate = std::to_string(std::rand());
    } while (input.find(candidate) != std::string::npos);
    healing_marker_ = candidate;
}
26+
27+
std::string common_chat_msg_parser::str(const common_string_range & rng) const {
28+
GGML_ASSERT(rng.begin <= rng.end);
29+
return input_.substr(rng.begin, rng.end - rng.begin);
30+
}
31+
32+
// Appends `content` to the content of the message being built.
void common_chat_msg_parser::add_content(const std::string &content) {
    result_.content.append(content);
}
35+
36+
// Appends `reasoning_content` to the reasoning content of the message being built.
void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
    result_.reasoning_content.append(reasoning_content);
}
39+
40+
bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
41+
if (name.empty()) {
42+
return false;
43+
}
44+
45+
common_chat_tool_call tool_call;
46+
tool_call.name = name;
47+
tool_call.arguments = arguments;
48+
tool_call.id = id;
49+
50+
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
51+
result_.tool_calls.emplace_back(tool_call);
52+
return true;
53+
}
54+
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
55+
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
56+
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
57+
std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
58+
return add_tool_call(name, id, arguments);
59+
}
60+
61+
bool common_chat_msg_parser::add_tool_calls(const json & arr) {
62+
for (const auto & item : arr) {
63+
if (!add_tool_call(item)) {
64+
return false;
65+
}
66+
}
67+
return true;
68+
}
69+
void common_chat_msg_parser::finish() {
70+
if (!is_partial_ && pos_ != input_.size()) {
71+
throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
72+
}
73+
}
74+
75+
bool common_chat_msg_parser::consume_spaces() {
76+
const auto length = input_.size();
77+
auto consumed = false;
78+
while (pos_ < length && std::isspace(input_[pos_])) {
79+
++pos_;
80+
consumed = true;
81+
}
82+
return consumed;
83+
}
84+
85+
bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
86+
auto pos = pos_;
87+
for (auto i = 0u; i < literal.size(); ++i) {
88+
if (pos >= input_.size()) {
89+
return false;
90+
}
91+
if (input_[pos] != literal[i]) {
92+
return false;
93+
}
94+
++pos;
95+
}
96+
pos_ = pos;
97+
return true;
98+
}
99+
100+
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
101+
auto idx = input_.find(literal, pos_);
102+
if (idx != std::string::npos) {
103+
find_regex_result res;
104+
res.prelude = input_.substr(pos_, idx - pos_);
105+
auto end = idx + literal.size();
106+
res.groups.emplace_back(common_string_range{idx, end});
107+
move_to(end);
108+
return res;
109+
}
110+
if (is_partial_) {
111+
idx = string_find_partial_stop(input_, literal);
112+
if (idx != std::string::npos && idx >= pos_) {
113+
find_regex_result res;
114+
res.prelude = input_.substr(pos_, idx - pos_);
115+
auto end = input_.size();
116+
res.groups.emplace_back(common_string_range{idx, end});
117+
move_to(end);
118+
return res;
119+
}
120+
}
121+
return std::nullopt;
122+
}
123+
124+
void common_chat_msg_parser::consume_literal(const std::string & literal) {
125+
if (!try_consume_literal(literal)) {
126+
throw common_chat_msg_partial_exception(literal);
127+
}
128+
}
129+
130+
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
131+
auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
132+
auto stripped_reasoning = string_strip(reasoning);
133+
if (stripped_reasoning.empty()) {
134+
return;
135+
}
136+
if (syntax_.reasoning_in_content) {
137+
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
138+
add_content(stripped_reasoning);
139+
if (closed) {
140+
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
141+
}
142+
} else {
143+
add_reasoning_content(stripped_reasoning);
144+
}
145+
};
146+
if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
147+
if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
148+
if (auto res = try_find_literal(end_think)) {
149+
handle_reasoning(res->prelude, /* closed */ true);
150+
consume_spaces();
151+
return true;
152+
}
153+
auto rest = consume_rest();
154+
if (!rest.empty()) {
155+
handle_reasoning(rest, /* closed */ !is_partial());
156+
}
157+
if (!syntax_.thinking_forced_open) {
158+
throw common_chat_msg_partial_exception(end_think);
159+
}
160+
return true;
161+
}
162+
}
163+
return false;
164+
}
165+
166+
// Returns everything from the current position to the end of the input and
// moves the cursor to the end.
std::string common_chat_msg_parser::consume_rest() {
    std::string remainder(input_, pos_);
    pos_ = input_.size();
    return remainder;
}
171+
172+
// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
173+
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from) {
174+
auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
175+
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
176+
return std::nullopt;
177+
}
178+
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
179+
if (is_partial()) {
180+
throw common_chat_msg_partial_exception(regex.str());
181+
}
182+
return std::nullopt;
183+
}
184+
auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
185+
pos_ = m.groups[0].end;
186+
187+
return find_regex_result{prelude, m.groups};
188+
}
189+
190+
// Consumes `regex` at the current position and returns the match, or throws
// common_chat_msg_partial_exception (built from the regex string) when
// try_consume_regex yields nothing.
common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
    auto matched = try_consume_regex(regex);
    if (!matched) {
        throw common_chat_msg_partial_exception(regex.str());
    }
    return *matched;
}
196+
197+
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
198+
auto m = regex.search(input_, pos_);
199+
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
200+
return std::nullopt;
201+
}
202+
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
203+
if (is_partial()) {
204+
throw common_chat_msg_partial_exception(regex.str());
205+
}
206+
return std::nullopt;
207+
}
208+
if (m.groups[0].begin != pos_) {
209+
// Didn't match at the current position.
210+
return std::nullopt;
211+
}
212+
pos_ = m.groups[0].end;
213+
214+
return find_regex_result {
215+
/* .prelude = */ "",
216+
m.groups,
217+
};
218+
}
219+
220+
std::optional<common_json> common_chat_msg_parser::try_consume_json() {
221+
auto it = input_.cbegin() + pos_;
222+
const auto end = input_.cend();
223+
common_json result;
224+
if (!common_json_parse(it, end, healing_marker_, result)) {
225+
return std::nullopt;
226+
}
227+
pos_ = std::distance(input_.cbegin(), it);
228+
if (result.healing_marker.marker.empty()) {
229+
// No healing marker, just return the parsed json
230+
return result;
231+
}
232+
if (!is_partial()) {
233+
throw common_chat_msg_partial_exception("JSON");
234+
}
235+
return result;
236+
}
237+
238+
// Parses a JSON value at the current position, throwing
// common_chat_msg_partial_exception("JSON") when try_consume_json yields nothing.
common_json common_chat_msg_parser::consume_json() {
    auto parsed = try_consume_json();
    if (!parsed) {
        throw common_chat_msg_partial_exception("JSON");
    }
    return *parsed;
}
244+
245+
// Throwing counterpart of try_consume_json_with_dumped_args: forwards both
// path lists and throws common_chat_msg_partial_exception("JSON") when that
// call yields nothing.
common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
    const std::vector<std::vector<std::string>> & args_paths,
    const std::vector<std::vector<std::string>> & content_paths
) {
    auto parsed = try_consume_json_with_dumped_args(args_paths, content_paths);
    if (!parsed) {
        throw common_chat_msg_partial_exception("JSON");
    }
    return *parsed;
}
254+
255+
std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
256+
const std::vector<std::vector<std::string>> & args_paths,
257+
const std::vector<std::vector<std::string>> & content_paths
258+
) {
259+
auto partial = try_consume_json();
260+
if (!partial) {
261+
return std::nullopt;
262+
}
263+
auto is_arguments_path = [&](const std::vector<std::string> & path) {
264+
return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
265+
};
266+
auto is_content_path = [&](const std::vector<std::string> & path) {
267+
return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
268+
};
269+
270+
if (partial->healing_marker.marker.empty()) {
271+
if (args_paths.empty()) {
272+
// No arguments to dump, and JSON was parsed fully.
273+
return consume_json_result {
274+
partial->json,
275+
/* .is_partial = */ false,
276+
};
277+
}
278+
if (is_arguments_path({})) {
279+
// Entire JSON is the arguments and was parsed fully.
280+
return consume_json_result {
281+
partial->json.dump(),
282+
/* .is_partial = */ false,
283+
};
284+
}
285+
}
286+
287+
LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
288+
289+
auto found_healing_marker = false;
290+
std::vector<std::string> path;
291+
std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
292+
if (is_arguments_path(path)) {
293+
auto arguments = j.dump();
294+
if (is_partial() && !partial->healing_marker.marker.empty()) {
295+
auto idx = arguments.find(partial->healing_marker.json_dump_marker);
296+
if (idx != std::string::npos) {
297+
arguments.resize(idx);
298+
found_healing_marker = true;
299+
}
300+
if (arguments == "\"") {
301+
// This happens because of completing `:"$magic` after `"arguments"`
302+
arguments = "";
303+
}
304+
}
305+
return arguments;
306+
}
307+
if (is_content_path(path)) {
308+
if (!j.is_string()) {
309+
throw std::runtime_error("Content path must be a string");
310+
}
311+
std::string str = j;
312+
auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
313+
if (idx != std::string::npos) {
314+
str.resize(idx);
315+
found_healing_marker = true;
316+
}
317+
return str;
318+
}
319+
if (j.is_object()) {
320+
auto obj = json::object();
321+
for (const auto & p : j.items()) {
322+
const auto & key = p.key();
323+
const auto & value = p.value();
324+
const std::string key_str = key; // NOLINT
325+
auto idx = key_str.find(healing_marker_);
326+
if (idx != std::string::npos) {
327+
found_healing_marker = true;
328+
break;
329+
}
330+
path.push_back(key_str);
331+
if (value.is_string()) {
332+
const std::string value_str = value;
333+
if (value_str.find(healing_marker_) != std::string::npos) {
334+
found_healing_marker = true;
335+
if (is_content_path(path)) {
336+
if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
337+
// The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
338+
obj[key] = remove_unsupported_healings_and_dump_args(value);
339+
}
340+
}
341+
break;
342+
}
343+
obj[key] = value;
344+
} else {
345+
obj[key] = remove_unsupported_healings_and_dump_args(value);
346+
}
347+
path.pop_back();
348+
}
349+
return obj;
350+
}
351+
if (j.is_array()) {
352+
auto arr = json::array();
353+
for (const auto & value : j) {
354+
if (value.is_string()) {
355+
std::string str = value;
356+
auto idx = str.find(healing_marker_);
357+
if (idx != std::string::npos) {
358+
// Don't heal array values that aren't in the arguments.
359+
found_healing_marker = true;
360+
break;
361+
}
362+
}
363+
arr.push_back(remove_unsupported_healings_and_dump_args(value));
364+
}
365+
return arr;
366+
}
367+
return j;
368+
};
369+
370+
auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
371+
LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
372+
return consume_json_result {
373+
cleaned,
374+
/* .is_partial = */ found_healing_marker,
375+
};
376+
}

0 commit comments

Comments
 (0)