12 changes: 12 additions & 0 deletions prepare_llm_models.sh
@@ -34,6 +34,7 @@ HERMES3_MODEL="NousResearch/Hermes-3-Llama-3.1-8B"
PHI4_MODEL="microsoft/Phi-4-mini-instruct"
MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
GPT_OSS="openai/gpt-oss-20b"
DEVSTRAL_MODEL="unsloth/Devstral-Small-2507"

if [ "$(python3 -c 'import sys; print(sys.version_info[1])')" -le "8" ]; then echo "Prepare models with python > 3.8."; exit 1 ; fi

@@ -173,3 +174,14 @@ if [ ! -f "$1/$GPT_OSS/$TOKENIZER_FILE" ]; then
echo "[ERROR] Models file $1/$GPT_OSS/$TOKENIZER_FILE does not exist."
exit 1
fi

if [ -f "$1/$DEVSTRAL_MODEL/$TOKENIZER_FILE" ]; then
echo "Models file $1/$DEVSTRAL_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
else
mkdir -p $1/$DEVSTRAL_MODEL
convert_tokenizer $DEVSTRAL_MODEL --with_detokenizer -o $1/$DEVSTRAL_MODEL
fi
if [ ! -f "$1/$DEVSTRAL_MODEL/$TOKENIZER_FILE" ]; then
echo "[ERROR] Models file $1/$DEVSTRAL_MODEL/$TOKENIZER_FILE does not exist."
exit 1
fi
6 changes: 5 additions & 1 deletion src/llm/BUILD
@@ -136,6 +136,7 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w
"io_processing/hermes3/tool_parser.hpp",
"io_processing/llama3/tool_parser.hpp",
"io_processing/phi4/tool_parser.hpp",
"io_processing/devstral/tool_parser.hpp",
"io_processing/mistral/tool_parser.hpp",
"io_processing/qwen3/reasoning_parser.hpp",
"io_processing/gptoss/reasoning_parser.hpp",
@@ -147,6 +148,7 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w
"io_processing/hermes3/tool_parser.cpp",
"io_processing/llama3/tool_parser.cpp",
"io_processing/phi4/tool_parser.cpp",
"io_processing/devstral/tool_parser.cpp",
"io_processing/mistral/tool_parser.cpp",
"io_processing/qwen3/reasoning_parser.cpp",
"io_processing/gptoss/reasoning_parser.cpp",
@@ -175,11 +177,13 @@ ovms_cc_library(
"io_processing/phi4/generation_config_builder.hpp",
"io_processing/llama3/generation_config_builder.hpp",
"io_processing/hermes3/generation_config_builder.hpp",
"io_processing/devstral/generation_config_builder.hpp",
"io_processing/generation_config_builder.hpp"],
srcs = ["io_processing/base_generation_config_builder.cpp",
"io_processing/phi4/generation_config_builder.cpp",
"io_processing/llama3/generation_config_builder.cpp",
"io_processing/hermes3/generation_config_builder.cpp"],
"io_processing/hermes3/generation_config_builder.cpp",
"io_processing/devstral/generation_config_builder.cpp"],
deps = [
":openai_request",
"//src:libovmslogging",
57 changes: 57 additions & 0 deletions src/llm/io_processing/devstral/generation_config_builder.cpp
@@ -0,0 +1,57 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#include <memory>
#include <string>
#include <utility>
#include <openvino/genai/generation_config.hpp>

#include "generation_config_builder.hpp"

namespace ovms {

void DevstralGenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatCompletionsRequest& request) {
// Call the base class method to fill in common configuration
BaseGenerationConfigBuilder::parseConfigFromRequest(request);

// For now the only specific part is related to tools, so if there are no tools provided in the request
// we can exit early
if (request.toolNameSchemaMap.empty()) {
return;
}

if (enableToolGuidedGeneration || request.toolChoice == "required") {
// Set tool guided generation config specific to Devstral model
auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
triggeredTags->triggers.push_back("[TOOL_CALLS]");

for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
const auto& toolSchema = toolSchemaWrapper.stringRepr;
ov::genai::StructuredOutputConfig::Tag tagItem;
tagItem.begin = "[TOOL_CALLS]" + toolName + "[ARGS]";
tagItem.end = "</s>";
tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
triggeredTags->tags.push_back(tagItem);
}
if (request.toolChoice == "required") {
triggeredTags->at_least_one = true;
}
ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
setStructuralTagsConfig(structuralTag);
}
}

} // namespace ovms
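
To make the triggered-tags setup above concrete, here is a minimal standalone sketch that builds the same structure for a single tool. The tool name get_weather and its JSON schema are invented for illustration; the sketch assumes only the ov::genai structured-output types and fields already used in this file (TriggeredTags, Tag, JSONSchema, triggers, tags, at_least_one):

#include <memory>
#include <string>
#include <openvino/genai/generation_config.hpp>

// Sketch: builds the same structure the Devstral builder produces for one tool.
// The tool name and schema are hypothetical.
std::shared_ptr<ov::genai::StructuredOutputConfig::TriggeredTags> buildDevstralTriggeredTags() {
    auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
    triggeredTags->triggers.push_back("[TOOL_CALLS]");  // emitting this token arms the tags below

    std::string toolSchema = R"({"type":"object","properties":{"city":{"type":"string"}}})";
    ov::genai::StructuredOutputConfig::Tag tagItem;
    tagItem.begin = "[TOOL_CALLS]get_weather[ARGS]";  // tool name is baked into the opening tag
    tagItem.end = "</s>";                             // Devstral terminates a call with EOS
    tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
    triggeredTags->tags.push_back(tagItem);

    triggeredTags->at_least_one = true;  // mirrors the toolChoice == "required" branch above
    return triggeredTags;
}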
33 changes: 33 additions & 0 deletions src/llm/io_processing/devstral/generation_config_builder.hpp
@@ -0,0 +1,33 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "../base_generation_config_builder.hpp"

namespace ovms {

/*
* DevstralGenerationConfigBuilder extends BaseGenerationConfigBuilder to provide specific configuration for the Devstral model.
* It overrides the parseConfigFromRequest method to set tool guided generation config.
*/
class DevstralGenerationConfigBuilder : public BaseGenerationConfigBuilder {
public:
DevstralGenerationConfigBuilder() = delete;
explicit DevstralGenerationConfigBuilder(const ov::genai::GenerationConfig& baseConfig, bool enableToolGuidedGeneration, DecodingMethod decodingMethod) :
BaseGenerationConfigBuilder(baseConfig, enableToolGuidedGeneration, decodingMethod) {}

void parseConfigFromRequest(const OpenAIChatCompletionsRequest& request) override;
};
} // namespace ovms
163 changes: 163 additions & 0 deletions src/llm/io_processing/devstral/tool_parser.cpp
@@ -0,0 +1,163 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#include <openvino/genai/tokenizer.hpp>
#include <string>
#include <vector>
#include <regex>

#include "src/port/rapidjson_document.hpp"

#include "../../../logging.hpp"
#include "tool_parser.hpp"
#include "../utils.hpp"
#include "src/stringutils.hpp"

namespace ovms {

void DevstralToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) {
std::vector<std::string> tools;
// Parser will consume entire model output only if the first generated token is the beginning of tools token.
Collaborator: Does not look like this comment is true for this parser.

// expected format: [TOOL_CALLS]tool_name[ARGS]{"arg1": "value1", ...}
if (parsedOutput.content.empty() || generatedTokens.size() <= 0) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "No content to parse for tool calls");
return;
}
size_t firstToolTokenIndex;
auto it = std::find(generatedTokens.begin(), generatedTokens.end(), this->botTokenId);
if (it != generatedTokens.end()) {
firstToolTokenIndex = std::distance(generatedTokens.begin(), it);
} else {
return;
}

size_t firstArgsTokenIndex;
auto it_args = std::find(generatedTokens.begin() + firstToolTokenIndex, generatedTokens.end(), this->argsTokenId);
Collaborator: Please keep the naming convention.
Suggested change:
- auto it_args = std::find(generatedTokens.begin() + firstToolTokenIndex, generatedTokens.end(), this->argsTokenId);
+ auto itArgs = std::find(generatedTokens.begin() + firstToolTokenIndex, generatedTokens.end(), this->argsTokenId);

if (it_args != generatedTokens.end()) {
firstArgsTokenIndex = std::distance(generatedTokens.begin(), it_args);
} else {
return;
}
if (firstToolTokenIndex > firstArgsTokenIndex) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "First tool token index is greater than first args token index.");
return;
}
std::vector<int64_t> tool_name_tokens(generatedTokens.begin() + (firstToolTokenIndex + 1), generatedTokens.begin() + (firstArgsTokenIndex));
std::vector<int64_t> arguments_tokens(generatedTokens.begin() + (firstArgsTokenIndex + 1), generatedTokens.end());
Collaborator (on lines +58 to +59): Suggested change:
- std::vector<int64_t> tool_name_tokens(generatedTokens.begin() + (firstToolTokenIndex + 1), generatedTokens.begin() + (firstArgsTokenIndex));
- std::vector<int64_t> arguments_tokens(generatedTokens.begin() + (firstArgsTokenIndex + 1), generatedTokens.end());
+ std::vector<int64_t> toolNameTokens(generatedTokens.begin() + (firstToolTokenIndex + 1), generatedTokens.begin() + (firstArgsTokenIndex));
+ std::vector<int64_t> argumentsTokens(generatedTokens.begin() + (firstArgsTokenIndex + 1), generatedTokens.end());

ToolCall toolCall;
std::string tool_name = tokenizer.decode(tool_name_tokens, ov::AnyMap{ov::genai::skip_special_tokens(true)});
Collaborator: Suggested change:
- std::string tool_name = tokenizer.decode(tool_name_tokens, ov::AnyMap{ov::genai::skip_special_tokens(true)});
+ std::string toolName = tokenizer.decode(tool_name_tokens, ov::AnyMap{ov::genai::skip_special_tokens(true)});

if (this->toolSchemas.find(tool_name) == this->toolSchemas.end()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool name '{}' not valid.", tool_name);
Collaborator: This is behavior we haven't implemented in other parsers; is it really worth returning early? If we return a function name that is not part of the tool schemas spec, we might be able to debug it in BFCL.
Collaborator: This is not in line with the current behavior of other parsers. I wouldn't do that check if it's only for this parser. Either drop it or create a task for alignment of the other parsers.

return;
}
std::string arguments = tokenizer.decode(arguments_tokens, ov::AnyMap{ov::genai::skip_special_tokens(true)});

toolCall.name = tool_name;
toolCall.arguments = arguments;
toolCall.id = generateRandomId(); // Generate a random ID for the tool call
parsedOutput.toolCalls.push_back(toolCall);

// get subset of generatedTokens starting from begin() to firstArgsTokenIndex
std::vector<int64_t> content_tokens;
Collaborator: Suggested change:
- std::vector<int64_t> content_tokens;
+ std::vector<int64_t> contentTokens;

if (firstToolTokenIndex > 0) {
content_tokens = std::vector<int64_t>(generatedTokens.begin(), generatedTokens.begin() + firstToolTokenIndex);
parsedOutput.content = tokenizer.decode(content_tokens, ov::AnyMap{ov::genai::skip_special_tokens(true)}); // Return only the content till tool call
} else {
parsedOutput.content = "";
}
return;
}

std::optional<rapidjson::Document> DevstralToolParser::sendFullDelta(ToolCall& toolCall) {
rapidjson::Document argsDelta;
argsDelta.Parse(toolCall.arguments.c_str());
rapidjson::Document argumentsWrapper;
argumentsWrapper.SetObject();
rapidjson::Document::AllocatorType& allocator = argumentsWrapper.GetAllocator();
// now we need to add string toolCall.arguments to argumentsWrapper under "arguments" key
rapidjson::Value toolCallsString(rapidjson::kStringType);
toolCallsString.SetString(toolCall.arguments.c_str(), allocator);
argumentsWrapper.AddMember("arguments", toolCallsString, allocator);
auto currentDelta = wrapDelta(argumentsWrapper, this->toolCallIndex);
return currentDelta;
}

std::optional<rapidjson::Document> DevstralToolParser::parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) {
/*
Devstral format: [TOOL_CALLS]tool_name[ARGS]arguments</s>
It does not support parallel tool calls, so tool calls are always in sequence.

We have three processing states:
AWAITING_START_TAG,
AWAITING_ARGS_TAG,
PROCESSING_ARGS

We store the history of chunks in the streamContent string. After a state change is detected, we clear streamContent to keep only the unprocessed part.
*/

this->streamContent += chunk;
if (this->internalState == AWAITING_START_TAG) {
size_t pos = chunk.find("[TOOL_CALLS]");
Collaborator: I think we should look this up in streamContent.
Author: The first chunk starts with [TOOL_CALLS], and it is never split since it is one token.

if (pos != std::string::npos) {
this->internalState = AWAITING_ARGS_TAG;
this->toolCallIndex++;
if (pos == 0) {
this->streamContent.clear();
} else {
this->streamContent = this->streamContent.substr(pos + 12); // "[TOOL_CALLS]" length is 12
Collaborator: We should avoid magic numbers; if we change this->streamingParsingToolCallsStartTag to another value, this part will become incorrect.
Collaborator: You can look up how Adrian handles that in the Qwen3 Coder parser:
this->lastProcessedPosition = pos + Qwen3CoderToolParser::PARAMETER_END_TAG.length();

}
} else {
return std::nullopt;
}
}
if (this->internalState == AWAITING_ARGS_TAG) {
// check if [ARGS] tag is present in the chunk and update state accordingly
Collaborator: Suggested change:
- // check if [ARGS] tag is present in the chunk and update state accordingly
+ // check if [ARGS] tag is present in the streamContent and update state accordingly
Author: Technically we check streamContent, but that will be the case only if [ARGS] is added in the chunk; otherwise we would be in a different state.

size_t pos = this->streamContent.find("[ARGS]");
if (pos != std::string::npos) {
this->internalState = PROCESSING_ARGS;
this->toolName = this->streamContent.substr(0, pos);
if (this->toolSchemas.find(this->toolName) == this->toolSchemas.end()) {
Collaborator: As for the unary part: this check is unique to this parser, and I don't think it's a good idea to have different behavior for different parsers. Either remove it or create a task for alignment of the other parsers.

SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool name '{}' not valid.", this->toolName);
return std::nullopt;
}
this->streamContent = this->streamContent.substr(pos + 6); // "[ARGS]" length is 6
Collaborator: Magic number.

return wrapFirstDelta(this->toolName, this->toolCallIndex);
} else {
return std::nullopt;
}
}
if (finishReason != ov::genai::GenerationFinishReason::NONE) {
size_t end_pos = this->streamContent.find("</s>");
Collaborator: What is this token? If it has some significant value for the parsing, it should be a member of the parser class, like the args and tool calls tokens. Also:
Suggested change:
- size_t end_pos = this->streamContent.find("</s>");
+ size_t endPos = this->streamContent.find("</s>");

std::string arguments;
if (end_pos != std::string::npos) {
arguments = this->streamContent.substr(0, end_pos);
} else {
arguments = this->streamContent;
}
if (!arguments.empty()) {
ToolCall toolCall;
toolCall.arguments = arguments;
toolCall.name = this->toolName;
return sendFullDelta(toolCall);
Collaborator: Shouldn't we stream partial function argument chunks? If I understand correctly, you send the full delta at the end of generation.
Collaborator: We already accepted such an approach for Qwen3 Coder, so I suppose we can have it in other parsers as well, unless there are specific requirements for "real" streaming.

} else {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "No valid arguments found in streamContent.");
return std::nullopt;
}
}
return std::nullopt;
}
} // namespace ovms
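
To visualize the three-state chunk walk documented in parseChunk(), the following self-contained sketch replays it on a hypothetical chunk sequence using plain std::string operations. It illustrates the control flow only and is not the actual parser; offsets are derived from the tag constants rather than magic numbers, as the review suggests:

#include <iostream>
#include <string>
#include <vector>

// Standalone replay of the parseChunk() state machine; illustration only, not OVMS code.
enum State { AWAITING_START_TAG, AWAITING_ARGS_TAG, PROCESSING_ARGS };

int main() {
    const std::string botTag = "[TOOL_CALLS]";
    const std::string argsTag = "[ARGS]";
    const std::vector<std::string> chunks = {"[TOOL_CALLS]", "get_", "weather", "[ARGS]", "{\"city\": ", "\"Paris\"}", "</s>"};

    State state = AWAITING_START_TAG;
    std::string streamContent, toolName;
    for (const auto& chunk : chunks) {
        streamContent += chunk;
        if (state == AWAITING_START_TAG) {
            size_t pos = streamContent.find(botTag);
            if (pos == std::string::npos) continue;
            streamContent.erase(0, pos + botTag.length());  // tag length instead of a magic number
            state = AWAITING_ARGS_TAG;
        }
        if (state == AWAITING_ARGS_TAG) {
            size_t pos = streamContent.find(argsTag);
            if (pos == std::string::npos) continue;
            toolName = streamContent.substr(0, pos);  // "get_weather"
            streamContent.erase(0, pos + argsTag.length());
            state = PROCESSING_ARGS;
        }
    }
    // At finish, the arguments are everything accumulated before the closing </s>.
    size_t endPos = streamContent.find("</s>");
    std::string arguments = (endPos == std::string::npos) ? streamContent : streamContent.substr(0, endPos);
    std::cout << toolName << " -> " << arguments << "\n";  // prints: get_weather -> {"city": "Paris"}
}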
80 changes: 80 additions & 0 deletions src/llm/io_processing/devstral/tool_parser.hpp
@@ -0,0 +1,80 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once

#include <openvino/genai/tokenizer.hpp>
#include <optional>
#include <string>
#include <vector>

#include "src/port/rapidjson_document.hpp"

#include "src/llm/io_processing/base_output_parser.hpp"
#include "src/llm/io_processing/partial_json_builder.hpp"
#include "src/llm/apis/tool_schema_wrapper.hpp"

namespace ovms {
class DevstralToolParser : public BaseOutputParser {
const int64_t argsTokenId; // [ARGS]
const int64_t botTokenId; // [TOOL_CALLS]

// in streaming mode we can rely on tags in string format as tokens are not available
const std::string streamingParsingArgsStartTag = "[ARGS]";
const std::string streamingParsingToolCallsStartTag = "[TOOL_CALLS]";
Collaborator (on lines +31 to +36): Those tags/tokens are not specific to streaming, so I think we can drop the streamingParsing prefix. These variables refer to the same things, so please unify the naming: either botToken or toolCallsStart (toolCallsStartTokenId, toolCallsStartTag, or botTokenId, botTag), and either argsTokenId or argsStartTag (argsStartTokenId, argsStartTag, or argsTokenId, argsTag).


enum InternalState {
AWAITING_START_TAG,
AWAITING_ARGS_TAG,
PROCESSING_ARGS
};

InternalState internalState = AWAITING_START_TAG;
const ToolsSchemas_t& toolSchemas;
// Index to track the current tool call being processed (-1 means no tool call has been started yet)
int toolCallIndex = -1;
std::string streamContent = ""; // content accumulated from stream chunks
std::string toolName = "";
std::optional<rapidjson::Document> sendFullDelta(ToolCall& toolCall);

public:
DevstralToolParser() = delete;
DevstralToolParser(ov::genai::Tokenizer& tokenizer, const ToolsSchemas_t& toolSchemas) :
BaseOutputParser(tokenizer),
argsTokenId(tokenizer.encode("[ARGS]", {{"add_special_tokens", false}}).input_ids.data<int64_t>()[0]),
Collaborator (dkalinowski, Dec 10, 2025): How do we ensure that [ARGS] / [TOOL_CALLS] are single tokens, treated as special tokens and not as strings, for example [AR and GS]?
Author: Those are special tokens.
Collaborator: That doesn't answer my question.
Author: The Devstral parser sets requiresStreamingWithSpecialTokens() to true.

botTokenId(tokenizer.encode("[TOOL_CALLS]", {{"add_special_tokens", false}}).input_ids.data<int64_t>()[0]),
Collaborator: Validate that the input_ids token count is == 1?
Collaborator: Agreed; we could also do that for argsTokenId and fail if either encoding yields more than one token.

toolSchemas(toolSchemas) {}

void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
std::optional<rapidjson::Document> parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) override;
const std::vector<std::string>& getParsingStartTags() const override {
static const std::vector<std::string> toolCallStartTags{streamingParsingToolCallsStartTag};
return toolCallStartTags;
}
const std::vector<std::string>& getSpecialParsingStartTags() const override {
static const std::vector<std::string> specialParsingStartTags{};
return specialParsingStartTags;
}
// Tool calls are expected to be the last part of the content, so we do not specify an end tag.
const std::string& getParsingEndTag() const override {
static const std::string toolCallEndTag = "";
return toolCallEndTag;
}

bool requiresStreamingWithSpecialTokens() const override {
return true;
}
};
} // namespace ovms
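
For the unary path, parse() slices the generated token stream around the two special-token IDs. A minimal sketch with invented token IDs (the real IDs come from the tokenizer) shows the expected layout and slicing:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Toy illustration of parse()'s slicing; token IDs are made up for the example.
int main() {
    const int64_t botTokenId = 9;    // stands in for [TOOL_CALLS]
    const int64_t argsTokenId = 17;  // stands in for [ARGS]
    // content tokens | [TOOL_CALLS] | tool-name tokens | [ARGS] | argument tokens
    std::vector<int64_t> generatedTokens = {5, 6, 9, 101, 102, 17, 201, 202, 203};

    auto itBot = std::find(generatedTokens.begin(), generatedTokens.end(), botTokenId);
    auto itArgs = std::find(itBot, generatedTokens.end(), argsTokenId);
    if (itBot == generatedTokens.end() || itArgs == generatedTokens.end()) return 0;

    std::vector<int64_t> contentTokens(generatedTokens.begin(), itBot);      // {5, 6}
    std::vector<int64_t> toolNameTokens(itBot + 1, itArgs);                  // {101, 102}
    std::vector<int64_t> argumentTokens(itArgs + 1, generatedTokens.end());  // {201, 202, 203}

    std::cout << contentTokens.size() << " content, " << toolNameTokens.size()
              << " name, " << argumentTokens.size() << " args tokens\n";
}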