Skip to content

fix: correct stream response when enable_thinking is set to false #5881

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
May 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 23 additions & 17 deletions python/sglang/srt/openai_api/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,24 @@ async def generate_stream_resp():
return response


def _get_enable_thinking_from_request(request_obj):
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.

Args:
request_obj: The request object (or an item from a list of requests).

Returns:
The boolean value of 'enable_thinking' if found and not True, otherwise True.
"""
if (
hasattr(request_obj, "chat_template_kwargs")
and request_obj.chat_template_kwargs
and request_obj.chat_template_kwargs.get("enable_thinking") is not None
):
return request_obj.chat_template_kwargs.get("enable_thinking")
return True


def v1_chat_generate_request(
all_requests: List[ChatCompletionRequest],
tokenizer_manager,
Expand Down Expand Up @@ -1263,31 +1281,16 @@ def v1_chat_generate_response(
tool_calls = None
text = ret_item["text"]

enable_thinking = True
if isinstance(request, list):
tool_choice = request[idx].tool_choice
tools = request[idx].tools
separate_reasoning = request[idx].separate_reasoning

if (
request[idx].chat_template_kwargs
and request[idx].chat_template_kwargs.get("enable_thinking") is not None
):
enable_thinking = request[idx].chat_template_kwargs.get(
"enable_thinking", True
)
enable_thinking = _get_enable_thinking_from_request(request[idx])
else:
tool_choice = request.tool_choice
tools = request.tools
separate_reasoning = request.separate_reasoning

if (
request.chat_template_kwargs
and request.chat_template_kwargs.get("enable_thinking") is not None
):
enable_thinking = request.chat_template_kwargs.get(
"enable_thinking", True
)
enable_thinking = _get_enable_thinking_from_request(request)

reasoning_text = None
if reasoning_parser and separate_reasoning and enable_thinking:
Expand Down Expand Up @@ -1526,9 +1529,12 @@ async def generate_stream_resp():
delta = text[len(stream_buffer) :]
new_stream_buffer = stream_buffer + delta

enable_thinking = _get_enable_thinking_from_request(request)

if (
tokenizer_manager.server_args.reasoning_parser
and request.separate_reasoning
and enable_thinking
):
if index not in reasoning_parser_dict:
reasoning_parser_dict[index] = ReasoningParser(
Expand Down
1 change: 1 addition & 0 deletions python/sglang/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
"hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
)
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"

# Nightly tests
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
Expand Down
1 change: 1 addition & 0 deletions test/srt/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class TestFile:
TestFile("test_pytorch_sampling_backend.py", 66),
TestFile("test_radix_attention.py", 167),
TestFile("test_reasoning_content.py", 89),
TestFile("test_enable_thinking.py", 70),
TestFile("test_regex_constrained.py", 64),
TestFile("test_release_memory_occupation.py", 44),
TestFile("test_request_length_validation.py", 31),
Expand Down
186 changes: 186 additions & 0 deletions test/srt/test_enable_thinking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
"""
Usage:
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_with_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_without_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_with_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_without_reasoning
"""

import asyncio
import json
import os
import sys
import time
import unittest

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)


class TestEnableThinking(CustomTestCase):
    """End-to-end checks that `chat_template_kwargs: {"enable_thinking": ...}`
    controls whether `reasoning_content` appears in chat-completion responses,
    for both streaming and non-streaming requests, against a live server
    launched with `--reasoning-parser qwen3`."""

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-1234"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=[
                "--reasoning-parser",
                "qwen3",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def _post_chat(self, enable_thinking, stream):
        """POST a chat-completion request with the given thinking/stream flags.

        Returns the raw `requests.Response` (streamed when `stream` is True).
        """
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": "Hello"}],
            "temperature": 0,
            "separate_reasoning": True,
            "chat_template_kwargs": {"enable_thinking": enable_thinking},
        }
        if stream:
            payload["stream"] = True
        return requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json=payload,
            stream=stream,
        )

    def _scan_stream(self, response):
        """Consume an SSE stream and report which delta fields were seen.

        Returns:
            (has_reasoning, has_content): True when any chunk carried a
            non-empty `reasoning_content` / `content` delta, respectively.
        """
        has_reasoning = False
        has_content = False
        for line in response.iter_lines():
            if not line:
                continue
            line = line.decode("utf-8")
            # SSE frames look like "data: {...}"; skip the "[DONE]" sentinel.
            if not line.startswith("data:") or line.startswith("data: [DONE]"):
                continue
            # Strip the "data:" prefix robustly instead of assuming exactly
            # one space after the colon (the original `line[6:]` did).
            data = json.loads(line[len("data:") :].strip())
            if "choices" in data and len(data["choices"]) > 0:
                delta = data["choices"][0].get("delta", {})
                if delta.get("reasoning_content"):
                    has_reasoning = True
                if delta.get("content"):
                    has_content = True
        return has_reasoning, has_content

    def test_chat_completion_with_reasoning(self):
        # Non-streaming, "enable_thinking": True -> reasoning_content present.
        client = self._post_chat(enable_thinking=True, stream=False)

        self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
        data = client.json()

        self.assertIn("choices", data)
        self.assertTrue(len(data["choices"]) > 0)
        self.assertIn("message", data["choices"][0])
        self.assertIn("reasoning_content", data["choices"][0]["message"])
        self.assertIsNotNone(data["choices"][0]["message"]["reasoning_content"])

    def test_chat_completion_without_reasoning(self):
        # Non-streaming, "enable_thinking": False -> reasoning_content empty.
        client = self._post_chat(enable_thinking=False, stream=False)

        self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
        data = client.json()

        self.assertIn("choices", data)
        self.assertTrue(len(data["choices"]) > 0)
        self.assertIn("message", data["choices"][0])

        # The key may legitimately be absent; when present it must be None.
        if "reasoning_content" in data["choices"][0]["message"]:
            self.assertIsNone(data["choices"][0]["message"]["reasoning_content"])

    def test_stream_chat_completion_with_reasoning(self):
        # Streaming, "enable_thinking": True -> reasoning deltas must appear.
        response = self._post_chat(enable_thinking=True, stream=True)

        self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")

        print("\n=== Stream With Reasoning ===")
        has_reasoning, has_content = self._scan_stream(response)

        self.assertTrue(
            has_reasoning,
            "The reasoning content is not included in the stream response",
        )
        self.assertTrue(
            has_content, "The stream response does not contain normal content"
        )

    def test_stream_chat_completion_without_reasoning(self):
        # Streaming, "enable_thinking": False -> no reasoning deltas at all.
        response = self._post_chat(enable_thinking=False, stream=True)

        self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")

        print("\n=== Stream Without Reasoning ===")
        has_reasoning, has_content = self._scan_stream(response)

        self.assertFalse(
            has_reasoning,
            "The reasoning content should not be included in the stream response",
        )
        self.assertTrue(
            has_content, "The stream response does not contain normal content"
        )


# Allow running this file directly (see the usage examples in the module
# docstring); under the test suite runner this guard is never triggered.
if __name__ == "__main__":
    unittest.main()
Loading