Skip to content

Commit 256c4c2

Browse files
authored
fix: correct stream response when enable_thinking is set to false (#5881)
1 parent 9f21e75 commit 256c4c2

File tree

4 files changed

+211
-17
lines changed

4 files changed

+211
-17
lines changed

python/sglang/srt/openai_api/adapter.py

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -899,6 +899,24 @@ async def generate_stream_resp():
899899
return response
900900

901901

902+
def _get_enable_thinking_from_request(request_obj):
903+
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.
904+
905+
Args:
906+
request_obj: The request object (or an item from a list of requests).
907+
908+
Returns:
909+
The boolean value of 'enable_thinking' if found and not True, otherwise True.
910+
"""
911+
if (
912+
hasattr(request_obj, "chat_template_kwargs")
913+
and request_obj.chat_template_kwargs
914+
and request_obj.chat_template_kwargs.get("enable_thinking") is not None
915+
):
916+
return request_obj.chat_template_kwargs.get("enable_thinking")
917+
return True
918+
919+
902920
def v1_chat_generate_request(
903921
all_requests: List[ChatCompletionRequest],
904922
tokenizer_manager,
@@ -1263,31 +1281,16 @@ def v1_chat_generate_response(
12631281
tool_calls = None
12641282
text = ret_item["text"]
12651283

1266-
enable_thinking = True
12671284
if isinstance(request, list):
12681285
tool_choice = request[idx].tool_choice
12691286
tools = request[idx].tools
12701287
separate_reasoning = request[idx].separate_reasoning
1271-
1272-
if (
1273-
request[idx].chat_template_kwargs
1274-
and request[idx].chat_template_kwargs.get("enable_thinking") is not None
1275-
):
1276-
enable_thinking = request[idx].chat_template_kwargs.get(
1277-
"enable_thinking", True
1278-
)
1288+
enable_thinking = _get_enable_thinking_from_request(request[idx])
12791289
else:
12801290
tool_choice = request.tool_choice
12811291
tools = request.tools
12821292
separate_reasoning = request.separate_reasoning
1283-
1284-
if (
1285-
request.chat_template_kwargs
1286-
and request.chat_template_kwargs.get("enable_thinking") is not None
1287-
):
1288-
enable_thinking = request.chat_template_kwargs.get(
1289-
"enable_thinking", True
1290-
)
1293+
enable_thinking = _get_enable_thinking_from_request(request)
12911294

12921295
reasoning_text = None
12931296
if reasoning_parser and separate_reasoning and enable_thinking:
@@ -1526,9 +1529,12 @@ async def generate_stream_resp():
15261529
delta = text[len(stream_buffer) :]
15271530
new_stream_buffer = stream_buffer + delta
15281531

1532+
enable_thinking = _get_enable_thinking_from_request(request)
1533+
15291534
if (
15301535
tokenizer_manager.server_args.reasoning_parser
15311536
and request.separate_reasoning
1537+
and enable_thinking
15321538
):
15331539
if index not in reasoning_parser_dict:
15341540
reasoning_parser_dict[index] = ReasoningParser(

python/sglang/test/test_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
7070
"hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
7171
)
72+
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
7273

7374
# Nightly tests
7475
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"

test/srt/run_suite.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ class TestFile:
5959
TestFile("test_pytorch_sampling_backend.py", 66),
6060
TestFile("test_radix_attention.py", 167),
6161
TestFile("test_reasoning_content.py", 89),
62+
TestFile("test_enable_thinking.py", 70),
6263
TestFile("test_regex_constrained.py", 64),
6364
TestFile("test_release_memory_occupation.py", 44),
6465
TestFile("test_request_length_validation.py", 31),

test/srt/test_enable_thinking.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
"""
2+
Usage:
3+
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_with_reasoning
4+
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_without_reasoning
5+
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_with_reasoning
6+
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_without_reasoning
7+
"""
8+
9+
import asyncio
10+
import json
11+
import os
12+
import sys
13+
import time
14+
import unittest
15+
16+
import requests
17+
18+
from sglang.srt.utils import kill_process_tree
19+
from sglang.test.test_utils import (
20+
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST,
21+
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
22+
DEFAULT_URL_FOR_TEST,
23+
CustomTestCase,
24+
popen_launch_server,
25+
)
26+
27+
28+
class TestEnableThinking(CustomTestCase):
    """End-to-end tests for the `enable_thinking` chat_template_kwargs flag.

    Verifies that `reasoning_content` is produced when enable_thinking is True
    and suppressed when it is False, for both streaming and non-streaming
    chat completions against a live server.
    """

    @classmethod
    def setUpClass(cls):
        # Launch the server with the qwen3 reasoning parser so that
        # reasoning_content can be separated out at all.
        cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-1234"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=[
                "--reasoning-parser",
                "qwen3",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def _chat_payload(self, enable_thinking, stream):
        """Build the /v1/chat/completions request body for the given mode."""
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": "Hello"}],
            "temperature": 0,
            "separate_reasoning": True,
            "chat_template_kwargs": {"enable_thinking": enable_thinking},
        }
        # Only include "stream" for streaming requests, matching the shape
        # of the individual requests each test previously built by hand.
        if stream:
            payload["stream"] = True
        return payload

    def _post_chat(self, enable_thinking, stream=False):
        """POST a chat completion, assert HTTP 200, and return the response."""
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json=self._chat_payload(enable_thinking, stream),
            stream=stream,
        )
        self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")
        return response

    def _scan_stream(self, response):
        """Scan an SSE stream; return (has_reasoning, has_content) flags."""
        has_reasoning = False
        has_content = False
        for line in response.iter_lines():
            if not line:
                continue
            line = line.decode("utf-8")
            if line.startswith("data:") and not line.startswith("data: [DONE]"):
                data = json.loads(line[6:])
                if "choices" in data and len(data["choices"]) > 0:
                    delta = data["choices"][0].get("delta", {})
                    if "reasoning_content" in delta and delta["reasoning_content"]:
                        has_reasoning = True
                    if "content" in delta and delta["content"]:
                        has_content = True
        return has_reasoning, has_content

    def test_chat_completion_with_reasoning(self):
        # Non-streaming with "enable_thinking": True, reasoning_content should not be empty
        data = self._post_chat(True).json()

        self.assertIn("choices", data)
        self.assertTrue(len(data["choices"]) > 0)
        self.assertIn("message", data["choices"][0])
        self.assertIn("reasoning_content", data["choices"][0]["message"])
        self.assertIsNotNone(data["choices"][0]["message"]["reasoning_content"])

    def test_chat_completion_without_reasoning(self):
        # Non-streaming with "enable_thinking": False, reasoning_content should be empty
        data = self._post_chat(False).json()

        self.assertIn("choices", data)
        self.assertTrue(len(data["choices"]) > 0)
        self.assertIn("message", data["choices"][0])

        if "reasoning_content" in data["choices"][0]["message"]:
            self.assertIsNone(data["choices"][0]["message"]["reasoning_content"])

    def test_stream_chat_completion_with_reasoning(self):
        # Streaming with "enable_thinking": True, reasoning_content should not be empty
        response = self._post_chat(True, stream=True)

        print("\n=== Stream With Reasoning ===")
        has_reasoning, has_content = self._scan_stream(response)

        self.assertTrue(
            has_reasoning,
            "The reasoning content is not included in the stream response",
        )
        self.assertTrue(
            has_content, "The stream response does not contain normal content"
        )

    def test_stream_chat_completion_without_reasoning(self):
        # Streaming with "enable_thinking": False, reasoning_content should be empty
        response = self._post_chat(False, stream=True)

        print("\n=== Stream Without Reasoning ===")
        has_reasoning, has_content = self._scan_stream(response)

        self.assertFalse(
            has_reasoning,
            "The reasoning content should not be included in the stream response",
        )
        self.assertTrue(
            has_content, "The stream response does not contain normal content"
        )
183+
184+
185+
if __name__ == "__main__":
186+
unittest.main()

Comments (0)