
Commit cf798ec

zhyncs authored and xwu-intel committed
Revert "feat: add thinking_budget (sgl-project#6089)" (sgl-project#6181)
1 parent 0baca84 commit cf798ec

File tree: 9 files changed (+5, -196 lines)

docs/backend/sampling_params.md

Lines changed: 0 additions & 27 deletions
````diff
@@ -64,7 +64,6 @@ Please refer to our dedicated guide on [constrained decoding](./structured_outpu
 | ignore_eos | `bool = False` | Don't stop generation when EOS token is sampled. |
 | skip_special_tokens | `bool = True` | Remove special tokens during decoding. |
 | custom_params | `Optional[List[Optional[Dict[str, Any]]]] = None` | Used when employing `CustomLogitProcessor`. For usage, see below. |
-| thinking_budget | `Optional[int] = None` | The maximum number of reasoning tokens that can be generated for a request. |
 
 ## Examples
 
@@ -297,29 +296,3 @@ response = requests.post(
 )
 print(response.json())
 ```
-
-### Thinking Budget
-
-Launch a server with `--reasoning-parser`.
-
-```bash
-python3 -m sglang.launch_server --model Qwen/Qwen3-8B --reasoning-parser qwen3
-```
-
-Send a request:
-
-```python
-import requests
-response = requests.post(
-    "http://localhost:30000/generate",
-    json={
-        "text": "9.11 and 9.8, which is greater?",
-        "sampling_params": {
-            "temperature": 0.3,
-            "max_new_tokens": 256,
-            "thinking_budget": 20,
-        },
-    },
-)
-print(response.json())
-```
````

python/sglang/srt/model_executor/model_runner.py

Lines changed: 1 addition & 5 deletions
```diff
@@ -1145,9 +1145,7 @@ def sample(
             [self.sample(values, forward_batch) for values in logits_output],
             axis=-1,
         )
-        sampling_info = forward_batch.sampling_info
-        if sampling_info.thinking_budgets is not None:
-            sampling_info.apply_thinking_budgets(logits_output.next_token_logits)
+
         self._preprocess_logits(logits_output, forward_batch.sampling_info)
 
         # Sample the next tokens
@@ -1158,8 +1156,6 @@ def sample(
             forward_batch.top_logprobs_nums,
             forward_batch.token_ids_logprobs,
         )
-        if sampling_info.thinking_budgets is not None:
-            sampling_info.update_thinking_budgets(next_token_ids)
         return next_token_ids
 
     @property
```
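
The two reverted hooks bracketed the sampling step: `apply_thinking_budgets` ran on the raw logits before preprocessing, and `update_thinking_budgets` ran on the sampled token ids afterwards. A minimal sketch of that control flow, reconstructed from the hunks above (the sampler's argument list is abbreviated and partly assumed):

```python
# Sketch of the reverted flow inside ModelRunner.sample (not the full method).
def sample(self, logits_output, forward_batch):
    sampling_info = forward_batch.sampling_info

    # Before logit preprocessing: once a request has exceeded its budget,
    # mask its logits so the only sampleable token is `</think>`.
    if sampling_info.thinking_budgets is not None:
        sampling_info.apply_thinking_budgets(logits_output.next_token_logits)

    self._preprocess_logits(logits_output, sampling_info)
    next_token_ids = self.sampler(logits_output, sampling_info, ...)  # abbreviated

    # After sampling: disable the budget for requests that just emitted `</think>`.
    if sampling_info.thinking_budgets is not None:
        sampling_info.update_thinking_budgets(next_token_ids)
    return next_token_ids
```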

python/sglang/srt/openai_api/adapter.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -529,7 +529,6 @@ def v1_generate_request(
         "temperature": request.temperature,
         "max_new_tokens": request.max_tokens,
         "min_new_tokens": request.min_tokens,
-        "thinking_budget": request.thinking_budget,
         "stop": request.stop,
         "stop_token_ids": request.stop_token_ids,
         "top_p": request.top_p,
@@ -1102,7 +1101,6 @@ def v1_chat_generate_request(
         "temperature": request.temperature,
         "max_new_tokens": request.max_tokens or request.max_completion_tokens,
         "min_new_tokens": request.min_tokens,
-        "thinking_budget": request.thinking_budget,
         "stop": stop,
         "stop_token_ids": request.stop_token_ids,
         "top_p": request.top_p,
```

python/sglang/srt/openai_api/protocol.py

Lines changed: 0 additions & 8 deletions
```diff
@@ -172,7 +172,6 @@ class CompletionRequest(BaseModel):
     top_k: int = -1
     min_p: float = 0.0
     min_tokens: int = 0
-    thinking_budget: Optional[int] = None
     json_schema: Optional[str] = None
     regex: Optional[str] = None
     ebnf: Optional[str] = None
@@ -351,13 +350,6 @@ class ChatCompletionRequest(BaseModel):
         description="The maximum number of completion tokens for a chat completion request, "
         "including visible output tokens and reasoning tokens. Input tokens are not included. ",
     )
-    thinking_budget: Optional[int] = Field(
-        default=None,
-        description="The maximum number of reasoning tokens that can be generated for a request. "
-        "This setting of does not affect the thinking process of models. "
-        "If the number of tokens generated by the model's thinking process exceeds thinking_budget, "
-        "the reasoning content will be truncated and the final response content will be generated immediately.",
-    )
     n: int = 1
     presence_penalty: float = 0.0
     response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
```
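
Before the revert, `thinking_budget` was a first-class field on both request models above, so OpenAI-compatible clients could cap reasoning per request. A sketch of such a request (the endpoint path and port are sglang's usual defaults, assumed here; the model name is illustrative):

```python
import requests

# Pre-revert request shape: `thinking_budget` rode alongside the standard
# OpenAI fields, per the removed Field definition above.
response = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "Qwen/Qwen3-8B",
        "messages": [{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
        "max_tokens": 256,
        "thinking_budget": 20,  # reasoning truncated once 20 thinking tokens are spent
    },
)
print(response.json())
```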

python/sglang/srt/reasoning_parser.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -32,7 +32,7 @@ def detect_and_parse(self, text: str) -> StreamingParseResult:
         One-time parsing: Detects and parses reasoning sections in the provided text.
         Returns both reasoning content and normal text separately.
         """
-        text = text.replace(self.think_start_token, "")
+        text = text.replace(self.think_start_token, "").strip()
         if self.think_end_token not in text:
             # Assume reasoning was truncated before `</think>` token
             return StreamingParseResult(reasoning_text=text)
@@ -73,7 +73,7 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
             normal_text = current_text[end_idx + len(self.think_end_token) :]
 
             return StreamingParseResult(
-                normal_text=normal_text, reasoning_text=reasoning_text
+                normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
            )
 
         # Continue with reasoning content
```
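
The restored `.strip()`/`.rstrip()` calls trim the whitespace surrounding a `<think>…</think>` block. A standalone toy reproduction of `detect_and_parse` showing the effect (the marker strings assume a Qwen3-style template; the real method returns a `StreamingParseResult`, not a tuple):

```python
THINK_START, THINK_END = "<think>", "</think>"

def detect_and_parse(text: str) -> tuple[str, str]:
    # Mirrors the restored logic: drop the start marker and surrounding whitespace.
    text = text.replace(THINK_START, "").strip()
    if THINK_END not in text:
        # Reasoning was truncated before the end marker.
        return text, ""
    reasoning, _, normal = text.partition(THINK_END)
    return reasoning.rstrip(), normal.strip()

reasoning, answer = detect_and_parse("<think>\nCompare 9.11 and 9.8.\n</think>\n\n9.8 is greater.")
print(repr(reasoning))  # 'Compare 9.11 and 9.8.'
print(repr(answer))     # '9.8 is greater.'
```

Without the stripping, the newlines around the markers would leak into `reasoning_text` and the final answer.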

python/sglang/srt/sampling/sampling_batch_info.py

Lines changed: 2 additions & 54 deletions
```diff
@@ -30,13 +30,8 @@ class SamplingBatchInfo:
     # Whether any request needs min_p sampling
     need_min_p_sampling: bool
 
-    # Use thinking_budget to truncate thinking
-    num_thinking_tokens: Optional[torch.Tensor] = None
-    think_end_ids: Optional[torch.Tensor] = None
-    thinking_budgets: Optional[torch.Tensor] = None
-
     # Masking tensors for grammar-guided structured outputs
-    vocab_size: int = 0
+    vocab_size: int
     grammars: Optional[List] = None
     vocab_mask: Optional[torch.Tensor] = None
     apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
@@ -81,22 +76,7 @@ def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float
         ).to(device, non_blocking=True)
-        if any(hasattr(r.tokenizer, "think_end_id") for r in reqs):
-            think_end_ids = torch.tensor(
-                [getattr(r.tokenizer, "think_end_id", -1) for r in reqs],
-                dtype=torch.int64,
-            ).to(device, non_blocking=True)
-            num_thinking_tokens = torch.tensor([0 for _ in reqs], dtype=torch.int64).to(
-                device, non_blocking=True
-            )
-            thinking_budgets = torch.tensor(
-                [r.sampling_params.thinking_budget or -1 for r in reqs],
-                dtype=torch.int64,
-            ).to(device, non_blocking=True)
-        else:
-            think_end_ids = None
-            num_thinking_tokens = None
-            thinking_budgets = None
+
         # Check if any request has custom logit processor
         has_custom_logit_processor = (
             batch.enable_custom_logit_processor  # check the flag first.
@@ -152,9 +132,6 @@ def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
             top_ps=top_ps,
             top_ks=top_ks,
             min_ps=min_ps,
-            think_end_ids=think_end_ids,
-            num_thinking_tokens=num_thinking_tokens,
-            thinking_budgets=thinking_budgets,
             is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             vocab_size=vocab_size,
@@ -169,35 +146,6 @@ def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
     def __len__(self):
         return len(self.temperatures)
 
-    def apply_thinking_budgets(self, next_token_logits: torch.Tensor):
-        has_budget = self.thinking_budgets > 0
-        if not has_budget.any():
-            return
-        torch.where(
-            has_budget,
-            self.num_thinking_tokens + 1,
-            self.num_thinking_tokens,
-            out=self.num_thinking_tokens,
-        )
-        should_stop = has_budget & (
-            self.num_thinking_tokens - 1 > self.thinking_budgets
-        )
-        next_token_logits.masked_fill_(should_stop.unsqueeze(0), float("-inf"))
-        batch_indices = torch.nonzero(should_stop, as_tuple=True)[0]
-        if len(batch_indices) > 0:
-            end_token_indices = self.think_end_ids[batch_indices]
-            next_token_logits[batch_indices, end_token_indices] = 0.0
-
-    def update_thinking_budgets(self, next_token_ids: torch.Tensor):
-        if not torch.any(self.thinking_budgets > 0):
-            return
-        torch.where(
-            next_token_ids == self.think_end_ids,
-            torch.tensor(-1, device=self.thinking_budgets.device),
-            self.thinking_budgets,
-            out=self.thinking_budgets,
-        )
-
     def update_regex_vocab_mask(self):
         if not self.grammars:
             self.vocab_mask = None
```
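
The core of the reverted mechanism is the logit mask in `apply_thinking_budgets`: for over-budget rows, every logit becomes `-inf` except the request's `</think>` id, which becomes `0.0`, so sampling must close the reasoning block. A self-contained toy version (token ids and vocab size are made up; this sketch masks per row with `unsqueeze(1)`, assuming `[batch, vocab]` logits):

```python
import torch

# Two requests: request 0 has a budget of 3 and has already produced 5
# thinking tokens; request 1 runs with no budget (-1 disables it).
think_end_ids = torch.tensor([5, 5])        # per-request `</think>` id (toy value)
thinking_budgets = torch.tensor([3, -1])
num_thinking_tokens = torch.tensor([5, 2])

has_budget = thinking_budgets > 0
should_stop = has_budget & (num_thinking_tokens - 1 > thinking_budgets)

next_token_logits = torch.randn(2, 8)       # [batch, vocab], toy vocab of 8
# Over-budget rows: everything -> -inf, then `</think>` -> 0.0, the lone finite logit.
next_token_logits.masked_fill_(should_stop.unsqueeze(1), float("-inf"))
rows = torch.nonzero(should_stop, as_tuple=True)[0]
next_token_logits[rows, think_end_ids[rows]] = 0.0

print(next_token_logits[0].argmax().item())  # 5 -> request 0 is forced to end thinking
```

`update_thinking_budgets` then flipped a request's budget to -1 once `</think>` was actually sampled, so the mask was not re-applied on later steps.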

python/sglang/srt/sampling/sampling_params.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -30,7 +30,6 @@ class SamplingParams:
     def __init__(
         self,
         max_new_tokens: int = 128,
-        thinking_budget: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: float = 1.0,
@@ -58,7 +57,6 @@ def __init__(
             self.stop_token_ids = set(stop_token_ids)
         else:
             self.stop_token_ids = None
-        self.thinking_budget = thinking_budget
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k
```

test/srt/run_suite.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -61,7 +61,6 @@ class TestFile:
     TestFile("test_radix_attention.py", 167),
     TestFile("test_reasoning_content.py", 89),
     TestFile("test_enable_thinking.py", 70),
-    TestFile("test_thinking_budget.py", 60),
     TestFile("test_regex_constrained.py", 64),
     TestFile("test_release_memory_occupation.py", 44),
     TestFile("test_request_length_validation.py", 31),
```

test/srt/test_thinking_budget.py

Lines changed: 0 additions & 95 deletions
This file was deleted.
