Skip to content

Commit ac21408

Browse files
he-yufeng authored and mtparet committed
[Bugfix] Fix RoBERTa position_ids accumulation on CUDA graph padding (vllm-project#37884)
1 parent 460f3cb commit ac21408

2 files changed

Lines changed: 12 additions & 26 deletions

File tree

vllm/model_executor/models/roberta.py

Lines changed: 8 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,14 @@ def forward(
7979
if inputs_embeds is None:
8080
inputs_embeds = self.word_embeddings(input_ids)
8181

82-
position_embeddings = self.position_embeddings(position_ids)
82+
# RoBERTa positions start at padding_idx + 1 instead of 0.
83+
# Use non-in-place add to avoid mutating the persistent positions
84+
# buffer -- in-place += would accumulate on CUDA graph padding
85+
# slots that aren't refreshed between requests, eventually
86+
# overflowing max_position_embeddings.
87+
position_embeddings = self.position_embeddings(
88+
position_ids + self.padding_idx + 1
89+
)
8390

8491
token_type_embeddings = self.token_type_embeddings(token_type_ids)
8592
embeddings = inputs_embeds + token_type_embeddings + position_embeddings
@@ -123,13 +130,6 @@ def forward(
123130
intermediate_tensors: IntermediateTensors | None = None,
124131
inputs_embeds: torch.Tensor | None = None,
125132
) -> torch.Tensor:
126-
# Fix Roberta positions here outside of the CUDA graph.
127-
# Because we need the to extract the sequences from
128-
# input_ids the control flow is data dependent.
129-
replace_roberta_positions(
130-
input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx
131-
)
132-
133133
return self.model(
134134
input_ids=input_ids,
135135
positions=positions,
@@ -324,9 +324,6 @@ def forward(
324324
inputs_embeds: torch.Tensor | None = None,
325325
token_type_ids: torch.Tensor | None = None,
326326
) -> torch.Tensor:
327-
replace_roberta_positions(
328-
input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx
329-
)
330327
if token_type_ids is not None:
331328
assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
332329
assert input_ids is not None
@@ -337,16 +334,3 @@ def forward(
337334
inputs_embeds=inputs_embeds,
338335
intermediate_tensors=intermediate_tensors,
339336
)
340-
341-
342-
def replace_roberta_positions(
343-
input_ids: torch.Tensor, position_ids: torch.Tensor, padding_idx: int
344-
) -> None:
345-
# Replace position ids because in RoBERTa models
346-
# they have to start at padding_idx + 1 and ignore
347-
# existing padding tokens
348-
# References:
349-
# - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133
350-
# - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669
351-
# vllm does not use padding tokens, let's make things simpler
352-
position_ids += padding_idx + 1

vllm/model_executor/models/transformers/legacy.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,8 +65,10 @@ def forward(
6565
inputs_embeds: torch.Tensor | None = None,
6666
) -> torch.Tensor | IntermediateTensors:
6767
if self.is_roberta:
68-
# RoBERTa-specific positions padding
69-
positions += self.padding_idx + 1
68+
# RoBERTa positions start at padding_idx + 1.
69+
# Non-in-place add to avoid mutating the persistent GPU buffer --
70+
# in-place += would accumulate on CUDA graph padding slots.
71+
positions = positions + self.padding_idx + 1
7072
return super().forward(
7173
input_ids=input_ids,
7274
positions=positions,

0 commit comments

Comments
 (0)