[Bugfix] Fix RoBERTa position_ids accumulation on CUDA graph padding (vllm-project#37884)

he-yufeng · mtparet · commit ac2140805335 · 2026-04-09T20:48:22.000+02:00
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
@@ -79,7 +79,14 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
 
-        position_embeddings = self.position_embeddings(position_ids)
+        # RoBERTa positions start at padding_idx + 1 instead of 0.
+        # Use non-in-place add to avoid mutating the persistent positions
+        # buffer -- in-place += would accumulate on CUDA graph padding
+        # slots that aren't refreshed between requests, eventually
+        # overflowing max_position_embeddings.
+        position_embeddings = self.position_embeddings(
+            position_ids + self.padding_idx + 1
+        )
 
         token_type_embeddings = self.token_type_embeddings(token_type_ids)
         embeddings = inputs_embeds + token_type_embeddings + position_embeddings
@@ -123,13 +130,6 @@ def forward(
         intermediate_tensors: IntermediateTensors | None = None,
         inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        # Fix Roberta positions here outside of the CUDA graph.
-        # Because we need the to extract the sequences from
-        # input_ids the control flow is data dependent.
-        replace_roberta_positions(
-            input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx
-        )
-
         return self.model(
             input_ids=input_ids,
             positions=positions,
@@ -324,9 +324,6 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        replace_roberta_positions(
-            input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx
-        )
         if token_type_ids is not None:
             assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
             assert input_ids is not None
@@ -337,16 +334,3 @@ def forward(
             inputs_embeds=inputs_embeds,
             intermediate_tensors=intermediate_tensors,
         )
-
-
-def replace_roberta_positions(
-    input_ids: torch.Tensor, position_ids: torch.Tensor, padding_idx: int
-) -> None:
-    # Replace position ids because in RoBERTa models
-    # they have to start at padding_idx + 1 and ignore
-    # existing padding tokens
-    # References:
-    # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133
-    # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669
-    # vllm does not use padding tokens, let's make things simpler
-    position_ids += padding_idx + 1
diff --git a/vllm/model_executor/models/transformers/legacy.py b/vllm/model_executor/models/transformers/legacy.py
@@ -65,8 +65,10 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor | IntermediateTensors:
         if self.is_roberta:
-            # RoBERTa-specific positions padding
-            positions += self.padding_idx + 1
+            # RoBERTa positions start at padding_idx + 1.
+            # Non-in-place add to avoid mutating the persistent GPU buffer --
+            # in-place += would accumulate on CUDA graph padding slots.
+            positions = positions + self.padding_idx + 1
         return super().forward(
             input_ids=input_ids,
             positions=positions,