@@ -79,7 +79,14 @@ def forward(
7979 if inputs_embeds is None :
8080 inputs_embeds = self .word_embeddings (input_ids )
8181
82- position_embeddings = self .position_embeddings (position_ids )
82+ # RoBERTa position ids start at padding_idx + 1 rather than 0.
83+ # Apply the offset out of place so the persistent positions buffer
84+ # is never mutated: an in-place += would keep accumulating in the
85+ # CUDA graph padding slots, which are not refreshed between requests,
86+ # until the ids exceed max_position_embeddings.
87+ position_embeddings = self .position_embeddings (
88+ position_ids + self .padding_idx + 1
89+ )
8390
8491 token_type_embeddings = self .token_type_embeddings (token_type_ids )
8592 embeddings = inputs_embeds + token_type_embeddings + position_embeddings
@@ -123,13 +130,6 @@ def forward(
123130 intermediate_tensors : IntermediateTensors | None = None ,
124131 inputs_embeds : torch .Tensor | None = None ,
125132 ) -> torch .Tensor :
126- # Fix Roberta positions here outside of the CUDA graph.
127- # Because we need the to extract the sequences from
128- # input_ids the control flow is data dependent.
129- replace_roberta_positions (
130- input_ids = input_ids , position_ids = positions , padding_idx = self .padding_idx
131- )
132-
133133 return self .model (
134134 input_ids = input_ids ,
135135 positions = positions ,
@@ -324,9 +324,6 @@ def forward(
324324 inputs_embeds : torch .Tensor | None = None ,
325325 token_type_ids : torch .Tensor | None = None ,
326326 ) -> torch .Tensor :
327- replace_roberta_positions (
328- input_ids = input_ids , position_ids = positions , padding_idx = self .padding_idx
329- )
330327 if token_type_ids is not None :
331328 assert self .roberta .config .vocab_size < (1 << TOKEN_TYPE_SHIFT )
332329 assert input_ids is not None
@@ -337,16 +334,3 @@ def forward(
337334 inputs_embeds = inputs_embeds ,
338335 intermediate_tensors = intermediate_tensors ,
339336 )
340-
341-
342- def replace_roberta_positions (
343- input_ids : torch .Tensor , position_ids : torch .Tensor , padding_idx : int
344- ) -> None :
345- # Replace position ids because in RoBERTa models
346- # they have to start at padding_idx + 1 and ignore
347- # existing padding tokens
348- # References:
349- # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133
350- # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669
351- # vllm does not use padding tokens, let's make things simpler
352- position_ids += padding_idx + 1
0 commit comments