
Commit 4a79bf9

Fix bugs in fine-tuning and batch inference for GLM-4.1V (#39090)
* update
* 1
1 parent 2100ee6 commit 4a79bf9

3 files changed · +13 −14 lines changed


src/transformers/models/glm4v/image_processing_glm4v_fast.py

Lines changed: 7 additions & 4 deletions
```diff
@@ -121,6 +121,7 @@ def _preprocess(
         do_convert_rgb: bool,
         input_data_format: Optional[Union[str, ChannelDimension]],
         device: Optional[Union[str, torch.device]],
+        disable_grouping: Optional[bool],
     ):
         """
         Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
@@ -173,7 +174,7 @@ def _preprocess(
         resized_height, resized_width = height, width

         # Group images by size for batched resizing
-        grouped_images, grouped_images_index = group_images_by_shape(images)
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
         resized_images_grouped = {}
         for shape, stacked_images in grouped_images.items():
             if do_resize:
@@ -191,7 +192,7 @@ def _preprocess(
         resized_images = reorder_images(resized_images_grouped, grouped_images_index)
         # Group images by size for further processing
         # Needed in case do_resize is False, or resize returns images with different sizes
-        grouped_images, grouped_images_index = group_images_by_shape(resized_images)
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
         processed_images_grouped = {}
         for shape, stacked_images in grouped_images.items():
             # Fused rescale and normalize
```
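For context, `group_images_by_shape` / `reorder_images` are the fast-image-processor helpers that stack same-sized images so resize and rescale run once per shape, and the new `disable_grouping` flag opts out of that stacking. A minimal sketch of the idea (a toy reimplementation for illustration, not the transformers helpers):

```python
# Toy sketch of group-by-shape batching: images with the same (C, H, W) are stacked so a
# transform runs once per shape; disable_grouping=True falls back to one group per image.
from collections import defaultdict
import torch


def group_images_by_shape_sketch(images, disable_grouping=False):
    """Return ({shape_key: stacked_tensor}, {original_index: (shape_key, position)})."""
    grouped, index = defaultdict(list), {}
    for i, img in enumerate(images):
        key = (i,) + tuple(img.shape) if disable_grouping else tuple(img.shape)
        index[i] = (key, len(grouped[key]))
        grouped[key].append(img)
    return {k: torch.stack(v) for k, v in grouped.items()}, index


def reorder_images_sketch(processed_groups, index):
    """Undo the grouping so outputs line up with the original image order."""
    return [processed_groups[key][pos] for key, pos in (index[i] for i in sorted(index))]


imgs = [torch.rand(3, 224, 224), torch.rand(3, 112, 112), torch.rand(3, 224, 224)]
grouped, idx = group_images_by_shape_sketch(imgs, disable_grouping=False)
restored = reorder_images_sketch({k: v * 2.0 for k, v in grouped.items()}, idx)  # fake "processing"
assert all(torch.equal(r, im * 2.0) for r, im in zip(restored, imgs))
```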
```diff
@@ -249,6 +250,7 @@ def preprocess(
         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         device: Optional["torch.device"] = None,
+        disable_grouping: Optional[bool] = False,
         **kwargs,
     ):
         r"""
@@ -323,6 +325,7 @@ def preprocess(
                 do_convert_rgb=do_convert_rgb,
                 input_data_format=input_data_format,
                 device=device,
+                disable_grouping=disable_grouping,
             )
             pixel_values.extend(patches)
             vision_grid_thws.append(image_grid_thw)
```
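A hedged usage sketch of the new argument, assuming the fast image processor forwards keyword arguments from `__call__` to `preprocess` as usual; the checkpoint id is a placeholder, not a specific repo:

```python
# Illustrative only: substitute a real GLM-4.1V checkpoint id for the placeholder below.
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("<glm-4.1v-checkpoint>", use_fast=True)
images = [
    Image.fromarray(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8))
    for h, w in [(480, 640), (320, 320)]
]

# disable_grouping is threaded from preprocess() down to _preprocess() by this commit;
# the default (False) keeps the batched same-shape fast path.
out = processor(images=images, disable_grouping=True, return_tensors="pt")
print({k: getattr(v, "shape", v) for k, v in out.items()})
```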
```diff
@@ -351,11 +354,11 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):

         factor = patch_size * merge_size
         resized_height, resized_width = smart_resize(
-            t=self.temporal_patch_size,
+            num_frames=self.temporal_patch_size,
             height=height,
             width=width,
+            temporal_factor=self.temporal_patch_size,
             factor=factor,
-            t_factor=self.temporal_patch_size,
         )
         grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
         return grid_h * grid_w
```
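For intuition about the values this method produces, here is a simplified sketch; it ignores the min/max pixel budget the real `smart_resize` enforces, and the `patch_size=14`, `merge_size=2` defaults are illustrative rather than read from the GLM-4.1V config:

```python
# Simplified sketch: snap the spatial dims to multiples of factor = patch_size * merge_size,
# then count vision patches on the resulting grid (before any 2x2 token merging).
def rough_patch_count(height: int, width: int, patch_size: int = 14, merge_size: int = 2) -> int:
    factor = patch_size * merge_size
    resized_height = max(factor, round(height / factor) * factor)
    resized_width = max(factor, round(width / factor) * factor)
    grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
    return grid_h * grid_w


print(rough_patch_count(480, 640))  # 34 * 46 = 1564 patches
```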

src/transformers/models/glm4v/modeling_glm4v.py

Lines changed: 3 additions & 5 deletions
```diff
@@ -287,6 +287,7 @@ def __init__(self, config: Glm4vVisionConfig) -> None:
         self.attention_dropout = config.attention_dropout
         self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias)
         self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+        self.is_causal = False

     def forward(
         self,
@@ -324,7 +325,7 @@ def forward(
             attention_mask,
             dropout=0.0 if not self.training else self.attention_dropout,
             scaling=self.scale,
-            is_causal=False,
+            is_causal=self.is_causal,
             **kwargs,
         )
         attn_output = attn_output.squeeze(0)
```
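The pattern being introduced: the module records `is_causal` as an attribute instead of hard-coding it at the call site, so attention backends and subclasses can inspect or override it. A minimal toy module (not the `Glm4vVisionAttention` implementation) showing the same idea with PyTorch SDPA:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class TinyVisionAttention(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.qkv = nn.Linear(hidden_size, hidden_size * 3, bias=True)
        self.proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.is_causal = False  # vision tokens attend bidirectionally

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, n, _ = x.shape
        # Project to q, k, v of shape (batch, heads, seq, head_dim) each.
        q, k, v = self.qkv(x).view(b, n, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        # The attribute, not a literal, decides causal masking here.
        out = F.scaled_dot_product_attention(q, k, v, is_causal=self.is_causal)
        return self.proj(out.transpose(1, 2).reshape(b, n, -1))


attn = TinyVisionAttention(hidden_size=64, num_heads=4)
print(attn(torch.randn(2, 16, 64)).shape)  # torch.Size([2, 16, 64])
```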
```diff
@@ -1016,7 +1017,7 @@ def get_rope_index(
                 dtype=input_ids.dtype,
                 device=input_ids.device,
             )
-
+            image_index, video_index = 0, 0
             attention_mask = attention_mask.to(total_input_ids.device)
             for i, input_ids in enumerate(total_input_ids):
                 input_ids = input_ids[attention_mask[i] == 1]
@@ -1046,7 +1047,6 @@

                 llm_pos_ids_list = []
                 video_frame_num = 1
-                image_index, video_index = 0, 0

                 for modality_type, start_idx, end_idx in input_type_group:
                     st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
```
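This hunk is the batch-inference fix: `image_grid_thw` is one flat tensor over the whole batch, so the running image/video counters must be initialized once before the per-sample loop rather than reset inside it (the old placement made every sample after the first re-read the grids of sample 0). A toy illustration with made-up grids:

```python
# Made-up per-sample image counts and a flat, batch-wide list of (t, h, w) grids.
batch_image_counts = [2, 1, 3]
image_grid_thw = [(1, 4, 6), (1, 8, 8), (1, 2, 2), (1, 6, 4), (1, 4, 4), (1, 10, 10)]

image_index = 0  # fixed version: initialized once, before the batch loop
for sample_id, n_images in enumerate(batch_image_counts):
    for _ in range(n_images):
        t, h, w = image_grid_thw[image_index]  # consumes the next grid, batch-wide
        image_index += 1
    print(f"sample {sample_id}: consumed grids up to index {image_index}")
```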
```diff
@@ -1088,9 +1088,7 @@
                         t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()

                         h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
-
                         w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
-
                         llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)

                         video_index += 1
```
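For reference, the context lines around this hunk build the 3D (t, h, w) rope position ids for each visual token. A small worked example for one frame and a 2 × 3 token grid, using illustrative values for `t_idx`, `llm_grid_h`, `llm_grid_w`, and `st_idx`:

```python
import torch

t_idx, llm_grid_h, llm_grid_w, st_idx = [0], 2, 3, 5
t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
print(torch.stack([t_index, h_index, w_index]) + st_idx)
# tensor([[5, 5, 5, 5, 5, 5],
#         [5, 5, 5, 6, 6, 6],
#         [5, 6, 7, 5, 6, 7]])
```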

src/transformers/models/glm4v/modular_glm4v.py

Lines changed: 3 additions & 5 deletions
The modular source mirrors the changes applied to `modeling_glm4v.py` above.

```diff
@@ -516,6 +516,7 @@ def __init__(self, config: Glm4vVisionConfig) -> None:
         self.attention_dropout = config.attention_dropout
         self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias)
         self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+        self.is_causal = False

     def forward(
         self,
@@ -553,7 +554,7 @@ def forward(
             attention_mask,
             dropout=0.0 if not self.training else self.attention_dropout,
             scaling=self.scale,
-            is_causal=False,
+            is_causal=self.is_causal,
             **kwargs,
         )
         attn_output = attn_output.squeeze(0)
@@ -1115,7 +1116,7 @@ def get_rope_index(
                 dtype=input_ids.dtype,
                 device=input_ids.device,
             )
-
+            image_index, video_index = 0, 0
             attention_mask = attention_mask.to(total_input_ids.device)
             for i, input_ids in enumerate(total_input_ids):
                 input_ids = input_ids[attention_mask[i] == 1]
@@ -1145,7 +1146,6 @@

                 llm_pos_ids_list = []
                 video_frame_num = 1
-                image_index, video_index = 0, 0

                 for modality_type, start_idx, end_idx in input_type_group:
                     st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
@@ -1187,9 +1187,7 @@
                         t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()

                         h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
-
                         w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
-
                         llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)

                         video_index += 1
```

0 commit comments
