Merged
30 commits
d4a79ce  adding option for 2.5 (Aug 31, 2025)
6b8f487  minor - arg in conversion script (Aug 31, 2025)
d0697ce  getting started on modelling.py (Sep 1, 2025)
26b8cbf  minor - shouldve been using modular (Sep 1, 2025)
f5299d2  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 1, 2025)
14d96d3  adressing comments + fixing datatype/device _get method (Sep 1, 2025)
3ac06f9  minor (Sep 1, 2025)
8bc5d73  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 4, 2025)
0656367  Merge branch 'huggingface:main' into integrate_colqwen2.5_using_colqw… (sahil-kabir, Sep 5, 2025)
73b029b  commiting suggestion (sahil-kabir, Sep 5, 2025)
3aa8aa8  docs + first test (Sep 5, 2025)
d4be146  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 5, 2025)
f591764  ruff fix (Sep 10, 2025)
9577aae  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 10, 2025)
e9ea6b6  minor fix (Sep 10, 2025)
6ae49f6  ruff fix (Sep 10, 2025)
9297f9e  model fix (Sep 10, 2025)
6a62d82  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 10, 2025)
2032bd5  Merge branch 'huggingface:main' into integrate_colqwen2.5_using_colqw… (sahil-kabir, Sep 12, 2025)
a0a6245  model fix (Sep 13, 2025)
db2df86  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 13, 2025)
272a7dc  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 27, 2025)
5ca07ce  fine-grained check, with a hardcoded score from the original Hf imple… (Sep 27, 2025)
961fb9f  minor ruff (Sep 27, 2025)
b6b454e  Merge remote-tracking branch 'upstream/main' into integrate_colqwen2.… (yonigozlan, Oct 3, 2025)
76238d3  update tests values with CI hardware (yonigozlan, Oct 3, 2025)
0582b59  adding 2.5 to conversion script (Oct 19, 2025)
30dc9d9  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Oct 19, 2025)
26fe35c  Apply style fixes (github-actions[bot], Nov 3, 2025)
673289b  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (yonigozlan, Nov 3, 2025)
5 changes: 3 additions & 2 deletions src/transformers/models/colqwen2/configuration_colqwen2.py
@@ -61,12 +61,13 @@ def __init__(
         vlm_config=None,
         embedding_dim: int = 128,
         initializer_range: float = 0.02,
+        use_qwen2_5=False,
         **kwargs,
     ):
         if vlm_config is None:
-            vlm_config = CONFIG_MAPPING["qwen2_vl"]()
+            vlm_config = CONFIG_MAPPING["qwen2_5_vl"]() if use_qwen2_5 else CONFIG_MAPPING["qwen2_vl"]()
             logger.info(
-                "`vlm_config` is `None`. Initializing `vlm_config` with the `Qwen2VLConfig` with default values."
+                f"`vlm_config` is `None`. Initializing `vlm_config` with the `Qwen2{'.5' if use_qwen2_5 else ''}VLConfig` with default values."
             )
         elif isinstance(vlm_config, dict):
             vlm_config = deepcopy(vlm_config)
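With this change, constructing a `ColQwen2Config` without an explicit `vlm_config` picks the Qwen2.5-VL default sub-config when the new `use_qwen2_5` flag is set, and keeps the Qwen2-VL defaults otherwise. A minimal sketch of the expected behavior (the default sub-config values come from `CONFIG_MAPPING` and are omitted here):

    from transformers import ColQwen2Config

    # vlm_config is None -> Qwen2-VL defaults, as before.
    config = ColQwen2Config()
    print(type(config.vlm_config).__name__)  # Qwen2VLConfig

    # New opt-in path -> Qwen2.5-VL defaults.
    config_25 = ColQwen2Config(use_qwen2_5=True)
    print(type(config_25.vlm_config).__name__)  # expected: Qwen2_5_VLConfig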
15 changes: 12 additions & 3 deletions src/transformers/models/colqwen2/convert_colqwen2_weights_to_hf.py
@@ -69,7 +69,7 @@ def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> d
             original_state_dict[key] = f.get_tensor(key)

     # Some weights are tied, so `lm_head` is not saved. Let's clone to load state dict.
-    if "lm_head.weight" not in original_state_dict:
+    if "lm_head.weight" not in original_state_dict and "model.embed_tokens.weight" in original_state_dict:
         original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone()

     return original_state_dict
@@ -99,11 +99,12 @@ def convert_colqwen2_weights_to_hf(
     push_to_hub: bool,
     revision: Optional[str] = None,
     original_vlm_name_or_path: Optional[str] = None,
+    using_qwen2_5=False,
 ):
     # Load the original model data
     original_config = AutoConfig.from_pretrained(
         model_id,
-        revision=revision,
+        revision=revision
     )
     if original_vlm_name_or_path is not None:
         original_config._name_or_path = original_vlm_name_or_path
@@ -119,10 +120,11 @@ def convert_colqwen2_weights_to_hf(
     config = ColQwen2Config(
         vlm_config=original_config,
         embedding_dim=128,  # hardcoded in the original model
+        use_qwen2_5=using_qwen2_5
     )
     config.model_type = "colqwen2"
     config.is_composition = False

     # Load the untrained model
     model = ColQwen2ForRetrieval(config=config).to("cpu").eval()
     print("Created model with new config and randomly initialized weights")
@@ -201,6 +203,12 @@ def convert_colqwen2_weights_to_hf(
         help="Name or path of the original VLM backbone model",
         default=None,
     )
+    parser.add_argument(
+        "--using_qwen2_5",
+        help="Whether the original VLM backbone is Qwen2.5",
+        action="store_true",
+        default=False,
+    )
     args = parser.parse_args()

     convert_colqwen2_weights_to_hf(
@@ -209,4 +217,5 @@
         push_to_hub=args.push_to_hub,
         revision=args.revision,
         original_vlm_name_or_path=args.original_vlm_name_or_path,
+        using_qwen2_5=args.using_qwen2_5
     )
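For reference, a hedged sketch of driving the updated converter from Python. `model_id` is taken from the function body above; `output_dir` and both repo ids are assumptions for illustration, not names confirmed by this diff:

    from transformers.models.colqwen2.convert_colqwen2_weights_to_hf import (
        convert_colqwen2_weights_to_hf,
    )

    convert_colqwen2_weights_to_hf(
        model_id="vidore/colqwen2.5-v0.2",  # assumed original checkpoint id
        output_dir="./colqwen2.5-hf",  # assumed parameter name
        push_to_hub=False,
        revision=None,
        original_vlm_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct",  # assumed backbone id
        using_qwen2_5=True,
    )

The new `--using_qwen2_5` switch maps onto the `using_qwen2_5` keyword shown in the call at the end of the script.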
8 changes: 7 additions & 1 deletion src/transformers/models/colqwen2/modeling_colqwen2.py
@@ -175,7 +175,8 @@ def forward(
             inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)

         if pixel_values is not None:
-            pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
+            vdtype, vdevice = self._dtype_device(self.vlm.visual)
+            pixel_values = pixel_values.to(dtype=vdtype, device=vdevice)
             image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
             image_mask = (
                 (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
@@ -250,5 +251,10 @@ def resize_token_embeddings(

         return model_embeds

+    def _dtype_device(self, module):
+        for p in module.parameters():
+            return p.dtype, p.device
+        return next(self.parameters()).dtype, next(self.parameters()).device
+

 __all__ = ["ColQwen2ForRetrieval", "ColQwen2PreTrainedModel"]
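The replaced line relied on the visual tower's backbone-specific `get_dtype()` helper and only cast the dtype; the new `_dtype_device` helper reads both dtype and device from the visual module's own parameters, falling back to the full model's parameters when the module has none, so the same code path works for either backbone and also moves `pixel_values` to the right device. A standalone sketch of the pattern in plain PyTorch (hypothetical names):

    import torch
    from torch import nn

    def dtype_device(module: nn.Module, fallback: nn.Module):
        # dtype/device of the module's first parameter, with a fallback
        # for parameter-less modules.
        for p in module.parameters():
            return p.dtype, p.device
        p = next(fallback.parameters())
        return p.dtype, p.device

    visual = nn.Linear(4, 4).to(torch.float16)
    pixel_values = torch.randn(2, 4)
    vdtype, vdevice = dtype_device(visual, visual)
    pixel_values = pixel_values.to(dtype=vdtype, device=vdevice)
    print(pixel_values.dtype)  # torch.float16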
8 changes: 7 additions & 1 deletion src/transformers/models/colqwen2/modular_colqwen2.py
@@ -368,7 +368,8 @@ def forward(
             inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)

         if pixel_values is not None:
-            pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
+            vdtype, vdevice = self._dtype_device(self.vlm.visual)
+            pixel_values = pixel_values.to(dtype=vdtype, device=vdevice)
             image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
             image_mask = (
                 (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
@@ -408,6 +409,11 @@ def forward(
             hidden_states=vlm_hidden_states,
             attentions=vlm_output.attentions,
         )

+    def _dtype_device(self, module):
+        for p in module.parameters():
+            return p.dtype, p.device
+        return next(self.parameters()).dtype, next(self.parameters()).device
+

 __all__ = [
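The same two edits appear in modular_colqwen2.py by design: in transformers, modular files are the source of truth and modeling_colqwen2.py is regenerated from them with the repo's modular converter utility (utils/modular_model_converter.py), which is why the earlier commit notes it "shouldve been using modular".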