Merged

Commits (30)
d4a79ce  adding option for 2.5 (Aug 31, 2025)
6b8f487  minor - arg in conversion script (Aug 31, 2025)
d0697ce  getting started on modelling.py (Sep 1, 2025)
26b8cbf  minor - shouldve been using modular (Sep 1, 2025)
f5299d2  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 1, 2025)
14d96d3  adressing comments + fixing datatype/device _get method (Sep 1, 2025)
3ac06f9  minor (Sep 1, 2025)
8bc5d73  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 4, 2025)
0656367  Merge branch 'huggingface:main' into integrate_colqwen2.5_using_colqw… (sahil-kabir, Sep 5, 2025)
73b029b  commiting suggestion (sahil-kabir, Sep 5, 2025)
3aa8aa8  docs + first test (Sep 5, 2025)
d4be146  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 5, 2025)
f591764  ruff fix (Sep 10, 2025)
9577aae  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 10, 2025)
e9ea6b6  minor fix (Sep 10, 2025)
6ae49f6  ruff fix (Sep 10, 2025)
9297f9e  model fix (Sep 10, 2025)
6a62d82  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 10, 2025)
2032bd5  Merge branch 'huggingface:main' into integrate_colqwen2.5_using_colqw… (sahil-kabir, Sep 12, 2025)
a0a6245  model fix (Sep 13, 2025)
db2df86  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 13, 2025)
272a7dc  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 27, 2025)
5ca07ce  fine-grained check, with a hardcoded score from the original Hf imple… (Sep 27, 2025)
961fb9f  minor ruff (Sep 27, 2025)
b6b454e  Merge remote-tracking branch 'upstream/main' into integrate_colqwen2.… (yonigozlan, Oct 3, 2025)
76238d3  update tests values with CI hardware (yonigozlan, Oct 3, 2025)
0582b59  adding 2.5 to conversion script (Oct 19, 2025)
30dc9d9  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Oct 19, 2025)
26fe35c  Apply style fixes (github-actions[bot], Nov 3, 2025)
673289b  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (yonigozlan, Nov 3, 2025)
src/transformers/models/colqwen2/configuration_colqwen2.py (6 additions & 2 deletions)
```diff
@@ -61,12 +61,15 @@ def __init__(
         vlm_config=None,
         embedding_dim: int = 128,
         initializer_range: float = 0.02,
+        use_qwen2_5=False,
         **kwargs,
     ):
         if vlm_config is None:
-            vlm_config = CONFIG_MAPPING["qwen2_vl"]()
+            model_name = "qwen2_5_vl" if use_qwen2_5 else "qwen2_vl"
+            vlm_config = CONFIG_MAPPING[model_name]()
+            config_name = "Qwen2_5VLConfig" if use_qwen2_5 else "Qwen2VLConfig"
             logger.info(
-                "`vlm_config` is `None`. Initializing `vlm_config` with the `Qwen2VLConfig` with default values."
+                "`vlm_config` is `None`. Initializing `vlm_config` with the `%s` with default values." % config_name
             )
         elif isinstance(vlm_config, dict):
             vlm_config = deepcopy(vlm_config)
@@ -82,6 +85,7 @@ def __init__(
                 f"Invalid type for `vlm_config`. Expected `PretrainedConfig`, `dict`, or `None`, but got {type(vlm_config)}."
             )
 
+        self.use_qwen2_5 = use_qwen2_5
         self.vlm_config = vlm_config
         self.embedding_dim = embedding_dim
         self.initializer_range = initializer_range
```
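For reference, a minimal sketch of what the new flag changes at construction time (illustrative only; it assumes `ColQwen2Config` is exposed through the public `transformers` API):

```python
from transformers import ColQwen2Config

# Default behaviour: the backbone config falls back to Qwen2-VL.
config = ColQwen2Config()
print(config.vlm_config.model_type)  # qwen2_vl

# With the new flag, the fallback backbone is Qwen2.5-VL instead.
config_25 = ColQwen2Config(use_qwen2_5=True)
print(config_25.vlm_config.model_type)  # qwen2_5_vl
```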
src/transformers/models/colqwen2/convert_colqwen2_weights_to_hf.py (11 additions & 2 deletions)
```diff
@@ -69,7 +69,7 @@ def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> d
                original_state_dict[key] = f.get_tensor(key)
 
     # Some weights are tied, so `lm.head`` is not saved. Let's clone to load state dict.
-    if "lm_head.weight" not in original_state_dict:
+    if "lm_head.weight" not in original_state_dict and "model.embed_tokens.weight" in original_state_dict:
        original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone()
 
     return original_state_dict
```
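The guard above exists because tied embeddings are serialized only once; here is a small standalone sketch of the pattern (toy tensors, not the actual checkpoint):

```python
import torch

# When input/output embeddings are tied, checkpoints typically store only
# "model.embed_tokens.weight"; "lm_head.weight" must be recreated on load.
state = {"model.embed_tokens.weight": torch.randn(10, 4)}

if "lm_head.weight" not in state and "model.embed_tokens.weight" in state:
    state["lm_head.weight"] = state["model.embed_tokens.weight"].clone()

print(sorted(state))  # ['lm_head.weight', 'model.embed_tokens.weight']
```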
```diff
@@ -99,10 +99,11 @@ def convert_colqwen2_weights_to_hf(
     push_to_hub: bool,
     revision: Optional[str] = None,
     original_vlm_name_or_path: Optional[str] = None,
+    use_qwen2_5=False,
 ):
     # Load the original model data
     original_config = AutoConfig.from_pretrained(
-        model_id,
+        model_id,
         revision=revision,
     )
     if original_vlm_name_or_path is not None:
@@ -119,6 +120,7 @@ def convert_colqwen2_weights_to_hf(
     config = ColQwen2Config(
         vlm_config=original_config,
         embedding_dim=128,  # hardcoded in the original model
+        use_qwen2_5=use_qwen2_5,
```
**Member:** Instantiate a qwen2_5 config instead
```diff
     )
     config.model_type = "colqwen2"
     config.is_composition = False
@@ -201,6 +203,12 @@ def convert_colqwen2_weights_to_hf(
         help="Name or path of the original VLM backbone model",
         default=None,
     )
+    parser.add_argument(
+        "--use_qwen2_5",
+        help="Whether the original VLM backbone is Qwen2.5",
+        action="store_true",
+        default=False,
+    )
     args = parser.parse_args()
 
     convert_colqwen2_weights_to_hf(
@@ -209,4 +217,5 @@
         push_to_hub=args.push_to_hub,
         revision=args.revision,
         original_vlm_name_or_path=args.original_vlm_name_or_path,
+        use_qwen2_5=args.use_qwen2_5,
     )
```
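A hedged sketch of driving the updated script for a Qwen2.5 backbone; the checkpoint id, output directory, and backbone name below are placeholders, not values taken from this PR:

```python
from transformers.models.colqwen2.convert_colqwen2_weights_to_hf import (
    convert_colqwen2_weights_to_hf,
)

# All ids/paths here are hypothetical, for illustration only.
convert_colqwen2_weights_to_hf(
    model_id="vidore/colqwen2.5-v0.2",
    output_dir="./colqwen2.5-hf",
    push_to_hub=False,
    revision=None,
    original_vlm_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct",
    use_qwen2_5=True,  # equivalent to passing --use_qwen2_5 on the CLI
)
```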
src/transformers/models/colqwen2/modeling_colqwen2.py (10 additions & 1 deletion)
```diff
@@ -175,7 +175,8 @@ def forward(
             inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)
 
             if pixel_values is not None:
-                pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
+                dtype, device = self._get_dtype_device()
+                pixel_values = pixel_values.to(dtype=dtype, device=device)
                 image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
                 image_mask = (
                     (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
@@ -250,5 +251,13 @@ def resize_token_embeddings(
 
         return model_embeds
 
+    def _get_dtype_device(self) -> tuple[str, str]:
+        if self.config.use_qwen2_5:
+            parameters = next(self.vlm.visual.parameters())
+        else:
+            parameters = next(self.parameters())
+        dtype, device = parameters.dtype, parameters.device
+        return dtype, device
+
 
 __all__ = ["ColQwen2ForRetrieval", "ColQwen2PreTrainedModel"]
```
src/transformers/models/colqwen2/modular_colqwen2.py (10 additions & 1 deletion)
```diff
@@ -368,7 +368,8 @@ def forward(
             inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)
 
             if pixel_values is not None:
-                pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
+                dtype, device = self._get_dtype_device()
+                pixel_values = pixel_values.to(dtype=dtype, device=device)
```
**Member:** Not a big fan of this; it'd be great if we can avoid having `use_qwen2_5` in the config. Let's just use the dtype and device of `inputs_embeds`.

Suggested change:

```diff
-                dtype, device = self._get_dtype_device()
-                pixel_values = pixel_values.to(dtype=dtype, device=device)
+                pixel_values = pixel_values.to(inputs_embeds.device, inputs_embeds.dtype)
```

**Member:** BTW, in the Qwen2 vision tower we cast pixels to the correct dtype manually, so this is not needed. Also, the LM and vision tower might be loaded with different dtypes and devices in specific cases :)

**Contributor:** @zucchini-nlp Did I understand correctly that this line could be removed entirely and it should work anyway?

@sahil-kabir Maybe worth a quick try. 😉

**Member:** Yep, correct.
```diff
                 image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
                 image_mask = (
                     (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
@@ -409,6 +410,14 @@ def forward(
             attentions=vlm_output.attentions,
         )
 
+    def _get_dtype_device(self) -> tuple[str, str]:
+        if self.config.use_qwen2_5:
+            parameters = next(self.vlm.visual.parameters())
+        else:
+            parameters = next(self.parameters())
+        dtype, device = parameters.dtype, parameters.device
+        return dtype, device
+
```
**Member:** No need for that then.

Suggested change:

```diff
-    def _get_dtype_device(self) -> tuple[str, str]:
-        if self.config.use_qwen2_5:
-            parameters = next(self.vlm.visual.parameters())
-        else:
-            parameters = next(self.parameters())
-        dtype, device = parameters.dtype, parameters.device
-        return dtype, device
-
```

```diff
 __all__ = [
     "ColQwen2ForRetrieval",
```
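To close the loop, a hedged end-to-end usage sketch of a converted ColQwen2.5 retrieval checkpoint; the model id is a placeholder, and the classes are the existing ColQwen2 ones that this PR reuses for the 2.5 backbone:

```python
import torch
from transformers import ColQwen2ForRetrieval, ColQwen2Processor

# Placeholder checkpoint id, for illustration only.
model_name = "vidore/colqwen2.5-v0.2-hf"
model = ColQwen2ForRetrieval.from_pretrained(model_name, torch_dtype=torch.bfloat16)
processor = ColQwen2Processor.from_pretrained(model_name)

# Embed a text query; images work the same way via processor(images=...).
batch = processor(text=["organizational structure of a research lab"]).to(model.device)
with torch.no_grad():
    embeddings = model(**batch).embeddings  # multi-vector output: (batch, seq_len, 128)
```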