Merged
30 commits
d4a79ce  adding option for 2.5 (Aug 31, 2025)
6b8f487  minor - arg in conversion script (Aug 31, 2025)
d0697ce  getting started on modelling.py (Sep 1, 2025)
26b8cbf  minor - shouldve been using modular (Sep 1, 2025)
f5299d2  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 1, 2025)
14d96d3  adressing comments + fixing datatype/device _get method (Sep 1, 2025)
3ac06f9  minor (Sep 1, 2025)
8bc5d73  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 4, 2025)
0656367  Merge branch 'huggingface:main' into integrate_colqwen2.5_using_colqw… (sahil-kabir, Sep 5, 2025)
73b029b  commiting suggestion (sahil-kabir, Sep 5, 2025)
3aa8aa8  docs + first test (Sep 5, 2025)
d4be146  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 5, 2025)
f591764  ruff fix (Sep 10, 2025)
9577aae  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 10, 2025)
e9ea6b6  minor fix (Sep 10, 2025)
6ae49f6  ruff fix (Sep 10, 2025)
9297f9e  model fix (Sep 10, 2025)
6a62d82  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 10, 2025)
2032bd5  Merge branch 'huggingface:main' into integrate_colqwen2.5_using_colqw… (sahil-kabir, Sep 12, 2025)
a0a6245  model fix (Sep 13, 2025)
db2df86  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 13, 2025)
272a7dc  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Sep 27, 2025)
5ca07ce  fine-grained check, with a hardcoded score from the original Hf imple… (Sep 27, 2025)
961fb9f  minor ruff (Sep 27, 2025)
b6b454e  Merge remote-tracking branch 'upstream/main' into integrate_colqwen2.… (yonigozlan, Oct 3, 2025)
76238d3  update tests values with CI hardware (yonigozlan, Oct 3, 2025)
0582b59  adding 2.5 to conversion script (Oct 19, 2025)
30dc9d9  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (sahil-kabir, Oct 19, 2025)
26fe35c  Apply style fixes (github-actions[bot], Nov 3, 2025)
673289b  Merge branch 'main' into integrate_colqwen2.5_using_colqwen2_modellin… (yonigozlan, Nov 3, 2025)
5 changes: 3 additions & 2 deletions src/transformers/models/colqwen2/configuration_colqwen2.py
@@ -61,12 +61,13 @@ def __init__(
         vlm_config=None,
         embedding_dim: int = 128,
         initializer_range: float = 0.02,
+        use_qwen2_5=False,
         **kwargs,
     ):
         if vlm_config is None:
-            vlm_config = CONFIG_MAPPING["qwen2_vl"]()
+            vlm_config = CONFIG_MAPPING["qwen2_5_vl"]() if use_qwen2_5 else CONFIG_MAPPING["qwen2_vl"]()
             logger.info(
-                "`vlm_config` is `None`. Initializing `vlm_config` with the `Qwen2VLConfig` with default values."
+                f"`vlm_config` is `None`. Initializing `vlm_config` with the `Qwen2{'.5' if use_qwen2_5 else ''}VLConfig` with default values."
             )
         elif isinstance(vlm_config, dict):
             vlm_config = deepcopy(vlm_config)
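With this change, constructing a `ColQwen2Config` without an explicit `vlm_config` picks the Qwen2.5-VL default sub-config when the new `use_qwen2_5` flag is set, and keeps the Qwen2-VL defaults otherwise. A minimal sketch of the expected behavior (the default sub-config values come from `CONFIG_MAPPING` and are omitted here):

    from transformers import ColQwen2Config

    # vlm_config is None -> Qwen2-VL defaults, as before.
    config = ColQwen2Config()
    print(type(config.vlm_config).__name__)  # Qwen2VLConfig

    # New opt-in path -> Qwen2.5-VL defaults.
    config_25 = ColQwen2Config(use_qwen2_5=True)
    print(type(config_25.vlm_config).__name__)  # expected: Qwen2_5_VLConfig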
15 changes: 12 additions & 3 deletions src/transformers/models/colqwen2/convert_colqwen2_weights_to_hf.py
@@ -69,7 +69,7 @@ def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> d
             original_state_dict[key] = f.get_tensor(key)

     # Some weights are tied, so `lm_head` is not saved. Let's clone to load state dict.
-    if "lm_head.weight" not in original_state_dict:
+    if "lm_head.weight" not in original_state_dict and "model.embed_tokens.weight" in original_state_dict:
         original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone()

     return original_state_dict
@@ -99,11 +99,12 @@ def convert_colqwen2_weights_to_hf(
     push_to_hub: bool,
     revision: Optional[str] = None,
     original_vlm_name_or_path: Optional[str] = None,
+    using_qwen2_5=False,
 ):
     # Load the original model data
     original_config = AutoConfig.from_pretrained(
         model_id,
-        revision=revision,
+        revision=revision
     )
     if original_vlm_name_or_path is not None:
         original_config._name_or_path = original_vlm_name_or_path
@@ -119,10 +120,11 @@ def convert_colqwen2_weights_to_hf(
     config = ColQwen2Config(
         vlm_config=original_config,
         embedding_dim=128,  # hardcoded in the original model
+        use_qwen2_5=using_qwen2_5
     )
     config.model_type = "colqwen2"
     config.is_composition = False

     # Load the untrained model
     model = ColQwen2ForRetrieval(config=config).to("cpu").eval()
     print("Created model with new config and randomly initialized weights")
@@ -201,6 +203,12 @@ def convert_colqwen2_weights_to_hf(
         help="Name or path of the original VLM backbone model",
         default=None,
     )
+    parser.add_argument(
+        "--using_qwen2_5",
+        help="Whether the original VLM backbone is Qwen2.5",
+        action="store_true",
+        default=False,
+    )
     args = parser.parse_args()

     convert_colqwen2_weights_to_hf(
@@ -209,4 +217,5 @@
         push_to_hub=args.push_to_hub,
         revision=args.revision,
         original_vlm_name_or_path=args.original_vlm_name_or_path,
+        using_qwen2_5=args.using_qwen2_5
     )
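For reference, a hedged sketch of driving the updated converter from Python. `model_id` is taken from the function body above; `output_dir` and both repo ids are assumptions for illustration, not names confirmed by this diff:

    from transformers.models.colqwen2.convert_colqwen2_weights_to_hf import (
        convert_colqwen2_weights_to_hf,
    )

    convert_colqwen2_weights_to_hf(
        model_id="vidore/colqwen2.5-v0.2",  # assumed original checkpoint id
        output_dir="./colqwen2.5-hf",  # assumed parameter name
        push_to_hub=False,
        revision=None,
        original_vlm_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct",  # assumed backbone id
        using_qwen2_5=True,
    )

The new `--using_qwen2_5` switch maps onto the `using_qwen2_5` keyword shown in the call at the end of the script.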
8 changes: 7 additions & 1 deletion src/transformers/models/colqwen2/modeling_colqwen2.py
@@ -175,7 +175,8 @@ def forward(
             inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)

         if pixel_values is not None:
-            pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
+            vdtype, vdevice = self._dtype_device(self.vlm.visual)
+            pixel_values = pixel_values.to(dtype=vdtype, device=vdevice)
             image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
             image_mask = (
                 (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
@@ -250,5 +251,10 @@ def resize_token_embeddings(

         return model_embeds

+    def _dtype_device(self, module):
+        for p in module.parameters():
+            return p.dtype, p.device
+        return next(self.parameters()).dtype, next(self.parameters()).device
+

 __all__ = ["ColQwen2ForRetrieval", "ColQwen2PreTrainedModel"]
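The replaced line relied on the visual tower's backbone-specific `get_dtype()` helper and only cast the dtype; the new `_dtype_device` helper reads both dtype and device from the visual module's own parameters, falling back to the full model's parameters when the module has none, so the same code path works for either backbone and also moves `pixel_values` to the right device. A standalone sketch of the pattern in plain PyTorch (hypothetical names):

    import torch
    from torch import nn

    def dtype_device(module: nn.Module, fallback: nn.Module):
        # dtype/device of the module's first parameter, with a fallback
        # for parameter-less modules.
        for p in module.parameters():
            return p.dtype, p.device
        p = next(fallback.parameters())
        return p.dtype, p.device

    visual = nn.Linear(4, 4).to(torch.float16)
    pixel_values = torch.randn(2, 4)
    vdtype, vdevice = dtype_device(visual, visual)
    pixel_values = pixel_values.to(dtype=vdtype, device=vdevice)
    print(pixel_values.dtype)  # torch.float16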
8 changes: 7 additions & 1 deletion src/transformers/models/colqwen2/modular_colqwen2.py
@@ -368,7 +368,8 @@ def forward(
             inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)

         if pixel_values is not None:
-            pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
+            vdtype, vdevice = self._dtype_device(self.vlm.visual)
+            pixel_values = pixel_values.to(dtype=vdtype, device=vdevice)
             image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
             image_mask = (
                 (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
@@ -408,6 +409,11 @@ def forward(
             hidden_states=vlm_hidden_states,
             attentions=vlm_output.attentions,
         )

+    def _dtype_device(self, module):
+        for p in module.parameters():
+            return p.dtype, p.device
+        return next(self.parameters()).dtype, next(self.parameters()).device
+

 __all__ = [
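The same two edits appear in modular_colqwen2.py by design: in transformers, modular files are the source of truth and modeling_colqwen2.py is regenerated from them with the repo's modular converter utility (utils/modular_model_converter.py), which is why the earlier commit notes it "shouldve been using modular".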