From 792387b1642f85ccc5f35c088b172c6e5d8dcdf6 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 30 Apr 2025 12:51:40 +0200
Subject: [PATCH 01/11] wip

---
 convert_hf_to_gguf.py          | 75 ++++++++++++++++++++++++++++++++++
 gguf-py/gguf/constants.py      |  6 +++
 gguf-py/gguf/gguf_writer.py    |  3 ++
 gguf-py/gguf/tensor_mapping.py | 21 ++++++++++
 4 files changed, 105 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index b9cea7e4699c6..37a9f36dc21b3 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2555,6 +2555,81 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+class Qwen2VLVisionModel(VisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["image_size"] = self.hparams.get("image_size", 560)
+        # rename config.json values
+        self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
+        self.hparams["num_hidden_layers"] = self.hparams.get("depth")
+        self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
+        self.hparams["hidden_size"] = self.hparams.get("embed_dim")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if self.global_config['model_type'] == 'qwen2_vl':
+            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
+        elif self.global_config['model_type'] == 'qwen2_5_vl':
+            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
+            self.gguf_writer.add_vision_use_silu(True)
+            # find n_wa_pattern (window attention pattern)
+            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
+            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
+            n_wa_pattern = fullatt_block_indexes[0] + 1
+            # validate n_wa_pattern
+            for i in range(1, len(fullatt_block_indexes)):
+                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
+                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
+            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
+        else:
+            raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
+        # default values below are taken from HF transformers code
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, name, n_dims # unused
+        if ".patch_embd." in new_name:
+            return gguf.GGMLQuantizationType.F16
+        if ".position_embd." in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        if name.startswith("visual."):
+            # process visual tensors
+            # split QKV tensors if needed
+            if ".qkv." in name:
+                if data_torch.ndim == 2: # weight
+                    c3, _ = data_torch.shape
+                else: # bias
+                    c3 = data_torch.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                return [
+                    (self.map_tensor_name(name.replace("qkv", "q")), wq),
+                    (self.map_tensor_name(name.replace("qkv", "k")), wk),
+                    (self.map_tensor_name(name.replace("qkv", "v")), wv),
+                ]
+            elif 'patch_embed.proj.weight' in name:
+                # split Conv3D into Conv2Ds
+                c1, c2, kt, kh, kw = data_torch.shape
+                del c1, c2, kh, kw # unused
+                assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
+                return [
+                    (self.map_tensor_name(name), data_torch[:, :, 0, ...]),
+                    (self.map_tensor_name(name + '.1'), data_torch[:, :, 1, ...]),
+                ]
+            else:
+                return [(self.map_tensor_name(name), data_torch)]
+        return [] # skip other tensors
+
+
 @ModelBase.register("WavTokenizerDec")
 class WavTokenizerDecModel(TextModel):
     model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 326ccdb071a79..ec3bd20e2e597 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -233,6 +233,7 @@ class ClipVision:
         IMAGE_STD = "clip.vision.image_std"
         USE_GELU = "clip.use_gelu"
         USE_SILU = "clip.use_silu"
+        N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
 
         class Attention:
             HEAD_COUNT = "clip.vision.attention.head_count"
@@ -479,6 +480,7 @@ class MODEL_TENSOR(IntEnum):
     V_MMPROJ_PEG = auto()
     V_ENC_EMBD_CLS = auto()
     V_ENC_EMBD_PATCH = auto()
+    V_ENC_EMBD_PATCH1 = auto() # qwen2vl
     V_ENC_EMBD_POS = auto()
     V_ENC_ATTN_Q = auto()
     V_ENC_ATTN_K = auto()
@@ -734,6 +736,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
     MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
+    MODEL_TENSOR.V_ENC_EMBD_PATCH1: "v.patch_embd.weight.1", # qwen2vl
     MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
     MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
     MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",
@@ -770,6 +773,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_MMPROJ_PEG,
         MODEL_TENSOR.V_ENC_EMBD_CLS,
         MODEL_TENSOR.V_ENC_EMBD_PATCH,
+        MODEL_TENSOR.V_ENC_EMBD_PATCH1,
         MODEL_TENSOR.V_ENC_EMBD_POS,
         MODEL_TENSOR.V_ENC_ATTN_Q,
         MODEL_TENSOR.V_ENC_ATTN_K,
@@ -2155,6 +2159,8 @@ class VisionProjectorType:
     GEMMA3 = "gemma3"
     IDEFICS3 = "idefics3"
    PIXTRAL = "pixtral"
+    QWEN2VL = "qwen2vl_merger"
+    QWEN25VL = "qwen2.5vl_merger"
 
 
 # Items here are (block size, type size)
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index f22a6d4a3472b..b796c11291a26 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -981,6 +981,9 @@ def add_vision_use_silu(self, value: bool) -> None:
     def add_vision_projector_scale_factor(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
 
+    def add_vision_n_wa_pattern(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 311d1ff69c799..0100d0f3333c2 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -896,6 +896,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_MMPROJ: (
             "multi_modal_projector.linear_{bid}",
+            "visual.merger.mlp.{bid}", # qwen2vl
         ),
 
         MODEL_TENSOR.V_MMPROJ_FC: (
@@ -919,6 +920,11 @@ class TensorNameMap:
             "vpm.embeddings.patch_embedding",
             "model.vision_model.embeddings.patch_embedding", # SmolVLM
             "vision_tower.patch_conv", # pixtral
+            "visual.patch_embed.proj", # qwen2vl
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_PATCH1: (
+            "visual.patch_embed.proj.weight.1", # qwen2vl, generated
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_POS: (
@@ -932,6 +938,7 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
+            "visual.blocks.{bid}.attn.q", # qwen2vl, generated
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K: (
@@ -939,6 +946,7 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
+            "visual.blocks.{bid}.attn.k", # qwen2vl, generated
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_V: (
@@ -946,6 +954,7 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
+            "visual.blocks.{bid}.attn.v", # qwen2vl, generated
         ),
 
         MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -953,6 +962,7 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.layer_norm1",
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
+            "visual.blocks.{bid}.norm1", # qwen2vl
         ),
 
         MODEL_TENSOR.V_ENC_OUTPUT: (
@@ -960,6 +970,7 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
+            "visual.blocks.{bid}.attn.proj", # qwen2vl
         ),
 
         MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
@@ -967,17 +978,24 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.layer_norm2",
             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
             "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
+            "visual.blocks.{bid}.norm2", # qwen2vl
         ),
 
+        # some namings are messed up because the original llava code swapped fc1 and fc2
+        # we have no better way to fix it, just be careful
+        # new models like pixtral use the correct naming
         MODEL_TENSOR.V_ENC_FFN_UP: (
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
             "vpm.encoder.layers.{bid}.mlp.fc1",
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped)
             "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
+            "visual.blocks.{bid}.mlp.fc2", # qwen2vl
+            "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (
             "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
+            "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
         ),
 
         MODEL_TENSOR.V_ENC_FFN_DOWN: (
@@ -985,6 +1003,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.mlp.fc2",
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped)
             "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
+            "visual.blocks.{bid}.mlp.fc1", # qwen2vl
+            "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
         ),
 
         MODEL_TENSOR.V_PRE_NORM: (
@@ -995,6 +1015,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_POST_NORM: (
             "vision_tower.vision_model.post_layernorm",
             "model.vision_model.post_layernorm", # SmolVLM
+            "visual.merger.ln_q", # qwen2vl
         ),
 
         MODEL_TENSOR.V_MM_INP_PROJ: (
From f7260c2d2b08e5a77995886bc17a4506cd929391 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 30 Apr 2025 13:06:01 +0200
Subject: [PATCH 02/11] qwen2.5vl ok

---
 convert_hf_to_gguf.py          | 7 ++++---
 gguf-py/gguf/tensor_mapping.py | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 37a9f36dc21b3..ed069e4f89d83 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1103,7 +1103,7 @@ def set_gguf_parameters(self):
 
         # preprocessor config
         self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
-        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_mean"])
+        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
 
     def write_vocab(self):
         raise ValueError("VisionModel does not support vocab writing")
@@ -2563,8 +2563,9 @@ def __init__(self, *args, **kwargs):
         # rename config.json values
         self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
         self.hparams["num_hidden_layers"] = self.hparams.get("depth")
-        self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
-        self.hparams["hidden_size"] = self.hparams.get("embed_dim")
+        if "embed_dim" in self.hparams: # qwen2vl
+            self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
+            self.hparams["hidden_size"] = self.hparams.get("embed_dim")
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 0100d0f3333c2..b113e3f116321 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -990,7 +990,7 @@ class TensorNameMap:
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped)
             "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
             "visual.blocks.{bid}.mlp.fc2", # qwen2vl
-            "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
+            "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1004,7 +1004,7 @@ class TensorNameMap:
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped)
             "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
             "visual.blocks.{bid}.mlp.fc1", # qwen2vl
-            "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
+            "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
         ),
 
         MODEL_TENSOR.V_PRE_NORM: (
From b5e72ed35022b3b41ac003bed5a2d02bb2f46ace Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 30 Apr 2025 17:22:28 +0200
Subject: [PATCH 03/11] vision: fix models missing "text_config"

---
 convert_hf_to_gguf.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 21a4cf719228c..0e447db709964 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1087,6 +1087,8 @@ def __init__(self, *args, **kwargs):
             raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
 
         # get n_embd of the text model
+        if "text_config" not in self.hparams:
+            self.hparams["text_config"] = {}
         text_config = {**self.hparams, **self.hparams["text_config"]}
         self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
         assert self.n_embd_text > 0, "n_embd not found in hparams"
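
[Editor's note] The conditional rename introduced in PATCH 02 exists because the two config schemas disagree: qwen2vl vision configs store the encoder width under `embed_dim`, while qwen2.5vl configs already use `hidden_size`, so the remap must only run when `embed_dim` is present. A sketch of the mechanics with illustrative dicts (the numbers are made up, not taken from real checkpoints):

```python
def normalize_vision_hparams(hparams: dict) -> dict:
    hparams["num_attention_heads"] = hparams.get("num_heads")
    hparams["num_hidden_layers"] = hparams.get("depth")
    if "embed_dim" in hparams:  # qwen2vl-style schema: remap to the common names
        hparams["intermediate_size"] = hparams.get("hidden_size")
        hparams["hidden_size"] = hparams.get("embed_dim")
    return hparams

qwen2vl_style = {"num_heads": 16, "depth": 32, "embed_dim": 1280, "hidden_size": 5120}
out = normalize_vision_hparams(dict(qwen2vl_style))
assert out["hidden_size"] == 1280 and out["intermediate_size"] == 5120

qwen25vl_style = {"num_heads": 16, "depth": 32, "hidden_size": 1280}
assert normalize_vision_hparams(dict(qwen25vl_style))["hidden_size"] == 1280  # untouched
```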
From 4fac7d4eaa4528a3f814732e8f1e60ed4c11566a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 30 Apr 2025 18:53:56 +0200
Subject: [PATCH 04/11] add test

---
 examples/llava/README.md | 10 ++++++++++
 examples/llava/tests.sh  | 16 +++++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/examples/llava/README.md b/examples/llava/README.md
index f58d9de7107e8..cd71116c18f7a 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -34,6 +34,16 @@ llama-mtmd-cli -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
 
 # Pixtral 12B
 llama-mtmd-cli -hf ggml-org/pixtral-12b-GGUF
+
+# Qwen 2 VL
+llama-mtmd-cli -hf ggml-org/Qwen2-VL-2B-Instruct
+llama-mtmd-cli -hf ggml-org/Qwen2-VL-7B-Instruct
+
+# Qwen 2.5 VL
+llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-3B-Instruct
+llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-7B-Instruct
+llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-32B-Instruct
+llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-72B-Instruct
 ```
 
 ## How it works and what is `mmproj`?
diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh
index 75604315cfeba..5030a0beadd33 100755
--- a/examples/llava/tests.sh
+++ b/examples/llava/tests.sh
@@ -36,12 +36,6 @@ add_test() {
     arr_tmpl+=("$tmpl")
 }
 
-add_test_big() {
-    if [ "$RUN_BIG_TESTS" = true ]; then
-        add_test "$@"
-    fi
-}
-
 add_test "llama-mtmd-cli" "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
 add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
@@ -58,7 +52,15 @@ add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
 
 # to test the big models, run: ./tests.sh big
-add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
+if [ "$RUN_BIG_TESTS" = true ]; then
+    add_test "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-2B-Instruct:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct:Q4_K_M"
+    # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct:Q4_K_M" # too big
+fi
 
 # these models always give the wrong answer, not sure why
 # add_test "llama-mtmd-cli" "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
From 474933e252407c9f9be1bf63d2331c76efd9eac5 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 30 Apr 2025 19:12:14 +0200
Subject: [PATCH 05/11] fix test repo name

---
 examples/llava/README.md | 12 ++++++------
 examples/llava/tests.sh  | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/examples/llava/README.md b/examples/llava/README.md
index cd71116c18f7a..2e2f5508a3cbf 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -36,14 +36,14 @@ llama-mtmd-cli -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
 llama-mtmd-cli -hf ggml-org/pixtral-12b-GGUF
 
 # Qwen 2 VL
-llama-mtmd-cli -hf ggml-org/Qwen2-VL-2B-Instruct
-llama-mtmd-cli -hf ggml-org/Qwen2-VL-7B-Instruct
+llama-mtmd-cli -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
+llama-mtmd-cli -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
 
 # Qwen 2.5 VL
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-3B-Instruct
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-7B-Instruct
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-32B-Instruct
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-72B-Instruct
+llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
+llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
+llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
+llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
 ```
 
 ## How it works and what is `mmproj`?
diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh
index 5030a0beadd33..de29c60077e5a 100755
--- a/examples/llava/tests.sh
+++ b/examples/llava/tests.sh
@@ -54,12 +54,12 @@ add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
 # to test the big models, run: ./tests.sh big
 if [ "$RUN_BIG_TESTS" = true ]; then
     add_test "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-2B-Instruct:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct:Q4_K_M"
-    # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct:Q4_K_M" # too big
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M"
+    # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M" # too big
 fi
 
 # these models always give the wrong answer, not sure why
From 651752f1ae25fe8a01c1e57c18cf2eca80b2774e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 30 Apr 2025 22:04:06 +0200
Subject: [PATCH 06/11] fix 32B model

---
 convert_hf_to_gguf.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 0e447db709964..04a55f159d2e7 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2579,15 +2579,20 @@ def set_gguf_parameters(self):
         elif self.global_config['model_type'] == 'qwen2_5_vl':
             self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
             self.gguf_writer.add_vision_use_silu(True)
-            # find n_wa_pattern (window attention pattern)
-            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
-            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
-            n_wa_pattern = fullatt_block_indexes[0] + 1
-            # validate n_wa_pattern
-            for i in range(1, len(fullatt_block_indexes)):
-                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
-                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
-            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
+            out_hidden_size = hparams.get("out_hidden_size")
+            if out_hidden_size == 5120:
+                # 32B model does not have n_wa_pattern, the other models do
+                self.gguf_writer.add_vision_n_wa_pattern(0)
+            else:
+                # find n_wa_pattern (window attention pattern)
+                fullatt_block_indexes = hparams.get("fullatt_block_indexes")
+                assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
+                n_wa_pattern = fullatt_block_indexes[0] + 1
+                # validate n_wa_pattern
+                for i in range(1, len(fullatt_block_indexes)):
+                    if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
+                        raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
+                self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
         else:
             raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
         # default values below are taken from HF transformers code
From d96ef53a8cfd8d91ffd370604c2ecc284e972d92 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 30 Apr 2025 22:34:13 +0200
Subject: [PATCH 07/11] Revert "fix 32B model"

This reverts commit 651752f1ae25fe8a01c1e57c18cf2eca80b2774e.

---
 convert_hf_to_gguf.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 04a55f159d2e7..0e447db709964 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2579,20 +2579,15 @@ def set_gguf_parameters(self):
         elif self.global_config['model_type'] == 'qwen2_5_vl':
             self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
             self.gguf_writer.add_vision_use_silu(True)
-            out_hidden_size = hparams.get("out_hidden_size")
-            if out_hidden_size == 5120:
-                # 32B model does not have n_wa_pattern, the other models do
-                self.gguf_writer.add_vision_n_wa_pattern(0)
-            else:
-                # find n_wa_pattern (window attention pattern)
-                fullatt_block_indexes = hparams.get("fullatt_block_indexes")
-                assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
-                n_wa_pattern = fullatt_block_indexes[0] + 1
-                # validate n_wa_pattern
-                for i in range(1, len(fullatt_block_indexes)):
-                    if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
-                        raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
-                self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
+            # find n_wa_pattern (window attention pattern)
+            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
+            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
+            n_wa_pattern = fullatt_block_indexes[0] + 1
+            # validate n_wa_pattern
+            for i in range(1, len(fullatt_block_indexes)):
+                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
+                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
+            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
         else:
             raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
         # default values below are taken from HF transformers code
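
[Editor's note] For context on the special case that PATCH 07 reverts: a `n_wa_pattern` of 0 would have declared the 32B model all-full-attention, while a nonzero value marks every `n_wa_pattern`-th block as full attention. A sketch of how a consumer of the GGUF key could interpret the value (this is an assumption about the reader side, not code from this series):

```python
def block_uses_full_attention(il: int, n_wa_pattern: int) -> bool:
    """Return True if encoder block `il` (0-based) should use full attention."""
    # 0 means window attention is disabled entirely
    return n_wa_pattern == 0 or (il + 1) % n_wa_pattern == 0

# with n_wa_pattern == 8, blocks 7, 15, ... get full attention
assert [il for il in range(16) if block_uses_full_attention(il, 8)] == [7, 15]
# with n_wa_pattern == 0, every block does
assert all(block_uses_full_attention(il, 0) for il in range(16))
```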
From 13e4cccb499bdecc8a173d58bd5ee5648ae5a8d2 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 30 Apr 2025 22:41:23 +0200
Subject: [PATCH 08/11] clarify about 32B

---
 examples/llava/README.md | 2 +-
 examples/llava/tests.sh  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llava/README.md b/examples/llava/README.md
index 2e2f5508a3cbf..8be7453345c8a 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -42,8 +42,8 @@ llama-mtmd-cli -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
 # Qwen 2.5 VL
 llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
 llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
 llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
+# NOTE: Qwen2.5-VL-32B text-only model is currently unusable
 ```
 
 ## How it works and what is `mmproj`?
diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh
index de29c60077e5a..dafe5b1662366 100755
--- a/examples/llava/tests.sh
+++ b/examples/llava/tests.sh
@@ -58,7 +58,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
     add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
     add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
     add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M"
+    # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # text model is broken, not sure why
     # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M" # too big
 fi
From 6e31ddcd2c5a9dd3583accfd62728f77d6a6e201 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 30 Apr 2025 22:46:31 +0200
Subject: [PATCH 09/11] rm qwen surgery script

---
 examples/llava/qwen2_vl_surgery.py | 217 -----------------------------
 1 file changed, 217 deletions(-)
 delete mode 100644 examples/llava/qwen2_vl_surgery.py

diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py
deleted file mode 100644
index 7951a6fa8951e..0000000000000
--- a/examples/llava/qwen2_vl_surgery.py
+++ /dev/null
@@ -1,217 +0,0 @@
-import argparse
-from typing import Dict, List, Optional
-
-import torch
-import numpy as np
-from gguf import *
-from transformers import (
-    AutoProcessor,
-    Qwen2VLConfig,
-    Qwen2VLProcessor,
-    Qwen2VLForConditionalGeneration,
-    Qwen2_5_VLConfig, # type: ignore[reportAttributeAccessIssue]
-    Qwen2_5_VLForConditionalGeneration, # type: ignore[reportAttributeAccessIssue]
-)
-
-
-VISION = "clip.vision"
-
-
-def k(raw_key: str, arch: str) -> str:
-    return raw_key.format(arch=arch)
-
-
-def get_n_wa_pattern(fullatt_block_indexes: Optional[List[int]]):
-    if fullatt_block_indexes is None:
-        return 0
-    n_wa = fullatt_block_indexes[0]
-    for a, b in zip(fullatt_block_indexes, fullatt_block_indexes[1:]):
-        if b - a - 1 != n_wa:
-            raise ValueError(
-                f"window/full attention layer should have fix pattern of "
-                f"for each full-attention layer followed by {n_wa} window-attention layers"
-            )
-    return n_wa + 1
-
-
-class VL2:
-
-    @staticmethod
-    def to_gguf_name(name: str) -> str:
-        og = name
-        name = name.replace("text_model", "t").replace("vision_model", "v")
-        name = name.replace("blocks", "blk").replace("embeddings.", "")
-        name = name.replace("attn.", "attn_")
-        name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
-        # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
-        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
-        name = name.replace("merger.mlp", 'mm')
-        print(f"[to_gguf_name] {og} --> {name}")
-        return name
-
-    @classmethod
-    def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
-        vision_model = qwen2vl.visual
-        tensor_map = {}
-        for name, ten in vision_model.state_dict().items():
-            ten = ten.numpy()
-            if 'qkv' in name:
-                if ten.ndim == 2: # weight
-                    c3, _ = ten.shape
-                else: # bias
-                    c3 = ten.shape[0]
-                assert c3 % 3 == 0
-                c = c3 // 3
-                wq = ten[:c]
-                wk = ten[c: c * 2]
-                wv = ten[c * 2:]
-                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
-                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
-                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
-            elif 'merger' in name:
-                if name.endswith("ln_q.weight"):
-                    tensor_map['v.post_ln.weight'] = ten
-                elif name.endswith("ln_q.bias"):
-                    tensor_map['v.post_ln.bias'] = ten
-                else:
-                    # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
-                    tensor_map[cls.to_gguf_name(name)] = ten
-            elif 'patch_embed.proj.weight' in name:
-                # NOTE: split Conv3D into Conv2Ds
-                c1, c2, kt, kh, kw = ten.shape
-                assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
-                tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
-                tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
-            else:
-                tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten
-
-        for new_name, ten in tensor_map.items():
-            if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
-                tensor_map[new_name] = ten.astype(np.float32)
-            else:
-                tensor_map[new_name] = ten.astype(dtype)
-        tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
-        return tensor_map
-
-
-class VL25(VL2):
-
-    @staticmethod
-    def to_gguf_name(name: str) -> str:
-        og = name
-        name = name.replace("text_model", "t").replace("vision_model", "v")
-        name = name.replace("blocks", "blk").replace("embeddings.", "")
-        name = name.replace("attn.", "attn_")
-        name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
-        name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
-        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
-        name = name.replace("merger.mlp", 'mm')
-        print(f"[vl25][to_gguf_name] {og} --> {name}")
-        return name
-
-
-def main(args):
-    if args.data_type == 'fp32':
-        dtype = torch.float32
-        np_dtype = np.float32
-        ftype = 0
-    elif args.data_type == 'fp16':
-        dtype = torch.float16
-        np_dtype = np.float16
-        ftype = 1
-    else:
-        raise ValueError()
-
-    local_model = False
-    model_path = ""
-    model_name = args.model_name
-    print("model_name: ", model_name)
-    if args.model_type == "qwen2vl":
-        qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_name, torch_dtype=dtype, device_map="cpu"
-        )
-        cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
-        vcfg = cfg.vision_config
-    else:
-        qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            model_name, torch_dtype=dtype, device_map="cpu"
-        )
-        cfg: Qwen2_5_VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
-        vcfg = cfg.vision_config
-
-    if os.path.isdir(model_name):
-        local_model = True
-        if model_name.endswith(os.sep):
-            model_name = model_name[:-1]
-        model_path = model_name
-        model_name = os.path.basename(model_name)
-    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
-
-    fout = GGUFWriter(path=fname_out, arch="clip")
-    fout.add_description("image encoder for Qwen2VL")
-
-    fout.add_file_type(ftype)
-    fout.add_bool("clip.has_text_encoder", False)
-    fout.add_bool("clip.has_vision_encoder", True)
-    fout.add_bool("clip.has_qwen2vl_merger", True)
-
-    print(cfg.vision_config)
-    if 'silu' in cfg.vision_config.hidden_act.lower():
-        fout.add_bool("clip.use_silu", True)
-        fout.add_bool("clip.use_gelu", False)
-    elif 'gelu' in cfg.vision_config.hidden_act.lower():
-        fout.add_bool("clip.use_silu", False)
-        fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
-    else:
-        raise ValueError()
-
-    if args.model_type == "qwen2.5vl":
-        fout.add_uint32("clip.vision.n_wa_pattern", get_n_wa_pattern(vcfg.fullatt_block_indexes))
-        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
-        fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
-        fout.add_string("clip.projector_type", "qwen2.5vl_merger")
-    else:
-        fout.add_string("clip.projector_type", "qwen2vl_merger")
-        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
-        fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
-
-    if args.model_type == "qwen2.5vl":
-        tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
-    else:
-        tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
-    for name, data in tensor_map.items():
-        fout.add_tensor(name, data)
-
-    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
-    fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
-    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
-    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
-    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
-    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder
-    fout.add_name(model_name)
-    """
-    HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
-        it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
-    """
-
-    if local_model:
-        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
-    else:
-        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
-    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
-    fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
-
-    fout.write_header_to_file()
-    fout.write_kv_data_to_file()
-    fout.write_tensors_to_file()
-    fout.close()
-    print("save model as: ", fname_out)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
-    parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
-    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
-    args = parser.parse_args()
-    main(args)
From ef0bc7aac16e7c67f4ef66ffc734e96ac83b61fc Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 30 Apr 2025 22:50:32 +0200
Subject: [PATCH 10/11] update llava/readme

---
 examples/llava/README.md | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/examples/llava/README.md b/examples/llava/README.md
index 8be7453345c8a..02b980d367199 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -67,7 +67,16 @@ Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advanta
 
 ## How to obtain `mmproj`
 
-Multimodal projector (`mmproj`) files are specific to each model architecture. Please refer to the relevant guide for instructions on how to obtain or create them:
+Multimodal projector (`mmproj`) files are specific to each model architecture.
+
+For the following models, you can use `convert_hf_to_gguf.py` with `--mmproj` flag to get the `mmproj` file:
+- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - Note: 1B variant does not have vision support
+- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
+- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
+- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
+- Qwen 2 VL and Qwen 2.5 VL
+
+For older models, please refer to the relevant guide for instructions on how to obtain or create them:
 
 - [LLaVA](../../docs/multimodal/llava.md)
 - [MobileVLM](../../docs/multimodal/MobileVLM.md)
@@ -76,10 +85,3 @@ Multimodal projector (`mmproj`) files are specific to each model architecture. P
 - [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md)
 - [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
 - [IBM Granite Vision](../../docs/multimodal/granitevision.md)
-- [Google Gemma 3](../../docs/multimodal/gemma3.md)
-
-For the following models, you can use `convert_hf_to_gguf.py`with `--mmproj` flag to get the `mmproj` file:
-- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - Note: 1B variant does not have vision support
-- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
-- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
-- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
From c030984b5a30eb2e5530516bfd7c4b8a42dbcf52 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 2 May 2025 15:39:27 +0200
Subject: [PATCH 11/11] move V_ENC_EMBD_PATCH handling to Qwen2VLVisionModel

---
 convert_hf_to_gguf.py          | 4 ++--
 gguf-py/gguf/constants.py      | 3 ---
 gguf-py/gguf/tensor_mapping.py | 4 ----
 3 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 697f82c3e56f8..ff82a85a9d7cd 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2653,8 +2653,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 del c1, c2, kh, kw # unused
                 assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
                 return [
-                    (self.map_tensor_name(name), data_torch[:, :, 0, ...]),
-                    (self.map_tensor_name(name + '.1'), data_torch[:, :, 1, ...]),
+                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]),
+                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
                 ]
             else:
                 return [(self.map_tensor_name(name), data_torch)]
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 3f9e4c091ca4c..74e46c3ee0f95 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -481,7 +481,6 @@ class MODEL_TENSOR(IntEnum):
     V_MMPROJ_PEG = auto()
     V_ENC_EMBD_CLS = auto()
     V_ENC_EMBD_PATCH = auto()
-    V_ENC_EMBD_PATCH1 = auto() # qwen2vl
     V_ENC_EMBD_POS = auto()
     V_ENC_ATTN_Q = auto()
     V_ENC_ATTN_K = auto()
@@ -739,7 +738,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
     MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
-    MODEL_TENSOR.V_ENC_EMBD_PATCH1: "v.patch_embd.weight.1", # qwen2vl
     MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
     MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
     MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",
@@ -778,7 +776,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_MMPROJ_PEG,
         MODEL_TENSOR.V_ENC_EMBD_CLS,
         MODEL_TENSOR.V_ENC_EMBD_PATCH,
-        MODEL_TENSOR.V_ENC_EMBD_PATCH1,
         MODEL_TENSOR.V_ENC_EMBD_POS,
         MODEL_TENSOR.V_ENC_ATTN_Q,
         MODEL_TENSOR.V_ENC_ATTN_K,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index b6e1fba4c7bf4..2b089f84a841a 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -923,10 +923,6 @@ class TensorNameMap:
             "visual.patch_embed.proj", # qwen2vl
         ),
 
-        MODEL_TENSOR.V_ENC_EMBD_PATCH1: (
-            "visual.patch_embed.proj.weight.1", # qwen2vl, generated
-        ),
-
         MODEL_TENSOR.V_ENC_EMBD_POS: (
             "vision_tower.vision_model.embeddings.position_embedding",
             "vpm.embeddings.position_embedding",
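
[Editor's note] PATCH 11 changes only where the two patch-embedding slices are written, not the split itself. The split (introduced in PATCH 01) is lossless for `temporal_patch_size == 2` because a Conv3D over a two-frame patch equals the sum of two Conv2Ds, one per temporal slice. A small numerical check with illustrative shapes:

```python
import torch
import torch.nn.functional as F

c_out, c_in, kh, kw = 8, 3, 14, 14
w3d = torch.randn(c_out, c_in, 2, kh, kw)  # Conv3D weight with temporal_patch_size = 2
x = torch.randn(1, c_in, 2, kh, kw)        # one spatio-temporal patch (two frames)

y3d = F.conv3d(x, w3d).flatten()           # full 3-D patch embedding
# apply each temporal slice as its own Conv2D, then sum
y2d = (F.conv2d(x[:, :, 0], w3d[:, :, 0]) + F.conv2d(x[:, :, 1], w3d[:, :, 1])).flatten()

assert torch.allclose(y3d, y2d, atol=1e-4)
```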