@@ -5062,7 +5062,7 @@ def set_gguf_parameters(self):
50625062 self.gguf_writer.add_add_bos_token(False)
50635063
50645064
5065- @ModelBase.register("Phi3ForCausalLM")
5065+ @ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV" )
50665066class Phi3MiniModel(TextModel):
50675067 model_arch = gguf.MODEL_ARCH.PHI3
50685068
@@ -5237,6 +5237,129 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
52375237 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
52385238 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
52395239
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Convert a text-model tensor, ignoring vision-side tensors.

    Tensors whose names begin with a vision-tower or mm-projector prefix
    produce no output from this converter. Every other tensor is handed to
    the parent class's ``modify_tensors`` with its arguments unchanged.

    Args:
        data_torch: Tensor data read from the checkpoint.
        name: Checkpoint tensor name.
        bid: Block index associated with the tensor, or ``None``.

    Yields:
        Whatever the parent implementation yields for non-vision tensors.
    """
    # Name prefixes of tensors this text-model converter does not emit.
    ignored_prefixes = (
        "model.vision_tower.",
        "vision_tower.",
        "model.mm_projector.",
        "mm_projector.",
    )

    if not name.startswith(ignored_prefixes):
        yield from super().modify_tensors(data_torch, name, bid)
5245+
5246+
@ModelBase.register("Phi4ForCausalLMV")
class Phi4VisionMmprojModel(MmprojModel):
    """Multimodal-projector converter for ``Phi4ForCausalLMV`` checkpoints.

    Exports the vision tower (minus its final encoder block, post-layernorm
    and head) together with entries ``0`` and ``2`` of the ``mm_projector``
    module. All other tensors are ignored by this converter.
    """

    def __init__(self, *args, **kwargs):
        """Derive the vision hyper-parameters needed for the export.

        Raises:
            ValueError: If the vision encoder has fewer than two layers, or
                the position-embedding row count is not a perfect square.
            KeyError: If ``patch_size``, the position-embedding tensor, or
                the ``min_num_patches`` / ``max_num_patches`` settings are
                missing.
        """
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None

        self.vision_total_layers = int(self.find_vparam(self.n_block_keys))
        if self.vision_total_layers < 2:
            raise ValueError(
                f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}"
            )

        # Phi-4 consumes SigLIP2 hidden_states[-2]: the final encoder block is
        # not exported, and neither are the post-layernorm/head weights, so the
        # GGUF runtime output matches the feature map the Phi-4 projector path
        # in the patched siglip.cpp expects.
        self.vision_last_layer_idx = self.vision_total_layers - 1
        self.vision_export_layers = self.vision_total_layers - 1

        # Overwrite the first layer-count key present in the vision config so
        # it reflects the reduced number of exported blocks.
        layer_count_key = next(
            (candidate for candidate in self.n_block_keys if candidate in self.hparams_vision),
            None,
        )
        if layer_count_key is not None:
            self.hparams_vision[layer_count_key] = self.vision_export_layers

        self.block_count = self.vision_export_layers
        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)

        patch_size = self.preprocessor_config.get("patch_size")
        if patch_size is None:
            raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json")

        self.hparams_vision["patch_size"] = patch_size

        # Locate the position-embedding table; its row count gives the base
        # grid from which the image size is derived.
        for pos_emb_name in self.model_tensors:
            if pos_emb_name.endswith("vision_model.embeddings.position_embedding.weight"):
                break
        else:
            raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight")

        pos_emb_shape = self.model_tensors[pos_emb_name]().shape
        base_grid_tokens = int(pos_emb_shape[0])
        grid_side = math.isqrt(base_grid_tokens)
        if grid_side * grid_side != base_grid_tokens:
            raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}")

        self.hparams_vision["image_size"] = grid_side * patch_size

        # The preprocessor config takes precedence; the global config is the
        # fallback for both patch-count limits.
        preprocessor_cfg = self.preprocessor_config
        min_num_patches = preprocessor_cfg.get("min_num_patches", self.global_config.get("min_num_patches"))
        max_num_patches = preprocessor_cfg.get("max_num_patches", self.global_config.get("max_num_patches"))
        if min_num_patches is None or max_num_patches is None:
            raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches")

        # Patch counts are converted to pixel counts (patches * patch_size^2).
        self.min_pixels = int(min_num_patches) * patch_size * patch_size
        self.max_pixels = int(max_num_patches) * patch_size * patch_size

    def set_gguf_parameters(self):
        """Write the Phi-4 specific vision metadata after the base metadata."""
        super().set_gguf_parameters()
        assert self.hparams_vision is not None

        writer = self.gguf_writer
        writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4)
        writer.add_vision_min_pixels(self.min_pixels)
        writer.add_vision_max_pixels(self.max_pixels)
        writer.add_vision_use_gelu(True)
        writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to its mmproj GGUF counterpart(s).

        Args:
            data_torch: Tensor data read from the checkpoint.
            name: Checkpoint tensor name.
            bid: Block index associated with the tensor, or ``None``.

        Yields:
            ``(gguf_name, tensor)`` pairs. Nothing is yielded for dropped
            vision tensors, unsupported projector entries, or tensors that
            belong to neither the vision tower nor the projector.

        Raises:
            ValueError: If the patch-embedding weight is not 2-D or its input
                dimension is not a multiple of the patch area.
        """
        vision_prefixes = ("model.vision_tower.vision_tower.", "vision_tower.")
        projector_prefixes = ("model.mm_projector.", "mm_projector.")

        if name.startswith(vision_prefixes):
            new_name = name.replace("model.vision_tower.vision_tower.", "vision_tower.")

            # Head, post-layernorm and the final encoder block are not exported.
            is_dropped = (
                ".vision_model.head." in name
                or ".vision_model.post_layernorm." in new_name
                or (bid is not None and bid == self.vision_last_layer_idx)
            )
            if is_dropped:
                return

            if new_name.endswith("vision_model.embeddings.patch_embedding.weight"):
                assert self.hparams_vision is not None
                if data_torch.ndim != 2:
                    raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}")

                patch_size = self.hparams_vision["patch_size"]
                patch_area = patch_size ** 2
                out_features, in_features = data_torch.shape
                num_channels, leftover = divmod(in_features, patch_area)
                if leftover != 0:
                    raise ValueError(
                        f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}"
                    )

                # Unflatten the 2-D weight to (out, patch, patch, channels),
                # then move the channel axis to position 1.
                data_torch = data_torch.view(out_features, patch_size, patch_size, num_channels)
                data_torch = data_torch.permute(0, 3, 1, 2)

            yield from super().modify_tensors(data_torch, new_name, bid)
            return

        if name.startswith(projector_prefixes):
            local_name = name.replace("model.mm_projector.", "").replace("mm_projector.", "")

            # Only projector entries 0 and 2 are exported.
            # NOTE(review): presumably these are the two linear layers of a
            # sequential MLP with an activation at index 1 — confirm.
            if not local_name.startswith(("0.", "2.")):
                return

            index_text, _, _ = local_name.partition(".")
            mm_idx = int(index_text)
            suffix = ".bias" if local_name.endswith(".bias") else ".weight"
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch)
5362+
52405363
52415364@ModelBase.register("PhiMoEForCausalLM")
52425365class PhiMoeModel(Phi3MiniModel):
0 commit comments