Skip to content

Commit fdb1764

Browse files
dranger003 and ngxson authored
model : add support for Phi4ForCausalLMV (#20168)
* Add support for Phi4ForCausalLMV. * Fix Phi-4 vision parity (correcting SigLIP2 patch-kernel export layout) and matching HF NaFlex resize behavior in mtmd. * Rename constants + fix tokenizer label * Clean-ups. * Fix GGUF export. * Set tokenizer.ggml.pre explicitly. * Default vocab name rather than forcing it. * Clean-ups. * Fix indent. * Fix subscriptable error. * Remove overcomplicated code path * Clean-ups. --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
1 parent 1eea6a2 commit fdb1764

6 files changed

Lines changed: 158 additions & 2 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 124 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5062,7 +5062,7 @@ def set_gguf_parameters(self):
50625062
self.gguf_writer.add_add_bos_token(False)
50635063

50645064

5065-
@ModelBase.register("Phi3ForCausalLM")
5065+
@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV")
50665066
class Phi3MiniModel(TextModel):
50675067
model_arch = gguf.MODEL_ARCH.PHI3
50685068

@@ -5237,6 +5237,129 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
52375237
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
52385238
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
52395239

5240+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Yield converted tensors, skipping vision-tower and multimodal-projector weights.

    Tensors whose name starts with one of the vision-tower / mm-projector
    prefixes produce no output from this converter; every other tensor is
    delegated unchanged to the parent implementation.

    Args:
        data_torch: Tensor data for the checkpoint entry.
        name: Checkpoint tensor name.
        bid: Block index, or None when the tensor is not tied to a block.

    Yields:
        Whatever ``(new_name, tensor)`` pairs the parent implementation yields.
    """
    multimodal_prefixes = (
        "model.vision_tower.",
        "vision_tower.",
        "model.mm_projector.",
        "mm_projector.",
    )

    belongs_to_text_model = not name.startswith(multimodal_prefixes)
    if belongs_to_text_model:
        yield from super().modify_tensors(data_torch, name, bid)
5245+
5246+
5247+
@ModelBase.register("Phi4ForCausalLMV")
class Phi4VisionMmprojModel(MmprojModel):
    """Mmproj converter registered for ``Phi4ForCausalLMV`` checkpoints.

    Exports the vision-tower tensors (minus the last encoder block, the
    post-layernorm and the head) together with projector layers 0 and 2.
    """

    def __init__(self, *args, **kwargs):
        """Derive the exported layer count, patch/image size and pixel limits.

        Raises:
            ValueError: If fewer than 2 vision layers are configured, or the
                position embedding's first dimension is not a perfect square.
            KeyError: If ``patch_size``, the position embedding tensor, or the
                min/max patch counts cannot be found.
        """
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        vision_cfg = self.hparams_vision

        self.vision_total_layers = int(self.find_vparam(self.n_block_keys))
        if self.vision_total_layers < 2:
            raise ValueError(
                f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}"
            )

        # Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and
        # drop post-layernorm/head weights. This makes the GGUF runtime output match
        # the feature map consumed by the patched siglip.cpp Phi-4 projector path.
        self.vision_export_layers = self.vision_total_layers - 1
        self.vision_last_layer_idx = self.vision_total_layers - 1

        # Only the first block-count key present in the vision hparams is rewritten.
        block_key = next((key for key in self.n_block_keys if key in vision_cfg), None)
        if block_key is not None:
            vision_cfg[block_key] = self.vision_export_layers

        self.block_count = self.vision_export_layers
        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)

        patch_size = self.preprocessor_config.get("patch_size")
        if patch_size is None:
            raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json")

        vision_cfg["patch_size"] = patch_size

        # Locate the first tensor holding the learned position embedding.
        for pos_emb_name in self.model_tensors:
            if pos_emb_name.endswith("vision_model.embeddings.position_embedding.weight"):
                break
        else:
            raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight")

        pos_emb_shape = self.model_tensors[pos_emb_name]().shape
        base_grid_tokens = int(pos_emb_shape[0])
        grid_side = math.isqrt(base_grid_tokens)
        if grid_side * grid_side != base_grid_tokens:
            raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}")

        # Image side length = tokens per grid side * patch side length.
        vision_cfg["image_size"] = grid_side * patch_size

        # preprocessor_config takes precedence; global_config is the fallback.
        patch_limits = {
            key: self.preprocessor_config.get(key, self.global_config.get(key))
            for key in ("min_num_patches", "max_num_patches")
        }
        if any(limit is None for limit in patch_limits.values()):
            raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches")

        # Pixel limits are the patch-count limits scaled by the patch area.
        self.min_pixels = int(patch_limits["min_num_patches"]) * patch_size * patch_size
        self.max_pixels = int(patch_limits["max_num_patches"]) * patch_size * patch_size

    def set_gguf_parameters(self):
        """Write the base parameters, then the Phi-4 projector type and vision settings."""
        super().set_gguf_parameters()
        assert self.hparams_vision is not None

        writer = self.gguf_writer
        writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4)
        writer.add_vision_min_pixels(self.min_pixels)
        writer.add_vision_max_pixels(self.max_pixels)
        writer.add_vision_use_gelu(True)

        layer_norm_eps = self.hparams_vision.get("layer_norm_eps", 1e-6)
        writer.add_vision_attention_layernorm_eps(layer_norm_eps)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Yield the exported form of a vision-tower or projector tensor.

        Vision-tower tensors are renamed and passed to the parent, except the
        head, the post-layernorm and the last encoder block, which are dropped.
        Projector tensors are emitted only for layers ``0`` and ``2``. Any
        other tensor yields nothing.

        Raises:
            ValueError: If the patch embedding weight is not 2-D, or its input
                dimension is not divisible by the patch area.
        """
        tower_prefixes = ("model.vision_tower.vision_tower.", "vision_tower.")
        projector_prefixes = ("model.mm_projector.", "mm_projector.")

        if name.startswith(tower_prefixes):
            new_name = name.replace("model.vision_tower.vision_tower.", "vision_tower.")

            not_exported = (
                ".vision_model.head." in name
                or ".vision_model.post_layernorm." in new_name
                or bid == self.vision_last_layer_idx
            )
            if not_exported:
                return

            if new_name.endswith("vision_model.embeddings.patch_embedding.weight"):
                assert self.hparams_vision is not None
                if data_torch.ndim != 2:
                    raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}")

                patch_size = self.hparams_vision["patch_size"]
                patch_area = patch_size ** 2
                out_features, in_features = data_torch.shape
                num_channels, leftover = divmod(in_features, patch_area)
                if leftover != 0:
                    raise ValueError(
                        f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}"
                    )

                # Reinterpret the flat weight as (out, patch, patch, channels),
                # then move the channel axis to position 1.
                data_torch = data_torch.view(out_features, patch_size, patch_size, num_channels).permute(0, 3, 1, 2)

            yield from super().modify_tensors(data_torch, new_name, bid)

        elif name.startswith(projector_prefixes):
            local_name = name.replace("model.mm_projector.", "").replace("mm_projector.", "")

            # Only projector entries indexed 0 and 2 are exported.
            if not local_name.startswith(("0.", "2.")):
                return

            mm_idx = int(local_name.partition(".")[0])
            suffix = ".bias" if local_name.endswith(".bias") else ".weight"
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch)
5362+
52405363

52415364
@ModelBase.register("PhiMoEForCausalLM")
52425365
class PhiMoeModel(Phi3MiniModel):

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3881,6 +3881,7 @@ class VisionProjectorType:
38813881
GEMMA3 = "gemma3"
38823882
GEMMA3NV = "gemma3nv"
38833883
GEMMA3NA = "gemma3na"
3884+
PHI4 = "phi4"
38843885
IDEFICS3 = "idefics3"
38853886
PIXTRAL = "pixtral"
38863887
LLAMA4 = "llama4"

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ enum projector_type {
216216
PROJECTOR_TYPE_GEMMA3,
217217
PROJECTOR_TYPE_GEMMA3NV,
218218
PROJECTOR_TYPE_GEMMA3NA,
219+
PROJECTOR_TYPE_PHI4,
219220
PROJECTOR_TYPE_IDEFICS3,
220221
PROJECTOR_TYPE_PIXTRAL,
221222
PROJECTOR_TYPE_QWEN25VL,
@@ -253,6 +254,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
253254
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
254255
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
255256
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
257+
{ PROJECTOR_TYPE_PHI4, "phi4"},
256258
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
257259
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
258260
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},

tools/mtmd/clip.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
792792
case PROJECTOR_TYPE_IDEFICS3:
793793
case PROJECTOR_TYPE_LFM2:
794794
case PROJECTOR_TYPE_JANUS_PRO:
795+
case PROJECTOR_TYPE_PHI4:
795796
{
796797
builder = std::make_unique<clip_graph_siglip>(ctx, img);
797798
} break;
@@ -1144,6 +1145,13 @@ struct clip_model_loader {
11441145
// ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
11451146
hparams.set_limit_image_tokens(64, 256);
11461147
} break;
1148+
case PROJECTOR_TYPE_PHI4:
1149+
{
1150+
hparams.n_merge = 1;
1151+
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
1152+
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
1153+
hparams.set_warmup_n_tokens(16*16);
1154+
} break;
11471155
case PROJECTOR_TYPE_PIXTRAL:
11481156
case PROJECTOR_TYPE_LIGHTONOCR:
11491157
{
@@ -1841,6 +1849,13 @@ struct clip_model_loader {
18411849
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
18421850
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
18431851
} break;
1852+
case PROJECTOR_TYPE_PHI4:
1853+
{
1854+
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
1855+
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
1856+
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
1857+
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
1858+
} break;
18441859
case PROJECTOR_TYPE_LFM2A:
18451860
{
18461861
for (int i : {0, 2, 3, 5, 6}) {
@@ -3157,6 +3172,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
31573172
res_imgs->entries.push_back(std::move(img_f32));
31583173
} break;
31593174

3175+
case PROJECTOR_TYPE_PHI4:
31603176
case PROJECTOR_TYPE_PIXTRAL:
31613177
case PROJECTOR_TYPE_LIGHTONOCR:
31623178
{
@@ -3383,6 +3399,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
33833399
case PROJECTOR_TYPE_MLP:
33843400
case PROJECTOR_TYPE_MLP_NORM:
33853401
case PROJECTOR_TYPE_JANUS_PRO:
3402+
case PROJECTOR_TYPE_PHI4:
33863403
{
33873404
// do nothing
33883405
} break;
@@ -3884,6 +3901,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
38843901
case PROJECTOR_TYPE_VOXTRAL:
38853902
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
38863903
case PROJECTOR_TYPE_JANUS_PRO:
3904+
case PROJECTOR_TYPE_PHI4:
38873905
case PROJECTOR_TYPE_COGVLM:
38883906
{
38893907
// do nothing
@@ -4013,6 +4031,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
40134031
case PROJECTOR_TYPE_LDPV2:
40144032
return ctx->model.mm_model_peg_0_b->ne[0];
40154033
case PROJECTOR_TYPE_MLP:
4034+
case PROJECTOR_TYPE_PHI4:
40164035
case PROJECTOR_TYPE_PIXTRAL:
40174036
case PROJECTOR_TYPE_LIGHTONOCR:
40184037
return ctx->model.mm_2_w->ne[1];

tools/mtmd/models/siglip.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ ggml_cgraph * clip_graph_siglip::build() {
44
ggml_tensor * inp = build_inp();
55

66
ggml_tensor * learned_pos_embd = model.position_embeddings;
7-
if (proj_type == PROJECTOR_TYPE_LFM2) {
7+
if (proj_type == PROJECTOR_TYPE_LFM2 || proj_type == PROJECTOR_TYPE_PHI4) {
88
learned_pos_embd = resize_position_embeddings();
99
}
1010

@@ -75,6 +75,14 @@ ggml_cgraph * clip_graph_siglip::build() {
7575
hparams.ffn_op,
7676
-1);
7777

78+
} else if (proj_type == PROJECTOR_TYPE_PHI4) {
79+
cur = build_ffn(cur,
80+
model.mm_0_w, model.mm_0_b,
81+
nullptr, nullptr,
82+
model.mm_2_w, model.mm_2_b,
83+
FFN_GELU,
84+
-1);
85+
7886
} else {
7987
GGML_ABORT("SigLIP: Unsupported projector type");
8088
}

tools/mtmd/mtmd.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,9 @@ struct mtmd_context {
290290
img_beg = "<|vision_start|>";
291291
img_end = "<|vision_end|>";
292292

293+
} else if (proj == PROJECTOR_TYPE_PHI4) {
294+
// Phi-4 uses media marker insertion only. Keep image boundary text empty.
295+
293296
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
294297
// (more details in mtmd_context constructor)
295298
img_beg = "<|image_start|>";

0 commit comments

Comments
 (0)