@@ -592,6 +592,10 @@ struct clip_graph {
592
592
const int n_pos = n_patches;
593
593
const int num_position_ids = n_pos * 4 ; // m-rope requires 4 dim per position
594
594
595
+ norm_type norm_t = ctx->proj_type == PROJECTOR_TYPE_QWEN25VL
596
+ ? NORM_TYPE_RMS // qwen 2.5 vl
597
+ : NORM_TYPE_NORMAL; // qwen 2 vl
598
+
595
599
int mrope_sections[4 ] = {d_head/4 , d_head/4 , d_head/4 , d_head/4 };
596
600
597
601
ggml_tensor * inp_raw = build_inp_raw ();
@@ -633,7 +637,7 @@ struct clip_graph {
633
637
634
638
// pre-layernorm
635
639
if (model.pre_ln_w ) {
636
- inpL = build_norm (inpL, model.pre_ln_w , model.pre_ln_b , NORM_TYPE_RMS , eps, -1 );
640
+ inpL = build_norm (inpL, model.pre_ln_w , model.pre_ln_b , norm_t , eps, -1 );
637
641
}
638
642
639
643
if (use_window_attn) {
@@ -710,7 +714,7 @@ struct clip_graph {
710
714
cb (cur, " ffn_inp" , il);
711
715
712
716
// layernorm2
713
- cur = build_norm (cur, layer.ln_2_w , layer.ln_2_b , NORM_TYPE_RMS , eps, il);
717
+ cur = build_norm (cur, layer.ln_2_w , layer.ln_2_b , norm_t , eps, il);
714
718
cb (cur, " ffn_inp_normed" , il);
715
719
716
720
// ffn
@@ -731,7 +735,7 @@ struct clip_graph {
731
735
732
736
// post-layernorm
733
737
if (model.post_ln_w ) {
734
- inpL = build_norm (inpL, model.post_ln_w , model.post_ln_b , NORM_TYPE_RMS , eps, n_layer);
738
+ inpL = build_norm (inpL, model.post_ln_w , model.post_ln_b , norm_t , eps, n_layer);
735
739
}
736
740
737
741
// multimodal projection
0 commit comments