@@ -583,25 +583,24 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
583
583
mlp_1 = ggml_gelu (ctx0, mlp_1);
584
584
struct ggml_tensor * mlp_3 = ggml_mul_mat (ctx0, model.mm_model_mlp_3_w , mlp_1);
585
585
mlp_3 = ggml_add (ctx0, mlp_3, model.mm_model_mlp_3_b );
586
- // transpose from [1, 576, 2048] --> [1, 24, 24, 2048] --> [1, 2048, 24, 24]
587
- mlp_3 = ggml_reshape_4d (ctx0, mlp_3, mlp_3->ne [0 ], n_patch, n_patch, mlp_3->ne [3 ]);
588
- // permute logic is src idxs 0,1,2,3 perm to dst idxs
589
- mlp_3 = ggml_permute_cpy (ctx0, mlp_3, 2 , 0 , 1 , 3 );
590
- // mlp_3 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
586
+ // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
591
587
592
588
// block 1
593
589
struct ggml_tensor * block_1 = nullptr ;
594
590
{
591
+ // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
592
+ mlp_3 = ggml_cont (ctx0, ggml_permute (ctx0, mlp_3, 1 , 0 , 2 , 3 ));
593
+ mlp_3 = ggml_reshape_4d (ctx0, mlp_3, n_patch, n_patch, mlp_3->ne [1 ], mlp_3->ne [2 ]);
595
594
// stride = 1, padding = 1, bias is nullptr
596
595
block_1 = ggml_conv_depthwise_2d (ctx0, model.mm_model_block_1_block_0_0_w , mlp_3, nullptr , 1 , 1 , 1 , 1 , 1 , 1 );
597
596
598
597
// layer norm
599
598
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
600
- block_1 = ggml_permute_cpy (ctx0, block_1, 1 , 2 , 0 , 3 );
599
+ block_1 = ggml_cont (ctx0, ggml_permute (ctx0, block_1, 1 , 2 , 0 , 3 ) );
601
600
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
602
601
block_1 = ggml_norm (ctx0, block_1, eps);
603
602
block_1 = ggml_add (ctx0, ggml_mul (ctx0, block_1, model.mm_model_block_1_block_0_1_w ), model.mm_model_block_1_block_0_1_b );
604
- block_1 = ggml_permute_cpy (ctx0, block_1, 2 , 0 , 1 , 3 );
603
+ block_1 = ggml_cont (ctx0, ggml_permute (ctx0, block_1, 2 , 0 , 1 , 3 ) );
605
604
606
605
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
607
606
// hardswish
@@ -621,17 +620,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
621
620
block_1 = ggml_reshape_4d (ctx0, block_1, 1 , 1 , block_1->ne [0 ], block_1->ne [1 ]);
622
621
block_1 = ggml_mul (ctx0, block_1_hw, block_1);
623
622
624
- // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
625
- struct ggml_tensor * block_2_0_w_4d = ggml_reshape_4d (ctx0, model.mm_model_block_1_block_2_0_w , 1 , 1 ,
626
- model.mm_model_block_1_block_2_0_w ->ne [0 ], model.mm_model_block_1_block_2_0_w ->ne [1 ]);
627
- block_1 = ggml_conv_2d (ctx0, block_2_0_w_4d, block_1, 1 , 1 , 0 , 0 , 1 , 1 );
623
+ int w = block_1->ne [0 ], h = block_1->ne [1 ];
624
+ block_1 = ggml_reshape_3d (ctx0, block_1, w*h, block_1->ne [2 ], block_1->ne [3 ]);
625
+ block_1 = ggml_cont (ctx0, ggml_permute (ctx0, block_1, 1 , 0 , 2 , 3 ));
626
+
627
+ // block_1 shape = [1, 24*24, 2048], ne = [2048, 24*24, 1]
628
+ block_1 = ggml_mul_mat (ctx0, model.mm_model_block_1_block_2_0_w , block_1);
629
+ block_1 = ggml_reshape_4d (ctx0, block_1, block_1->ne [0 ], w, h, block_1->ne [3 ]);
628
630
629
- // layernorm
630
- block_1 = ggml_permute_cpy (ctx0, block_1, 1 , 2 , 0 , 3 );
631
631
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
632
632
block_1 = ggml_norm (ctx0, block_1, eps);
633
633
block_1 = ggml_add (ctx0, ggml_mul (ctx0, block_1, model.mm_model_block_1_block_2_1_w ), model.mm_model_block_1_block_2_1_b );
634
- block_1 = ggml_permute_cpy (ctx0, block_1, 2 , 0 , 1 , 3 );
634
+ block_1 = ggml_cont (ctx0, ggml_permute (ctx0, block_1, 2 , 0 , 1 , 3 ) );
635
635
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
636
636
// residual
637
637
block_1 = ggml_add (ctx0, mlp_3, block_1);
@@ -644,11 +644,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
644
644
645
645
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
646
646
// layer norm
647
- block_1 = ggml_permute_cpy (ctx0, block_1, 1 , 2 , 0 , 3 );
647
+ block_1 = ggml_cont (ctx0, ggml_permute (ctx0, block_1, 1 , 2 , 0 , 3 ) );
648
648
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
649
649
block_1 = ggml_norm (ctx0, block_1, eps);
650
650
block_1 = ggml_add (ctx0, ggml_mul (ctx0, block_1, model.mm_model_block_2_block_0_1_w ), model.mm_model_block_2_block_0_1_b );
651
- block_1 = ggml_permute_cpy (ctx0, block_1, 2 , 0 , 1 , 3 );
651
+ block_1 = ggml_cont (ctx0, ggml_permute (ctx0, block_1, 2 , 0 , 1 , 3 ) );
652
652
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
653
653
// hardswish
654
654
struct ggml_tensor * block_1_hw = ggml_hardswish (ctx0, block_1);
@@ -664,22 +664,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
664
664
block_1 = ggml_mul_mat (ctx0, model.mm_model_block_2_block_1_fc2_w , block_1);
665
665
block_1 = ggml_add (ctx0, block_1, model.mm_model_block_2_block_1_fc2_b );
666
666
block_1 = ggml_hardsigmoid (ctx0, block_1);
667
-
667
+
668
668
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
669
669
block_1 = ggml_reshape_4d (ctx0, block_1, 1 , 1 , block_1->ne [0 ], block_1->ne [1 ]);
670
670
block_1 = ggml_mul (ctx0, block_1_hw, block_1);
671
- // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
672
- struct ggml_tensor * block_2_0_w_4d = ggml_reshape_4d (ctx0, model.mm_model_block_2_block_2_0_w , 1 , 1 ,
673
- model.mm_model_block_2_block_2_0_w ->ne [0 ], model.mm_model_block_1_block_2_0_w ->ne [1 ]);
674
- block_1 = ggml_conv_2d (ctx0, block_2_0_w_4d, block_1, 1 , 1 , 0 , 0 , 1 , 1 );
675
- // layernorm
676
- block_1 = ggml_permute_cpy (ctx0, block_1, 1 , 2 , 0 , 3 );
671
+
672
+ int w = block_1->ne [0 ], h = block_1->ne [1 ];
673
+ block_1 = ggml_reshape_3d (ctx0, block_1, w*h, block_1->ne [2 ], block_1->ne [3 ]);
674
+ block_1 = ggml_cont (ctx0, ggml_permute (ctx0, block_1, 1 , 0 , 2 , 3 ));
675
+ // block_1 shape = [1, 12*12, 2048], ne = [2048, 12*12, 1]
676
+ block_1 = ggml_mul_mat (ctx0, model.mm_model_block_2_block_2_0_w , block_1);
677
+ block_1 = ggml_reshape_4d (ctx0, block_1, block_1->ne [0 ], w, h, block_1->ne [3 ]);
678
+
679
+
677
680
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
678
681
block_1 = ggml_norm (ctx0, block_1, eps);
679
682
block_1 = ggml_add (ctx0, ggml_mul (ctx0, block_1, model.mm_model_block_2_block_2_1_w ), model.mm_model_block_2_block_2_1_b );
680
683
block_1 = ggml_reshape_3d (ctx0, block_1, block_1->ne [0 ], block_1->ne [1 ] * block_1->ne [2 ], block_1->ne [3 ]);
681
684
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
682
- }
685
+ }
683
686
embeddings = block_1;
684
687
}
685
688
else {
0 commit comments