@@ -554,15 +554,15 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
 }

 // implementation of the 2D RoPE without adding a new op in ggml
+// this is not efficient (uses double the memory), but works on all backends
+// TODO: there was a more efficient implementation which relies on ggml_view and ggml_rope_ext_inplace, but the in-place rope does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
 static ggml_tensor * build_rope_2d(
-    ggml_cgraph * gf,
     ggml_context * ctx0,
     ggml_tensor * cur,
     ggml_tensor * pos_h,
     ggml_tensor * pos_w,
     const float freq_base
 ) {
-    ggml_tensor * tmp;
     const int64_t n_dim  = cur->ne[0];
     const int64_t n_head = cur->ne[1];
     const int64_t n_pos  = cur->ne[2];
@@ -571,18 +571,23 @@ static ggml_tensor * build_rope_2d(
     // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
     // first half of cur will use 1e-0, 1e-2 (even)
     // second half of cur will use 1e-1, 1e-3 (odd)
-    //
-    // for the first half, the trick here is to rotate n_dim/2, so inv_freq will be even
+    // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
     //   ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
     // then for the second half, we use freq_scale to shift the inv_freq
     //   ^ why? replace (2i) with (2i+1) in the above equation
     const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);

     // first half
+    ggml_tensor * first;
     {
-        cur = ggml_rope_ext_inplace(
+        first = ggml_view_3d(ctx0, cur,
+            n_dim/2, n_head, n_pos,
+            ggml_row_size(cur->type, n_dim),
+            ggml_row_size(cur->type, n_dim*n_head),
+            0);
+        first = ggml_rope_ext(
             ctx0,
-            cur,
+            first,
             pos_h,      // positions
             nullptr,    // freq factors
             n_dim/2,    // n_dims
@@ -592,26 +597,27 @@ static ggml_tensor * build_rope_2d(
     }

     // second half
+    ggml_tensor * second;
     {
-        tmp = ggml_view_3d(ctx0, cur,
+        second = ggml_view_3d(ctx0, cur,
             n_dim/2, n_head, n_pos,
             ggml_row_size(cur->type, n_dim),
             ggml_row_size(cur->type, n_dim*n_head),
             n_dim/2 * ggml_element_size(cur));
-        tmp = ggml_rope_ext_inplace(
+        second = ggml_cont(ctx0, second); // copy, because ggml_rope does not play well with non-contiguous tensors
+        second = ggml_rope_ext(
             ctx0,
-            tmp,
+            second,
             pos_w,      // positions
             nullptr,    // freq factors
             n_dim/2,    // n_dims
             0, 0, freq_base,
             freq_scale_odd,
             0.0f, 1.0f, 0.0f, 0.0f
         );
-        // calculate inplace (modify cur directly)
-        ggml_build_forward_expand(gf, tmp);
     }

+    cur = ggml_concat(ctx0, first, second, 0);
     return cur;
 }

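Aside (not part of the patch): a standalone sketch of the frequency trick described in the comments above, under the assumption that RoPE rotates dimension pair i with inv_freq_i = freq_base^(-2i/n_dim). Running rope over only n_dim/2 dims reproduces exactly the even-indexed frequencies, and scaling by freq_scale_odd = freq_base^(-2/n_dim) shifts them onto the odd-indexed ones:

#include <cmath>
#include <cstdio>

int main() {
    const int   n_dim     = 8;        // toy head size
    const float freq_base = 10000.0f;
    // same expression as in build_rope_2d
    const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);

    for (int j = 0; j < n_dim/4; j++) {
        // full rope over n_dim dims: even/odd frequency indices i = 2j and i = 2j+1
        float even_full = std::pow(freq_base, (float)(-2*(2*j  ))/n_dim);
        float odd_full  = std::pow(freq_base, (float)(-2*(2*j+1))/n_dim);
        // rope over only n_dim/2 dims: frequency index j
        float half_rope = std::pow(freq_base, (float)(-2*j)/(n_dim/2));
        printf("j=%d  even: %g == %g | odd: %g == %g\n",
            j, even_full, half_rope, odd_full, freq_scale_odd * half_rope);
    }
    return 0;
}

Separately, this hunk explains why the gf parameter could be dropped: the two rotated halves now feed ggml_concat, whose result is returned and consumed by the callers, so nothing has to be forced into the graph with ggml_build_forward_expand anymore. The Pixtral call sites in the next hunk drop their first argument accordingly.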
@@ -680,13 +686,13 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
     struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur);

     Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
-    Q = build_rope_2d(gf, ctx0, Q, pos_h, pos_w, hparams.rope_theta);
+    Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta);
     Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));

     struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur);

     K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
-    K = build_rope_2d(gf, ctx0, K, pos_h, pos_w, hparams.rope_theta);
+    K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta);
     K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));

     struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur);
@@ -2796,10 +2802,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;

+    // TODO @ngxson : this is ugly, need to refactor later
+    bool support_dynamic_size = ctx->has_minicpmv_projector
+        || ctx->has_qwen2vl_merger
+        || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
+
     const int image_size = hparams.image_size;
     int image_size_width  = image_size;
     int image_size_height = image_size;
-    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
+    if (support_dynamic_size) {
         image_size_width  = imgs.entries[0]->nx;
         image_size_height = imgs.entries[0]->ny;
     }
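Two notes on this hunk. First, the old condition combined the two bools with the bitwise |; the new support_dynamic_size flag uses the intended logical ||. Second, the TODO hints at a cleanup; one possible shape for it (a hypothetical helper, not in the patch, reusing only names that appear above) would centralize the check so new dynamic-size projectors are registered in one place:

// hypothetical refactor sketch: one predicate instead of a scattered condition
static bool clip_supports_dynamic_size(const clip_ctx * ctx) {
    return ctx->has_minicpmv_projector
        || ctx->has_qwen2vl_merger
        || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
}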
@@ -2811,9 +2822,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
-        float * data = (float *)malloc(ggml_nbytes(inp_raw));
+        std::vector<float> inp_data(ggml_nelements(inp_raw));
+        float * data = inp_data.data();
+
+        // layout of data (note: the channel dim is unrolled to better visualize the layout):
+        //
+        // ┌──W──┐
+        // │  H  │ channel = R
+        // ├─────┤     │
+        // │  H  │ channel = G
+        // ├─────┤     │
+        // │  H  │ channel = B
+        // └─────┘     │
+        //    ─────────┘ x B

-        // TODO @ngxson : this whole code block is ugly, will need to be refactored
         for (size_t i = 0; i < imgs.entries.size(); i++) {
             const int nx = imgs.entries[i]->nx;
             const int ny = imgs.entries[i]->ny;
@@ -2828,17 +2850,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             const int n = nx * ny;

             for (int b = 0; b < batch_size; b++) {
-                for (int k = 0; k < 3; k++) {
-                    for (int y = 0; y < ny; y++) {
-                        for (int x = 0; x < nx; x++) {
-                            data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k];
-                        }
+                float * batch_entry = data + b * (3*n);
+                for (int y = 0; y < ny; y++) {
+                    for (int x = 0; x < nx; x++) {
+                        size_t base_src = 3*(y * nx + x); // idx of the first channel in src (interleaved)
+                        size_t base_dst = y * nx + x;     // idx of the first channel in dst (planar)
+                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
+                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
+                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
                     }
                 }
             }
         }
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
-        free(data);
     }
     if (ctx->has_minicpmv_projector) {
         {
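For reference, the copy loop above writes each image in planar CHW order (one full plane per channel, matching the layout diagram), while the source buffers hold interleaved HWC triplets. A minimal standalone sketch of the same conversion, assuming a float RGB source buffer like imgs.entries[b]->buf:

#include <cstddef>

// interleaved HWC (RGBRGB...) -> planar CHW (RRR...GGG...BBB...), as in the
// batch copy loop above; src and dst both hold 3 * nx * ny floats
static void rgb_interleaved_to_planar(const float * src, float * dst, int nx, int ny) {
    const size_t n = (size_t)nx * ny; // pixels per channel plane
    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            const size_t base_src = 3 * ((size_t)y * nx + x); // first channel in src
            const size_t base_dst = (size_t)y * nx + x;       // position within a plane
            dst[      base_dst] = src[base_src    ]; // R plane
            dst[1*n + base_dst] = src[base_src + 1]; // G plane
            dst[2*n + base_dst] = src[base_src + 2]; // B plane
        }
    }
}

Each batch entry b then starts at offset b * 3 * n in the destination buffer, which is exactly what the batch_entry pointer in the loop computes.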