Commit edb18b6

clip : fix pixtral on some GPU backends (#13097)
* clip : fix pixtral on some GPU backends
* refactor inp_raw set
* rm outdated comment
* fix dynamic size
* add TODO
1 parent 514c456 commit edb18b6

2 files changed: +48 -22 lines changed

examples/llava/clip.cpp

Lines changed: 46 additions & 22 deletions
@@ -554,15 +554,15 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
 }
 
 // implementation of the 2D RoPE without adding a new op in ggml
+// this is not efficient (use double the memory), but works on all backends
+// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
 static ggml_tensor * build_rope_2d(
-    ggml_cgraph * gf,
     ggml_context * ctx0,
     ggml_tensor * cur,
     ggml_tensor * pos_h,
     ggml_tensor * pos_w,
     const float freq_base
 ) {
-    ggml_tensor * tmp;
     const int64_t n_dim  = cur->ne[0];
     const int64_t n_head = cur->ne[1];
     const int64_t n_pos  = cur->ne[2];
@@ -571,18 +571,23 @@ static ggml_tensor * build_rope_2d(
     // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
     // first half of cur will use 1e-0, 1e-2 (even)
     // second half of cur will use 1e-1, 1e-3 (odd)
-    //
-    // for the first half, the trick here is to rotate n_dim/2, so inv_freq will be even
+    // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
     //   ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
     // then for the second half, we use freq_scale to shift the inv_freq
     //   ^ why? replace (2i) with (2i+1) in the above equation
     const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
 
     // first half
+    ggml_tensor * first;
     {
-        cur = ggml_rope_ext_inplace(
+        first = ggml_view_3d(ctx0, cur,
+            n_dim/2, n_head, n_pos,
+            ggml_row_size(cur->type, n_dim),
+            ggml_row_size(cur->type, n_dim*n_head),
+            0);
+        first = ggml_rope_ext(
             ctx0,
-            cur,
+            first,
             pos_h,      // positions
             nullptr,    // freq factors
             n_dim/2,    // n_dims
@@ -592,26 +597,27 @@ static ggml_tensor * build_rope_2d(
     }
 
     // second half
+    ggml_tensor * second;
     {
-        tmp = ggml_view_3d(ctx0, cur,
+        second = ggml_view_3d(ctx0, cur,
             n_dim/2, n_head, n_pos,
             ggml_row_size(cur->type, n_dim),
             ggml_row_size(cur->type, n_dim*n_head),
             n_dim/2 * ggml_element_size(cur));
-        tmp = ggml_rope_ext_inplace(
+        second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
+        second = ggml_rope_ext(
             ctx0,
-            tmp,
+            second,
             pos_w,      // positions
             nullptr,    // freq factors
             n_dim/2,    // n_dims
             0, 0, freq_base,
             freq_scale_odd,
             0.0f, 1.0f, 0.0f, 0.0f
         );
-        // calculate inplace (modify cur directly)
-        ggml_build_forward_expand(gf, tmp);
     }
 
+    cur = ggml_concat(ctx0, first, second, 0);
     return cur;
 }
 

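Note on the build_rope_2d hunks above: the old code rotated the second half of cur in place through a non-contiguous ggml_view, and ggml_rope_ext_inplace mishandles that on some GPU backends. The new code rotates two explicit halves (the second one copied to a contiguous tensor with ggml_cont) and joins them with ggml_concat, trading extra memory for portability, as the new comment says. The frequency trick itself is unchanged: a standard RoPE over n_dim/2 dims gives pair j the inverse frequency freq_base^(-2j/(n_dim/2)) = freq_base^(-2(2j)/n_dim), which is exactly the even-indexed frequency of the full n_dim rotation, and multiplying by freq_scale_odd = freq_base^(-2/n_dim) shifts each exponent by one step to the odd-indexed frequency. The sketch below is not part of the commit; it is a standalone C++ check of that identity, using a hypothetical n_dim = 8 and freq_base = 10000 so it reproduces the 1e-0/1e-2 (even) and 1e-1/1e-3 (odd) values mentioned in the comment.

    // Standalone check of the frequency identity behind freq_scale_odd.
    // Not from the commit; n_dim and freq_base are example values only.
    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_dim     = 8;        // hypothetical head dimension
        const float freq_base = 10000.0f; // example base (the usual RoPE default)
        const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);

        for (int j = 0; j < n_dim/4; j++) {
            // inverse frequencies of a full n_dim rotation (even and odd pair indices)
            const float even_full = std::pow(freq_base, (float)(-2*(2*j  )) / n_dim);
            const float odd_full  = std::pow(freq_base, (float)(-2*(2*j+1)) / n_dim);
            // inverse frequencies of a half-width (n_dim/2) rotation, plain and scaled
            const float half        = std::pow(freq_base, (float)(-2*j) / (n_dim/2));
            const float half_scaled = half * freq_scale_odd;
            std::printf("j=%d  even: %g == %g  odd: %g == %g\n",
                        j, even_full, half, odd_full, half_scaled);
        }
        return 0;
    }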
@@ -680,13 +686,13 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
         struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur);
 
         Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
-        Q = build_rope_2d(gf, ctx0, Q, pos_h, pos_w, hparams.rope_theta);
+        Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta);
         Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
 
         struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur);
 
         K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
-        K = build_rope_2d(gf, ctx0, K, pos_h, pos_w, hparams.rope_theta);
+        K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta);
         K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
 
         struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur);
@@ -2796,10 +2802,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
+    // TODO @ngxson : this is ugly, need to refactor later
+    bool support_dynamic_size = ctx->has_minicpmv_projector
+        || ctx->has_qwen2vl_merger
+        || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
+
     const int image_size = hparams.image_size;
     int image_size_width  = image_size;
     int image_size_height = image_size;
-    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
+    if (support_dynamic_size) {
         image_size_width  = imgs.entries[0]->nx;
         image_size_height = imgs.entries[0]->ny;
     }
@@ -2811,9 +2822,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 
     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
-        float * data = (float *)malloc(ggml_nbytes(inp_raw));
+        std::vector<float> inp_data(ggml_nelements(inp_raw));
+        float * data = inp_data.data();
+
+        // layout of data (note: the channel dim is unrolled to better visualize the layout):
+        //
+        // ┌──W──┐
+        // │     H │  channel = R
+        // ├─────┤   │
+        // │     H │  channel = G
+        // ├─────┤   │
+        // │     H │  channel = B
+        // └─────┘   │
+        //   ──────┘ x B
 
-        // TODO @ngxson : this whole code block is ugly, will need to be refactored
         for (size_t i = 0; i < imgs.entries.size(); i++) {
             const int nx = imgs.entries[i]->nx;
             const int ny = imgs.entries[i]->ny;
@@ -2828,17 +2850,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             const int n = nx * ny;
 
             for (int b = 0; b < batch_size; b++) {
-                for (int k = 0; k < 3; k++) {
-                    for (int y = 0; y < ny; y++) {
-                        for (int x = 0; x < nx; x++) {
-                            data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k];
-                        }
+                float * batch_entry = data + b * (3*n);
+                for (int y = 0; y < ny; y++) {
+                    for (int x = 0; x < nx; x++) {
+                        size_t base_src = 3*(y * nx + x); // idx of the first channel
+                        size_t base_dst =    y * nx + x;  // idx of the first channel
+                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
+                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
+                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
                     }
                 }
             }
         }
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
-        free(data);
     }
     if (ctx->has_minicpmv_projector) {
         {

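Note on the inp_raw block above: each image in imgs.entries stores its pixels with channels interleaved (R,G,B per pixel, as the old indexing buf[3 * (y * nx + x) + k] shows), while the inp_raw tensor expects the planar layout drawn in the new comment: the full R plane, then G, then B, repeated per batch entry. The rewritten loop copies one pixel into the three planes of its batch entry, and the malloc/free pair is replaced by a std::vector. The sketch below is not from the commit; it is a standalone C++ illustration of the same interleaved-to-planar repacking on a hypothetical 2x2 image.

    // Standalone sketch of the interleaved -> planar repacking done for inp_raw.
    // Not from the commit; image size and pixel values are made up for illustration.
    #include <cstdio>
    #include <vector>

    int main() {
        const int nx = 2, ny = 2;   // hypothetical 2x2 image
        const int n  = nx * ny;

        // interleaved layout: R,G,B of pixel 0, then R,G,B of pixel 1, ...
        const std::vector<float> interleaved = {
            0.0f, 0.1f, 0.2f,   1.0f, 1.1f, 1.2f,
            2.0f, 2.1f, 2.2f,   3.0f, 3.1f, 3.2f,
        };

        // planar layout: the whole R plane, then the G plane, then the B plane
        std::vector<float> planar(3 * n);
        for (int y = 0; y < ny; y++) {
            for (int x = 0; x < nx; x++) {
                const int base_src = 3 * (y * nx + x); // first channel of this pixel
                const int base_dst =      y * nx + x;
                planar[0*n + base_dst] = interleaved[base_src    ]; // R plane
                planar[1*n + base_dst] = interleaved[base_src + 1]; // G plane
                planar[2*n + base_dst] = interleaved[base_src + 2]; // B plane
            }
        }

        for (float v : planar) std::printf("%.1f ", v);
        std::printf("\n"); // 0.0 1.0 2.0 3.0  0.1 1.1 2.1 3.1  0.2 1.2 2.2 3.2
        return 0;
    }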
tests/test-backend-ops.cpp

Lines changed: 2 additions & 0 deletions
@@ -2606,6 +2606,8 @@ struct test_rope : public test_case {
             } else {
                 out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
             }
+
+        // TODO: add test with a non-contiguous view as input ; this case is needed for build_rope_2d in clip.cpp
         }
         ggml_set_name(out, "out");
 
26112613
