From c37aa34a9d89b3d272e14973fdeba3989c1e725e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 29 Jun 2023 23:11:55 +0300 Subject: [PATCH 01/18] WIP: Implement batch inference --- clip.cpp | 310 +++++++++++++++++++++++++++++++++++++++++++++- clip.h | 7 +- examples/main.cpp | 8 +- ggml | 2 +- 4 files changed, 319 insertions(+), 8 deletions(-) diff --git a/clip.cpp b/clip.cpp index b917270..b722abe 100644 --- a/clip.cpp +++ b/clip.cpp @@ -11,7 +11,7 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" -// #define CLIP_DEBUG +#define CLIP_DEBUG // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved // after that, remove this and use the mechanism implemented in GGML directly @@ -23,7 +23,7 @@ size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions) case 397: // base if (n_image_positions == 50) // patch size = 32 { - return 8 * mb; + return 16 * mb; } else // patch size = 16 { @@ -1236,7 +1236,6 @@ bool clip_image_encode( // residual 2 cur = ggml_add(ctx0, embeddings, cur); - // ggml_set_name(cur, "check"); embeddings = cur; } @@ -1477,3 +1476,308 @@ bool image_normalize(clip_image_u8 *img, clip_image_f32 *res) } return true; } + +bool clip_image_batch_encode( + const clip_ctx *ctx, + int n_threads, + const std::vector<clip_image_f32> &imgs, + float *vec) +{ + const auto &model = ctx->vision_model; + const auto &hparams = model.hparams; + + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_positions = num_patches + 1; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const int n_intermediate = hparams.n_intermediate; + const int projection_dim = hparams.projection_dim; + const int batch_size = imgs.size(); + + auto &buf_compute = ctx->buf_compute; + + struct ggml_init_params params = { + .mem_size = buf_compute.size, + .mem_buffer = buf_compute.data, + .no_alloc = false, + }; + + struct ggml_context *ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; + + static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions); + static void *scr0 = malloc(scr0_size); + + struct ggml_tensor *inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); + + { + float *data = (float *)ggml_get_data(inp); + + const int nx = imgs[0].nx; + const int ny = imgs[0].ny; + const int n = nx * ny; + + GGML_ASSERT(nx == image_size && ny == image_size); + + for (int b = 0; b < batch_size; b++) + { + for (int k = 0; k < 3; k++) + { + for (int y = 0; y < ny; y++) + { + for (int x = 0; x < nx; x++) + { + data[(b * k * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; + } + } + } + } + } + + inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp); + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + // concat class_embeddings and patch_embeddings + struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + /* + ggml_set_zero(embeddings); + for (int b = 0; b < batch_size; b++) + { + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1] / batch_size, 
embeddings->nb[2] / batch_size, embeddings->nb[3] / batch_size, b * (ggml_nbytes(embeddings) / batch_size)); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1] / batch_size, embeddings->nb[2] / batch_size, embeddings->nb[3] / batch_size, ggml_element_size(model.class_embedding) * hidden_size); + } + */ + + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); + + struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); + for (int i = 0; i < num_positions; i++) + { + ggml_set_i32_1d(positions, i, i); + } + + embeddings = ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings)); + + // pre-layernorm + { + embeddings = ggml_norm(ctx0, embeddings); + + embeddings = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.pre_ln_w, embeddings), + embeddings), + ggml_repeat(ctx0, model.pre_ln_b, embeddings)); + } + + struct ggml_tensor *temp_w = ggml_new_tensor_4d(ctx0, model.layers[0].q_w->type, hidden_size, hidden_size, batch_size, 1); + struct ggml_tensor *temp_i = ggml_new_tensor_4d(ctx0, model.layers[0].ff_i_w->type, hidden_size, n_intermediate, batch_size, 1); + struct ggml_tensor *temp_o = ggml_new_tensor_4d(ctx0, model.layers[0].ff_o_w->type, n_intermediate, hidden_size, batch_size, 1); + + // loop over layers + for (int il = 0; il < n_layer; il++) + { + struct ggml_tensor *cur = embeddings; // embeddings = residual, cur = hidden_states + + const size_t nb_q_w = model.layers[il].q_w->nb[0]; + + ggml_set_scratch(ctx0, {0, scr0_size, scr0}); + + // layernorm1 + { + cur = ggml_norm(ctx0, cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // self-attention + { + + struct ggml_tensor *Q = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), + ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].q_w, temp_w), + cur)); + + Q = ggml_scale_inplace(ctx0, Q, ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))); + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor *K = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), + ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].k_w, temp_w), + cur)); + + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor *V = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), + ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].v_w, temp_w), + cur)); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + + struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); + KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); + + 
cur = ggml_cpy(ctx0, + KQV, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size)); + } + + // attention output + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].o_b, cur), + ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].o_w, temp_w), + cur)); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // layernorm2 + { + cur = ggml_norm(ctx0, cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + } + + cur = ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_w, temp_i), cur); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), + cur); + + if (ctx->use_gelu) + { + cur = ggml_gelu_inplace(ctx0, cur); + } + else + { + cur = ggml_gelu_quick_inplace(ctx0, cur); + } + + cur = ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_w, temp_o), cur); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), + cur); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // get the output of cls token, e.g., 0th index + struct ggml_tensor *cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch_size); + for (int b = 0; b < batch_size; b++) + { + ggml_set_i32_1d(cls, b, b * num_positions); + } + embeddings = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, embeddings, hidden_size, num_positions * batch_size), cls); + + // post-layernorm + { + embeddings = ggml_norm(ctx0, embeddings); + + embeddings = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.post_ln_w, embeddings), + embeddings), + ggml_repeat(ctx0, model.post_ln_b, embeddings)); + } + + ggml_set_scratch(ctx0, {0, 0, nullptr}); + + // final visual projection + embeddings = ggml_mul_mat(ctx0, model.projection, embeddings); + + // normalize output embeddings + ggml_tensor *length = ggml_sqrt(ctx0, + ggml_sum(ctx0, ggml_sqr(ctx0, embeddings))); + embeddings = ggml_scale_inplace(ctx0, embeddings, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + + ggml_set_name(embeddings, "check"); + + // run the computation + ggml_build_forward_expand(&gf, embeddings); + ggml_graph_compute(ctx0, &gf); + +// print +#ifdef CLIP_DEBUG + { + auto print_t_f32 = [&](struct ggml_tensor *t) + { + float *data = (float *)t->data; + printf("dtype: f32, dims: %jd %jd %jd %jd, nb: %jd %jd %jd %jd\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3], t->nb[0], t->nb[1], t->nb[2], t->nb[3]); + printf("data: "); + for (int i = 0; i < std::min((int)t->ne[0], 20); i++) + { + printf("%f ", data[i]); + } + + // printf("\n\n"); + double sum = 0.0; + for (int i = 0; i < ggml_nelements(t); i++) + { + sum += data[i]; + } + printf("sum: %f\n", sum); + }; + + auto print_t_f16 = [&](struct ggml_tensor *t) + { + ggml_fp16_t *data = (ggml_fp16_t *)t->data; + printf("dtype: f16, dims: %jd %jd %jd %jd\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3]); + printf("data: "); + for (int i = 0; i < std::min((int)t->ne[0], 10); i++) + { + printf("%f ", ggml_fp16_to_fp32(data[i])); + } + printf("\n\n"); + double sum = 0.0; + for (int i = 0; i < ggml_nelements(t); i++) + { + sum += ggml_fp16_to_fp32(data[i]); + } + printf("sum: %f\n", sum); + }; + + auto *t = ggml_get_tensor(ctx0, "check"); + if (t->type == GGML_TYPE_F32) + { + print_t_f32(t); + } + else + { + print_t_f16(t); + } + } + + printf("used_mem = %zu\n", ggml_used_mem(ctx0)); +#endif + + memcpy(vec, 
ggml_get_data_f32(embeddings), sizeof(float) * projection_dim); + + ggml_free(ctx0); + + return true; +} diff --git a/clip.h b/clip.h index 7978019..9aa8360 100644 --- a/clip.h +++ b/clip.h @@ -219,8 +219,11 @@ bool clip_compare_text_and_image(clip_ctx *ctx, int n_threads, std::string &text float clip_similarity_score(float *vec1, float *vec2, int vec_dim); bool softmax_with_sorting(float *arr, int length, float *sorted_scores, int *indices); -// utils for debugging -void write_floats_to_file(float *array, int size, char *filename); +bool clip_image_batch_encode( + const clip_ctx *ctx, + int n_threads, + const std::vector<clip_image_f32> &imgs, + float *vec); // #ifdef __cplusplus // } diff --git a/examples/main.cpp b/examples/main.cpp index e38e047..920b356 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -54,10 +54,14 @@ int main(int argc, char **argv) clip_image_preprocess(ctx, &img0, &img_res); + std::vector<clip_image_f32> imgs; + imgs.push_back(img_res); + // imgs.push_back(img_res); + const int64_t t_image_encode_us = ggml_time_us(); - float img_vec[vec_dim]; - if (!clip_image_encode(ctx, params.n_threads, img_res, img_vec)) + float img_vec[vec_dim * 2]; + if (!clip_image_batch_encode(ctx, params.n_threads, imgs, img_vec)) { return 1; } diff --git a/ggml b/ggml index 93b94a2..d2b23a4 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 93b94a2d41e880cb2abfb708535d5b04ad05b7a5 +Subproject commit d2b23a4d628317e7ab5efbba8d22d178af381369 From 09dff96af030fb3301ac1fcf894ba0d6c584a680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 30 Jun 2023 10:40:14 +0300 Subject: [PATCH 02/18] WIP: use broadcastable mul_mat --- clip.cpp | 31 +++++++++++++------------------ ggml | 2 +- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/clip.cpp b/clip.cpp index b722abe..b7a793f 100644 --- a/clip.cpp +++ b/clip.cpp @@ -23,7 +23,7 @@ size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions) case 397: // base if (n_image_positions == 50) // patch size = 32 { - return 16 * mb; + return 32 * mb; } else // patch size = 16 { @@ -54,7 +54,7 @@ size_t get_scr_buf_req_by_size(const size_t n_tensors, const int n_positions) case 397: if (n_positions <= 50) { - return 16 * mb; + return 512 * mb; } else { @@ -1545,17 +1545,16 @@ bool clip_image_batch_encode( // concat class_embeddings and patch_embeddings struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - /* + ggml_set_zero(embeddings); for (int b = 0; b < batch_size; b++) { - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1] / batch_size, embeddings->nb[2] / batch_size, embeddings->nb[3] / batch_size, b * (ggml_nbytes(embeddings) / batch_size)); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1] / batch_size, embeddings->nb[2] / batch_size, embeddings->nb[3] / batch_size, ggml_element_size(model.class_embedding) * hidden_size); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3] / batch_size, 0); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3] / batch_size, ggml_element_size(model.class_embedding) * hidden_size); } - */ - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 
ggml_element_size(model.class_embedding) * hidden_size); + // embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + // embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); for (int i = 0; i < num_positions; i++) @@ -1576,10 +1575,6 @@ bool clip_image_batch_encode( ggml_repeat(ctx0, model.pre_ln_b, embeddings)); } - struct ggml_tensor *temp_w = ggml_new_tensor_4d(ctx0, model.layers[0].q_w->type, hidden_size, hidden_size, batch_size, 1); - struct ggml_tensor *temp_i = ggml_new_tensor_4d(ctx0, model.layers[0].ff_i_w->type, hidden_size, n_intermediate, batch_size, 1); - struct ggml_tensor *temp_o = ggml_new_tensor_4d(ctx0, model.layers[0].ff_o_w->type, n_intermediate, hidden_size, batch_size, 1); - // loop over layers for (int il = 0; il < n_layer; il++) { @@ -1604,7 +1599,7 @@ bool clip_image_batch_encode( { struct ggml_tensor *Q = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), - ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].q_w, temp_w), + ggml_mul_mat(ctx0, model.layers[il].q_w, cur)); Q = ggml_scale_inplace(ctx0, Q, ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))); @@ -1613,7 +1608,7 @@ bool clip_image_batch_encode( Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); struct ggml_tensor *K = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), - ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].k_w, temp_w), + ggml_mul_mat(ctx0, model.layers[il].k_w, cur)); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); @@ -1621,7 +1616,7 @@ bool clip_image_batch_encode( K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); struct ggml_tensor *V = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), - ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].v_w, temp_w), + ggml_mul_mat(ctx0, model.layers[il].v_w, cur)); V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); @@ -1642,7 +1637,7 @@ bool clip_image_batch_encode( // attention output cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), - ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].o_w, temp_w), + ggml_mul_mat(ctx0, model.layers[il].o_w, cur)); // re-add the layer input, e.g., residual @@ -1661,7 +1656,7 @@ bool clip_image_batch_encode( ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); } - cur = ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_w, temp_i), cur); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), cur); @@ -1675,7 +1670,7 @@ bool clip_image_batch_encode( cur = ggml_gelu_quick_inplace(ctx0, cur); } - cur = ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_w, temp_o), cur); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), cur); diff --git a/ggml b/ggml index d2b23a4..c16c9e5 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit d2b23a4d628317e7ab5efbba8d22d178af381369 +Subproject commit c16c9e56ec48a139fa179f3852f4c154b2272f26 From bfab116fd3b7e09081ea2b8e368d7b4e3fffc05e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 30 Jun 2023 15:00:27 +0300 Subject: [PATCH 03/18] Batched Conv2D is working --- clip.cpp | 22 
++++++++++++---------- examples/main.cpp | 2 +- ggml | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/clip.cpp b/clip.cpp index b7a793f..dccdef5 100644 --- a/clip.cpp +++ b/clip.cpp @@ -1513,10 +1513,10 @@ bool clip_image_batch_encode( static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions); static void *scr0 = malloc(scr0_size); - struct ggml_tensor *inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); + struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); { - float *data = (float *)ggml_get_data(inp); + float *data = (float *)ggml_get_data(inp_raw); const int nx = imgs[0].nx; const int ny = imgs[0].ny; @@ -1532,29 +1532,30 @@ bool clip_image_batch_encode( { for (int x = 0; x < nx; x++) { - data[(b * k * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; + data[(b * 3 * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; } } } } } - inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp); + struct ggml_tensor *inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp_raw); inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + ggml_set_name(inp, "check"); // concat class_embeddings and patch_embeddings struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); ggml_set_zero(embeddings); - for (int b = 0; b < batch_size; b++) + + // TODO: correct thisconcat op + // for (int b = 0; b < batch_size; b++) { - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3] / batch_size, 0); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3] / batch_size, ggml_element_size(model.class_embedding) * hidden_size); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); } - - // embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - // embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); + ggml_set_name(embeddings, "check"); struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); for (int i = 0; i < num_positions; i++) @@ -1757,6 +1758,7 @@ bool clip_image_batch_encode( }; auto *t = ggml_get_tensor(ctx0, "check"); + // auto t = inp_raw; if (t->type == GGML_TYPE_F32) { print_t_f32(t); diff --git a/examples/main.cpp b/examples/main.cpp index 920b356..51b1ae6 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -56,7 +56,7 @@ int main(int argc, char **argv) std::vector imgs; imgs.push_back(img_res); - // imgs.push_back(img_res); + imgs.push_back(img_res); const int64_t t_image_encode_us = ggml_time_us(); diff --git a/ggml b/ggml index c16c9e5..f967ae8 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit c16c9e56ec48a139fa179f3852f4c154b2272f26 +Subproject commit f967ae87c73dbc94201be22e346e35c51aacaa1b From c80a020a25881d3102e34ae88be6d4b6af93f0f2 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 30 Jun 2023 15:06:13 +0300 Subject: [PATCH 04/18] Batched Conv2D is working --- clip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clip.cpp b/clip.cpp index dccdef5..49bc8f0 100644 --- a/clip.cpp +++ b/clip.cpp @@ -1549,7 +1549,7 @@ bool clip_image_batch_encode( ggml_set_zero(embeddings); - // TODO: correct thisconcat op + // TODO: correct this concat op // for (int b = 0; b < batch_size; b++) { embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); From 502a6694f665b23a58c6c1edf8244ec6b95cddf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 30 Jun 2023 19:38:38 +0300 Subject: [PATCH 05/18] Fix concat --- clip.cpp | 19 +++++++------------ examples/main.cpp | 2 +- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/clip.cpp b/clip.cpp index 49bc8f0..e381942 100644 --- a/clip.cpp +++ b/clip.cpp @@ -1542,20 +1542,15 @@ bool clip_image_batch_encode( struct ggml_tensor *inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp_raw); inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - ggml_set_name(inp, "check"); // concat class_embeddings and patch_embeddings struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); ggml_set_zero(embeddings); + struct ggml_tensor *temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size); - // TODO: correct this concat op - // for (int b = 0; b < batch_size; b++) - { - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); - } - ggml_set_name(embeddings, "check"); + embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); for (int i = 0; i < num_positions; i++) @@ -1568,6 +1563,7 @@ bool clip_image_batch_encode( // pre-layernorm { embeddings = ggml_norm(ctx0, embeddings); + ggml_set_name(embeddings, "check"); embeddings = ggml_add(ctx0, ggml_mul(ctx0, @@ -1707,11 +1703,10 @@ bool clip_image_batch_encode( embeddings = ggml_mul_mat(ctx0, model.projection, embeddings); // normalize output embeddings + struct ggml_tensor *embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, 0)); ggml_tensor *length = ggml_sqrt(ctx0, - ggml_sum(ctx0, ggml_sqr(ctx0, embeddings))); - embeddings = ggml_scale_inplace(ctx0, embeddings, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); - - ggml_set_name(embeddings, "check"); + ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); + embeddings = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); // run the computation ggml_build_forward_expand(&gf, embeddings); diff --git a/examples/main.cpp b/examples/main.cpp index 51b1ae6..920b356 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -56,7 +56,7 @@ std::vector<clip_image_f32> imgs; imgs.push_back(img_res); - imgs.push_back(img_res); + // 
imgs.push_back(img_res); const int64_t t_image_encode_us = ggml_time_us(); From 734f6db4a0528b1810729f03b2ef3fd60bdcdf49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Sun, 2 Jul 2023 13:28:08 +0300 Subject: [PATCH 06/18] Batched output normalization --- clip.cpp | 19 +++++++++++++------ examples/main.cpp | 4 ++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/clip.cpp b/clip.cpp index e381942..6086011 100644 --- a/clip.cpp +++ b/clip.cpp @@ -1132,6 +1132,7 @@ bool clip_image_encode( // pre-layernorm { embeddings = ggml_norm(ctx0, embeddings); + ggml_set_name(embeddings, "check"); embeddings = ggml_add(ctx0, ggml_mul(ctx0, @@ -1703,13 +1704,19 @@ bool clip_image_batch_encode( embeddings = ggml_mul_mat(ctx0, model.projection, embeddings); // normalize output embeddings - struct ggml_tensor *embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, 0)); - ggml_tensor *length = ggml_sqrt(ctx0, - ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); - embeddings = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + struct ggml_tensor *output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, projection_dim, batch_size); + + for (int b = 0; b < batch_size; b++) + { + struct ggml_tensor *embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b)); + ggml_tensor *length = ggml_sqrt(ctx0, + ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); + embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding)); + } // run the computation - ggml_build_forward_expand(&gf, embeddings); + ggml_build_forward_expand(&gf, output); ggml_graph_compute(ctx0, &gf); // print @@ -1767,7 +1774,7 @@ bool clip_image_batch_encode( printf("used_mem = %zu\n", ggml_used_mem(ctx0)); #endif - memcpy(vec, ggml_get_data_f32(embeddings), sizeof(float) * projection_dim); + memcpy(vec, ggml_get_data_f32(output), sizeof(float) * projection_dim * batch_size); ggml_free(ctx0); diff --git a/examples/main.cpp b/examples/main.cpp index 920b356..49c817b 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -56,7 +56,7 @@ std::vector<clip_image_f32> imgs; imgs.push_back(img_res); - // imgs.push_back(img_res); + imgs.push_back(img_res); const int64_t t_image_encode_us = ggml_time_us(); @@ -68,7 +68,7 @@ int main(int argc, char **argv) const int64_t t_similarity_score = ggml_time_us(); - float score = clip_similarity_score(txt_vec, img_vec, vec_dim); + float score = clip_similarity_score(txt_vec, img_vec + vec_dim, vec_dim); printf("%s Similarity score = %2.3f\n", __func__, score); const int64_t t_main_end_us = ggml_time_us(); From 236b75537d34ed3467c5e81ebb750dd1f04becc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Tue, 4 Jul 2023 00:17:02 +0300 Subject: [PATCH 07/18] Full batch inference is working --- clip.cpp | 429 +++++++++------------------------------------- examples/main.cpp | 10 +- ggml | 2 +- 3 files changed, 86 insertions(+), 355 deletions(-) diff --git a/clip.cpp b/clip.cpp index 6086011..6a8db54 100644 --- a/clip.cpp +++ b/clip.cpp @@ -11,7 +11,7 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" -#define CLIP_DEBUG +// #define CLIP_DEBUG // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved // after that, remove this and use the mechanism implemented in GGML directly @@ 
-1058,6 +1058,17 @@ bool clip_image_encode( int n_threads, const clip_image_f32 &img, float *vec) +{ + std::vector<clip_image_f32> imgs; + imgs.push_back(img); + return clip_image_batch_encode(ctx, n_threads, imgs, vec); +} + +bool clip_image_batch_encode( + const clip_ctx *ctx, + int n_threads, + const std::vector<clip_image_f32> &imgs, + float *vec) { const auto &model = ctx->vision_model; const auto &hparams = model.hparams; @@ -1072,6 +1083,7 @@ bool clip_image_encode( const int n_layer = hparams.n_layer; const int n_intermediate = hparams.n_intermediate; const int projection_dim = hparams.projection_dim; + int batch_size = imgs.size(); auto &buf_compute = ctx->buf_compute; @@ -1088,38 +1100,48 @@ bool clip_image_encode( static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions); static void *scr0 = malloc(scr0_size); - struct ggml_tensor *inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, 1); + struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); { - float *data = (float *)ggml_get_data(inp); + float *data = (float *)ggml_get_data(inp_raw); - const int nx = img.nx; - const int ny = img.ny; - const int n = nx * ny; + for (int b = 0; b < imgs.size(); b++) + { + const int nx = imgs[b].nx; + const int ny = imgs[b].ny; + GGML_ASSERT(nx == image_size && ny == image_size); - GGML_ASSERT(nx == image_size && ny == image_size); + const int n = nx * ny; - for (int k = 0; k < 3; k++) - { - for (int y = 0; y < ny; y++) + for (int b = 0; b < batch_size; b++) { - for (int x = 0; x < nx; x++) + for (int k = 0; k < 3; k++) { - data[k * n + y * nx + x] = img.data[3 * (y * nx + x) + k]; + for (int y = 0; y < ny; y++) + { + for (int x = 0; x < nx; x++) + { + data[(b * 3 * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; + } + } } } } } - inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp); - inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); - inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); // concat class_embeddings and patch_embeddings - struct ggml_tensor *embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_size, num_positions); + struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_set_zero(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); + struct ggml_tensor *temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size); + + embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); for (int i = 0; i < num_positions; i++) @@ -1127,12 +1149,11 @@ bool clip_image_encode( ggml_set_i32_1d(positions, i, i); } - embeddings = ggml_add(ctx0, embeddings, 
ggml_get_rows(ctx0, model.position_embeddings, positions)); + embeddings = ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings)); // pre-layernorm { embeddings = ggml_norm(ctx0, embeddings); - ggml_set_name(embeddings, "check"); embeddings = ggml_add(ctx0, ggml_mul(ctx0, @@ -1146,6 +1167,8 @@ bool clip_image_encode( { struct ggml_tensor *cur = embeddings; // embeddings = residual, cur = hidden_states + const size_t nb_q_w = model.layers[il].q_w->nb[0]; + ggml_set_scratch(ctx0, {0, scr0_size, scr0}); // layernorm1 @@ -1161,44 +1184,48 @@ bool clip_image_encode( // self-attention { + struct ggml_tensor *Q = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), - ggml_mul_mat(ctx0, model.layers[il].q_w, cur)); + ggml_mul_mat(ctx0, model.layers[il].q_w, + cur)); Q = ggml_scale_inplace(ctx0, Q, ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))); - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, 1); + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); - struct ggml_tensor *K = - ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), - ggml_mul_mat(ctx0, model.layers[il].k_w, cur)); + struct ggml_tensor *K = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), + ggml_mul_mat(ctx0, model.layers[il].k_w, + cur)); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, 1); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - struct ggml_tensor *V = - ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), - ggml_mul_mat(ctx0, model.layers[il].v_w, cur)); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, 1); + struct ggml_tensor *V = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), + ggml_mul_mat(ctx0, model.layers[il].v_w, + cur)); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); KQ = ggml_soft_max_inplace(ctx0, KQ); struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, 1); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); cur = ggml_cpy(ctx0, KQV, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_size, num_positions)); + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size)); } // attention output cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), - ggml_mul_mat(ctx0, model.layers[il].o_w, cur)); + ggml_mul_mat(ctx0, model.layers[il].o_w, + cur)); // re-add the layer input, e.g., residual cur = ggml_add(ctx0, cur, embeddings); @@ -1242,8 +1269,12 @@ bool clip_image_encode( } // get the output of cls token, e.g., 0th index - struct ggml_tensor *cls = ggml_new_i32(ctx0, 0); - embeddings = ggml_get_rows(ctx0, embeddings, cls); + struct ggml_tensor *cls = ggml_new_tensor_1d(ctx0, 
GGML_TYPE_I32, batch_size); + for (int b = 0; b < batch_size; b++) + { + ggml_set_i32_1d(cls, b, b * num_positions); + } + embeddings = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, embeddings, hidden_size, num_positions * batch_size), cls); // post-layernorm { @@ -1262,14 +1293,20 @@ bool clip_image_encode( embeddings = ggml_mul_mat(ctx0, model.projection, embeddings); // normalize output embeddings - ggml_tensor *length = ggml_sqrt(ctx0, - ggml_sum(ctx0, ggml_sqr(ctx0, embeddings))); - embeddings = ggml_scale_inplace(ctx0, embeddings, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + struct ggml_tensor *output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, projection_dim, batch_size); - ggml_set_name(embeddings, "check"); + for (int b = 0; b < batch_size; b++) + { + struct ggml_tensor *embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b)); + ggml_tensor *length = ggml_sqrt(ctx0, + ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); + embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding)); + } + ggml_set_name(output, "check"); // run the computation - ggml_build_forward_expand(&gf, embeddings); + ggml_build_forward_expand(&gf, output); ggml_graph_compute(ctx0, &gf); // print @@ -1313,6 +1350,7 @@ bool clip_image_encode( }; auto *t = ggml_get_tensor(ctx0, "check"); + // auto t = inp_raw; if (t->type == GGML_TYPE_F32) { print_t_f32(t); @@ -1326,7 +1364,7 @@ bool clip_image_encode( printf("used_mem = %zu\n", ggml_used_mem(ctx0)); #endif - memcpy(vec, ggml_get_data_f32(embeddings), sizeof(float) * projection_dim); + memcpy(vec, ggml_get_data_f32(output), sizeof(float) * projection_dim * batch_size); ggml_free(ctx0); @@ -1477,306 +1515,3 @@ bool image_normalize(clip_image_u8 *img, clip_image_f32 *res) } return true; } - -bool clip_image_batch_encode( - const clip_ctx *ctx, - int n_threads, - const std::vector<clip_image_f32> &imgs, - float *vec) -{ - const auto &model = ctx->vision_model; - const auto &hparams = model.hparams; - - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); - const int num_positions = num_patches + 1; - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - const int n_intermediate = hparams.n_intermediate; - const int projection_dim = hparams.projection_dim; - const int batch_size = imgs.size(); - - auto &buf_compute = ctx->buf_compute; - - struct ggml_init_params params = { - .mem_size = buf_compute.size, - .mem_buffer = buf_compute.data, - .no_alloc = false, - }; - - struct ggml_context *ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; - - static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions); - static void *scr0 = malloc(scr0_size); - - struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); - - { - float *data = (float *)ggml_get_data(inp_raw); - - const int nx = imgs[0].nx; - const int ny = imgs[0].ny; - const int n = nx * ny; - - GGML_ASSERT(nx == image_size && ny == image_size); - - for (int b = 0; b < batch_size; b++) - { - for (int k = 0; k < 3; k++) - { - for (int y = 0; y < ny; y++) - { - for (int x = 0; x < nx; x++) - { - 
data[(b * 3 * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; - } - } - } - } - } - - struct ggml_tensor *inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp_raw); - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - - // concat class_embeddings and patch_embeddings - struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - - ggml_set_zero(embeddings); - struct ggml_tensor *temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size); - - embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - - struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); - for (int i = 0; i < num_positions; i++) - { - ggml_set_i32_1d(positions, i, i); - } - - embeddings = ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings)); - - // pre-layernorm - { - embeddings = ggml_norm(ctx0, embeddings); - ggml_set_name(embeddings, "check"); - - embeddings = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.pre_ln_w, embeddings), - embeddings), - ggml_repeat(ctx0, model.pre_ln_b, embeddings)); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) - { - struct ggml_tensor *cur = embeddings; // embeddings = residual, cur = hidden_states - - const size_t nb_q_w = model.layers[il].q_w->nb[0]; - - ggml_set_scratch(ctx0, {0, scr0_size, scr0}); - - // layernorm1 - { - cur = ggml_norm(ctx0, cur); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), - cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); - } - - // self-attention - { - - struct ggml_tensor *Q = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), - ggml_mul_mat(ctx0, model.layers[il].q_w, - cur)); - - Q = ggml_scale_inplace(ctx0, Q, ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))); - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor *K = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), - ggml_mul_mat(ctx0, model.layers[il].k_w, - cur)); - - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor *V = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), - ggml_mul_mat(ctx0, model.layers[il].v_w, - cur)); - - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - - struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_inplace(ctx0, KQ); - struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); - KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); - - cur = ggml_cpy(ctx0, - KQV, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size)); - } - - // attention 
output - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].o_b, cur), - ggml_mul_mat(ctx0, model.layers[il].o_w, - cur)); - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, embeddings); - - embeddings = cur; // embeddings = residual, cur = hidden_states - - // layernorm2 - { - cur = ggml_norm(ctx0, cur); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), - cur), - ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), - cur); - - if (ctx->use_gelu) - { - cur = ggml_gelu_inplace(ctx0, cur); - } - else - { - cur = ggml_gelu_quick_inplace(ctx0, cur); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), - cur); - - // residual 2 - cur = ggml_add(ctx0, embeddings, cur); - - embeddings = cur; - } - - // get the output of cls token, e.g., 0th index - struct ggml_tensor *cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch_size); - for (int b = 0; b < batch_size; b++) - { - ggml_set_i32_1d(cls, b, b * num_positions); - } - embeddings = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, embeddings, hidden_size, num_positions * batch_size), cls); - - // post-layernorm - { - embeddings = ggml_norm(ctx0, embeddings); - - embeddings = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.post_ln_w, embeddings), - embeddings), - ggml_repeat(ctx0, model.post_ln_b, embeddings)); - } - - ggml_set_scratch(ctx0, {0, 0, nullptr}); - - // final visual projection - embeddings = ggml_mul_mat(ctx0, model.projection, embeddings); - - // normalize output embeddings - struct ggml_tensor *output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, projection_dim, batch_size); - - for (int b = 0; b < batch_size; b++) - { - struct ggml_tensor *embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b)); - ggml_tensor *length = ggml_sqrt(ctx0, - ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); - embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); - output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding)); - } - - // run the computation - ggml_build_forward_expand(&gf, output); - ggml_graph_compute(ctx0, &gf); - -// print -#ifdef CLIP_DEBUG - { - auto print_t_f32 = [&](struct ggml_tensor *t) - { - float *data = (float *)t->data; - printf("dtype: f32, dims: %jd %jd %jd %jd, nb: %jd %jd %jd %jd\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3], t->nb[0], t->nb[1], t->nb[2], t->nb[3]); - printf("data: "); - for (int i = 0; i < std::min((int)t->ne[0], 20); i++) - { - printf("%f ", data[i]); - } - - // printf("\n\n"); - double sum = 0.0; - for (int i = 0; i < ggml_nelements(t); i++) - { - sum += data[i]; - } - printf("sum: %f\n", sum); - }; - - auto print_t_f16 = [&](struct ggml_tensor *t) - { - ggml_fp16_t *data = (ggml_fp16_t *)t->data; - printf("dtype: f16, dims: %jd %jd %jd %jd\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3]); - printf("data: "); - for (int i = 0; i < std::min((int)t->ne[0], 10); i++) - { - printf("%f ", ggml_fp16_to_fp32(data[i])); - } - printf("\n\n"); - double sum = 0.0; - for (int i = 0; i < ggml_nelements(t); i++) - { - sum += ggml_fp16_to_fp32(data[i]); - } - printf("sum: %f\n", sum); - }; - - auto *t = ggml_get_tensor(ctx0, "check"); - // auto t = inp_raw; - if (t->type == GGML_TYPE_F32) - { - print_t_f32(t); - } - else - { - 
print_t_f16(t); - } - } - - printf("used_mem = %zu\n", ggml_used_mem(ctx0)); -#endif - - memcpy(vec, ggml_get_data_f32(output), sizeof(float) * projection_dim * batch_size); - - ggml_free(ctx0); - - return true; -} diff --git a/examples/main.cpp b/examples/main.cpp index 49c817b..e38e047 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -54,21 +54,17 @@ int main(int argc, char **argv) clip_image_preprocess(ctx, &img0, &img_res); - std::vector<clip_image_f32> imgs; - imgs.push_back(img_res); - imgs.push_back(img_res); - const int64_t t_image_encode_us = ggml_time_us(); - float img_vec[vec_dim * 2]; - if (!clip_image_batch_encode(ctx, params.n_threads, imgs, img_vec)) + float img_vec[vec_dim]; + if (!clip_image_encode(ctx, params.n_threads, img_res, img_vec)) { return 1; } const int64_t t_similarity_score = ggml_time_us(); - float score = clip_similarity_score(txt_vec, img_vec + vec_dim, vec_dim); + float score = clip_similarity_score(txt_vec, img_vec, vec_dim); printf("%s Similarity score = %2.3f\n", __func__, score); const int64_t t_main_end_us = ggml_time_us(); diff --git a/ggml b/ggml index f967ae8..703c2a6 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit f967ae87c73dbc94201be22e346e35c51aacaa1b +Subproject commit 703c2a69bc93bb4ddac684ee956db324476e3f7f From 53250a52ede32666923d967d6356e252ee4c695e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Tue, 4 Jul 2023 00:19:37 +0300 Subject: [PATCH 08/18] Full batch inference is working --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index a96abb9..1d27ed4 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,9 @@ This repo is aimed at powering useful applications based on such models on compu clip.cpp also has a short startup time compared to large ML frameworks, which makes it suitable for serverless deployments where the cold start is an issue. +## Hot topics +- 06/04/2023: Batch inference support for image encoding. + ## Note about image preprocessing PIL uses a two-pass convolutions-based bicubic interpolation in resizing with antialiasing applied. In Pytorch, antialiasing is optional. It needs some extra attention to implement this preprocessing logic that matches their results numerically. However, I found that linear interpolation is also good enough for both comparison of different embeddings from this implementation and also comparison of an embedding from this implementation and another one from Transformers. So let's use it until we craft a proper bicubic interpolation. 
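A minimal sketch of driving the batched encoder that patches 01-08 converge on, assuming a model loaded with clip_model_load() and two images already run through clip_image_preprocess(); img_res0, img_res1, txt_vec, n_threads and vec_dim are hypothetical placeholders mirroring examples/main.cpp:

// Encode two preprocessed images in one batch. The output buffer holds
// batch_size embeddings of vec_dim floats laid out back to back.
std::vector<clip_image_f32> imgs;
imgs.push_back(img_res0);
imgs.push_back(img_res1);

std::vector<float> img_vecs(vec_dim * imgs.size());
if (!clip_image_batch_encode(ctx, n_threads, imgs, img_vecs.data()))
{
    return 1;
}

// the embedding of image b starts at img_vecs.data() + b * vec_dim
float score0 = clip_similarity_score(txt_vec, img_vecs.data(), vec_dim);
float score1 = clip_similarity_score(txt_vec, img_vecs.data() + vec_dim, vec_dim);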
From 334889648cbad2a108f8fd3f1824ed424476fe58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 6 Jul 2023 14:41:19 +0300 Subject: [PATCH 09/18] Sync ggml --- clip.cpp | 2 +- ggml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clip.cpp b/clip.cpp index 6a8db54..e36af2d 100644 --- a/clip.cpp +++ b/clip.cpp @@ -11,7 +11,7 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" -// #define CLIP_DEBUG +#define CLIP_DEBUG // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved // after that, remove this and use the mechanism implemented in GGML directly diff --git a/ggml b/ggml index 703c2a6..bc721e7 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 703c2a69bc93bb4ddac684ee956db324476e3f7f +Subproject commit bc721e70f390eae8294fc775fb9936103e503787 From b8d0f4318564c99238e4fffbe054f5ed6b9236a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 6 Jul 2023 14:41:58 +0300 Subject: [PATCH 10/18] Sync ggml --- clip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clip.cpp b/clip.cpp index e36af2d..6a8db54 100644 --- a/clip.cpp +++ b/clip.cpp @@ -11,7 +11,7 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" -#define CLIP_DEBUG +// #define CLIP_DEBUG // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved // after that, remove this and use the mechanism implemented in GGML directly From b1c028974d50f4bcf2b5a378be3e942d908c678b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 6 Jul 2023 14:44:13 +0300 Subject: [PATCH 11/18] Sync ggml --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1d27ed4..9179ba9 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ This repo is aimed at powering useful applications based on such models on compu clip.cpp also has a short startup time compared to large ML frameworks, which makes it suitable for serverless deployments where the cold start is an issue. ## Hot topics -- 06/04/2023: Batch inference support for image encoding. +- 07/04/2023: Batch inference support for image encoding. ## Note about image preprocessing PIL uses a two-pass convolutions-based bicubic interpolation in resizing with antialiasing applied. In Pytorch, antialiasing is optional. It needs some extra attention to implement this preprocessing logic that matches their results numerically. However, I found that linear interpolation is also good enough for both comparison of different embeddings from this implementation and also comparison of an embedding from this implementation and another one from Transformers. So let's use it until we craft a proper bicubic interpolation. 
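The output normalization assembled from ggml_sqr, ggml_sum, ggml_sqrt and ggml_scale in the patches above is a per-image L2 normalization of the projected embeddings. A plain C++ sketch of the same math applied to the copied-out vectors (a hypothetical helper, not part of the repo):

#include <cmath>

// Divide each vec_dim-sized embedding by its Euclidean length so that
// dot products between normalized embeddings become cosine similarities.
static void l2_normalize_batch(float *vecs, int batch_size, int vec_dim)
{
    for (int b = 0; b < batch_size; b++)
    {
        float *v = vecs + b * vec_dim;
        double sum_sq = 0.0;
        for (int i = 0; i < vec_dim; i++)
        {
            sum_sq += (double)v[i] * v[i];
        }
        const float inv_len = 1.0f / (float)sqrt(sum_sq);
        for (int i = 0; i < vec_dim; i++)
        {
            v[i] *= inv_len;
        }
    }
}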
From ee6ceca9e033ee04af6bbe87aa7ee08886d98ccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 7 Jul 2023 17:27:46 +0300 Subject: [PATCH 12/18] add multithreaded batched image preprocessing --- clip.cpp | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- clip.h | 1 + 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/clip.cpp b/clip.cpp index 6a8db54..e464e57 100644 --- a/clip.cpp +++ b/clip.cpp @@ -5,6 +5,8 @@ #include #include #include +#include <pthread.h> + #include "ggml/ggml.h" #include "clip.h" @@ -23,7 +25,7 @@ size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions) case 397: // base if (n_image_positions == 50) // patch size = 32 { - return 32 * mb; + return 8 * mb; } else // patch size = 16 { @@ -54,7 +56,7 @@ size_t get_scr_buf_req_by_size(const size_t n_tensors, const int n_positions) case 397: if (n_positions <= 50) { - return 512 * mb; + return 16 * mb; } else { @@ -252,6 +254,77 @@ bool clip_image_preprocess(const clip_ctx *ctx, const clip_image_u8 *img, clip_i return true; } +// Structure to hold the image data as an input to function to be executed for thread +typedef struct +{ + clip_image_u8 *input; + clip_image_f32 *resized; + const clip_ctx *ctx; +} ImageData; + +// Function to preprocess a single image in a thread +void *preprocess_image(void *arg) +{ + ImageData *imageData = static_cast<ImageData *>(arg); + clip_image_u8 *input = imageData->input; + clip_image_f32 *resized = imageData->resized; + const clip_ctx *ctx = imageData->ctx; + + // Call the original preprocess function on the image + clip_image_preprocess(ctx, input, resized); + + pthread_exit(NULL); +} + +// Function to batch-preprocess multiple images i +void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &img_resized, const int n_threads) +{ + GGML_ASSERT(img_inputs.size() == img_resized.size()); + int num_threads = std::min(n_threads, static_cast<int>(img_inputs.size())); + int i, t; + + // Divide the images among the threads + int images_per_thread = img_inputs.size() / num_threads; + + if (num_threads == 1) + { + // Single-threaded case + for (i = 0; i < img_inputs.size(); i++) + { + clip_image_preprocess(ctx, img_inputs[i], &img_resized[i]); + } + } + else + { + // Multi-threaded case + + std::vector<pthread_t> threads(num_threads); + std::vector<ImageData> imageData(img_inputs.size()); + + for (t = 0; t < num_threads; t++) + { + int start_index = t * images_per_thread; + int end_index = (t == num_threads - 1) ? 
img_inputs.size() : start_index + images_per_thread; + + // Create ImageData for each thread + for (i = start_index; i < end_index; i++) + { + imageData[i].input = img_inputs[i]; + imageData[i].resized = &img_resized[i]; + imageData[i].ctx = ctx; + } + + // Create a thread for each batch of images + pthread_create(&threads[t], NULL, preprocess_image, static_cast<void *>(&imageData[start_index])); + } + + // Wait for all threads to finish + for (t = 0; t < num_threads; t++) + { + pthread_join(threads[t], NULL); + } + } +} struct clip_ctx *clip_model_load(const char *fname, const int verbosity = 1) { diff --git a/clip.h b/clip.h index 9aa8360..8bc0dc7 100644 --- a/clip.h +++ b/clip.h @@ -200,6 +200,7 @@ std::vector<clip_vocab::id> clip_tokenize(const clip_ctx *ctx, const std::string bool clip_image_load_from_file(const std::string &fname, clip_image_u8 &img); bool clip_image_preprocess(const clip_ctx *ctx, const clip_image_u8 *img, clip_image_f32 *res); +void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &img_resized, const int n_threads); bool clip_text_encode( const clip_ctx *ctx, From ef3e0c97fa48ed1da0620259a776966370bc0dd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 7 Jul 2023 17:30:09 +0300 Subject: [PATCH 13/18] add multithreaded batched image preprocessing --- clip.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clip.cpp b/clip.cpp index e464e57..2b8d807 100644 --- a/clip.cpp +++ b/clip.cpp @@ -277,9 +277,9 @@ void *preprocess_image(void *arg) } // Function to batch-preprocess multiple images i -void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &img_resized, const int n_threads) +void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &imgs_resized, const int n_threads) { - GGML_ASSERT(img_inputs.size() == img_resized.size()); + GGML_ASSERT(img_inputs.size() == imgs_resized.size()); int num_threads = std::min(n_threads, static_cast<int>(img_inputs.size())); int i, t; @@ -291,7 +291,7 @@ void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector Date: Wed, 12 Jul 2023 03:08:14 +0300 Subject: [PATCH 14/18] Update batch preprocess function signature --- clip.cpp | 14 +++++++------- clip.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/clip.cpp b/clip.cpp index 926ed28..7448a11 100644 --- a/clip.cpp +++ b/clip.cpp @@ -25,7 +25,7 @@ size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions) case 397: // base if (n_image_positions == 50) // patch size = 32 { - return 8 * mb; + return 12 * mb; } else // patch size = 16 { @@ -56,7 +56,7 @@ size_t get_scr_buf_req_by_size(const size_t n_tensors, const int n_positions) case 397: if (n_positions <= 50) { - return 16 * mb; + return 32 * mb; } else { @@ -257,7 +257,7 @@ bool clip_image_preprocess(const clip_ctx *ctx, const clip_image_u8 *img, clip_i // Structure to hold the image data as an input to function to be executed for thread typedef struct { - clip_image_u8 *input; + const clip_image_u8 *input; clip_image_f32 *resized; const clip_ctx *ctx; } ImageData; @@ -266,7 +266,7 @@ typedef struct void *preprocess_image(void *arg) { ImageData *imageData = static_cast<ImageData *>(arg); - clip_image_u8 *input = imageData->input; + const clip_image_u8 *input = imageData->input; clip_image_f32 *resized = imageData->resized; const clip_ctx *ctx = imageData->ctx; @@ -277,7 +277,7 @@ void *preprocess_image(void *arg) } // Function to 
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 12 Jul 2023 03:08:14 +0300
Subject: [PATCH 14/18] Update batch preprocess function signature

---
 clip.cpp | 14 +++++++-------
 clip.h   |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/clip.cpp b/clip.cpp
index 926ed28..7448a11 100644
--- a/clip.cpp
+++ b/clip.cpp
@@ -25,7 +25,7 @@ size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions)
     case 397: // base
         if (n_image_positions == 50) // patch size = 32
         {
-            return 8 * mb;
+            return 12 * mb;
         }
         else // patch size = 16
         {
@@ -56,7 +56,7 @@ size_t get_scr_buf_req_by_size(const size_t n_tensors, const int n_positions)
     case 397:
         if (n_positions <= 50)
         {
-            return 16 * mb;
+            return 32 * mb;
         }
         else
         {
@@ -257,7 +257,7 @@ bool clip_image_preprocess(const clip_ctx *ctx, const clip_image_u8 *img, clip_i
 // Holds the data needed to preprocess one image on a worker thread
 typedef struct
 {
-    clip_image_u8 *input;
+    const clip_image_u8 *input;
     clip_image_f32 *resized;
     const clip_ctx *ctx;
 } ImageData;
@@ -266,7 +266,7 @@ typedef struct
 void *preprocess_image(void *arg)
 {
     ImageData *imageData = static_cast<ImageData *>(arg);
-    clip_image_u8 *input = imageData->input;
+    const clip_image_u8 *input = imageData->input;
     clip_image_f32 *resized = imageData->resized;
     const clip_ctx *ctx = imageData->ctx;
 
@@ -277,7 +277,7 @@ void *preprocess_image(void *arg)
 }
 
 // Function to batch-preprocess multiple images in parallel
-void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &imgs_resized, const int n_threads)
+void clip_image_batch_preprocess(const clip_ctx *ctx, const int n_threads, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &imgs_resized)
 {
     GGML_ASSERT(img_inputs.size() == imgs_resized.size());
     int num_threads = std::min(n_threads, static_cast<int>(img_inputs.size()));
diff --git a/clip.h b/clip.h
--- a/clip.h
+++ b/clip.h
@@ -200,7 +200,7 @@ std::vector<clip_vocab::id> clip_tokenize(const clip_ctx *ctx, const std::string
 bool clip_image_load_from_file(const std::string &fname, clip_image_u8 &img);
 bool clip_image_preprocess(const clip_ctx *ctx, const clip_image_u8 *img, clip_image_f32 *res);
-void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &img_resized, const int n_threads);
+void clip_image_batch_preprocess(const clip_ctx *ctx, const int n_threads, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &img_resized);
 
 bool clip_text_encode(
     const clip_ctx *ctx,

From 0907b408d16ab6b34bf101bad4f4eaebdefc9c4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 12 Jul 2023 03:09:12 +0300
Subject: [PATCH 15/18] Implement batch inference in benchmark util

---
 CMakeLists.txt      |  2 +-
 tests/benchmark.cpp | 79 +++++++++++++++++++++++----------------------
 2 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 50a6538..959a78b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,7 @@ endif()
 
 # general
 option(CLIP_STATIC "CLIP: static link libraries" OFF)
-option(CLIP_BUILD_TEST "CLIP: build tests" ${CLIP_STANDALONE})
+option(CLIP_BUILD_TESTS "CLIP: build tests" ${CLIP_STANDALONE})
 option(CLIP_BUILD_EXAMPLES "CLIP: build examples" ${CLIP_STANDALONE})
 option(CLIP_BUILD_IMAGE_SEARCH "CLIP: build image-search" OFF)
 option(CLIP_NATIVE "CLIP: enable -march=native flag" ON)
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 0b61073..6e31e29 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -44,7 +44,7 @@ int main(int argc, char **argv)
         return 1;
     }
 
-    fprintf(fout, "%s: %d directories found in %s\n\n", __func__, n_labels, dir_path.c_str());
+    fprintf(fout, "%s: %zu directories found in %s\n\n", __func__, n_labels, dir_path.c_str());
 
     auto ctx = clip_model_load(model_path.c_str(), 2);
     if (!ctx)
@@ -53,14 +53,11 @@ int main(int argc, char **argv)
         return 1;
     }
 
+    const size_t batch_size = 4;
+
     const int vec_dim = ctx->text_model.hparams.projection_dim;
 
-    // allocate memory for text vectors
-    float *txt_vecs = (float *)malloc(n_labels * vec_dim * sizeof(float));
-    if (!txt_vecs)
-    {
-        printf("%s: Could not allocate memory for %d vectors of %d dimensions\n", __func__, n_labels, vec_dim);
-    }
+    float txt_vecs[n_labels * vec_dim];
 
     ggml_time_init();
 
@@ -88,12 +85,13 @@ int main(int argc, char **argv)
     int n_total_items = 0;         // total number of images processed
     float total_acc1_score = 0.0f; // total accuracy at 1 for the entire dataset
     float total_acc5_score = 0.0f; // total accuracy at 5 for the entire dataset
-    float img_vec[vec_dim];
+    float img_vecs[vec_dim * batch_size];
+
     float similarities[n_labels];
     float sorted_scores[n_labels];
     int indices[n_labels];
-    clip_image_u8 img;
-    clip_image_f32 img_res;
+    std::vector<clip_image_u8> img_inputs(batch_size);
+    std::vector<clip_image_f32> imgs_resized(batch_size);
 
     // print table headers
     fprintf(fout, "| class name | acc@1 | acc@5 |\n");
@@ -107,56 +105,59 @@ int main(int argc, char **argv)
         int n_acc1 = 0;
         int n_acc5 = 0;
 
-        int64_t t_start_encode_images = ggml_time_us();
+        size_t n_batched = (entry.second.size() / batch_size) * batch_size;
 
-        for (auto &file_path : entry.second)
+        for (size_t i = 0; i < n_batched; i += batch_size)
         {
-            if (!clip_image_load_from_file(file_path, img))
+            for (size_t ib = i; ib < i + batch_size; ib++)
             {
-                printf("%s: cannot load file from %s\n", __func__, file_path.c_str());
-                return 1;
-            }
+                std::string file_path = entry.second[ib];
 
-            if (!clip_image_preprocess(ctx, &img, &img_res))
-            {
-                printf("%s: cannot preprocess image loaded from %s\n", __func__, file_path.c_str());
-                return 1;
+                if (!clip_image_load_from_file(file_path, img_inputs[ib % batch_size]))
+                {
+                    printf("%s: cannot load file from %s\n", __func__, file_path.c_str());
+                    return 1;
+                }
             }
 
-            clip_image_encode(ctx, 4, img_res, img_vec);
-            for (size_t i = 0; i < n_labels; i++)
-            {
-                similarities[i] = clip_similarity_score(img_vec, txt_vecs + i * vec_dim, vec_dim);
-            }
+            clip_image_batch_preprocess(ctx, 4, img_inputs, imgs_resized);
 
-            softmax_with_sorting(similarities, n_labels, sorted_scores, indices);
-            for (int j = 0; j < 5; j++)
+            clip_image_batch_encode(ctx, 4, imgs_resized, img_vecs);
+
+            for (size_t b = 0; b < batch_size; b++)
             {
-                if (j == 0 && indices[j] == label_idx)
+                for (size_t j = 0; j < n_labels; j++)
                 {
-                    n_acc1 += 1;
-                    n_acc5 += 1;
-                    break;
+                    similarities[j] = clip_similarity_score(img_vecs + b * vec_dim, txt_vecs + j * vec_dim, vec_dim);
                 }
-                else if (indices[j] == label_idx)
+                softmax_with_sorting(similarities, n_labels, sorted_scores, indices);
+
+                for (int k = 0; k < 5; k++)
                 {
-                    n_acc5 += 1;
-                    break;
+                    if (k == 0 && indices[k] == label_idx)
+                    {
+                        n_acc1 += 1;
+                        n_acc5 += 1;
+                        break;
+                    }
+                    else if (indices[k] == label_idx)
+                    {
+                        n_acc5 += 1;
+                        break;
+                    }
                 }
-            }
 
-            n_items += 1;
-            n_total_items += 1;
+                n_items += 1;
+                n_total_items += 1;
+            }
         }
 
         float acc1_score = (float)n_acc1 / n_items;
         float acc5_score = (float)n_acc5 / n_items;
         total_acc1_score += acc1_score;
         total_acc5_score += acc5_score;
-        // printf("%s: acc@1 = %2.4f - acc@5 = %2.4f\n", entry.first.c_str(), acc1_score, acc5_score);
 
         fprintf(fout, "| %-*s ", 20, entry.first.c_str());
         fprintf(fout, "| %2.4f | %2.4f |\n", acc1_score, acc5_score);
-
         label_idx += 1;
     }
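The nested branch the benchmark uses for accuracy bookkeeping is ordinary top-k hit counting: once softmax_with_sorting has ordered the class indices by descending score, the true label counts toward acc@1 only at rank 0 and toward acc@5 anywhere in the first five ranks. A condensed, equivalent sketch with made-up inputs:

// Equivalent formulation of the benchmark's acc@1 / acc@5 counting.
#include <cstdio>

static void count_topk(const int *sorted_indices, int label_idx, int *n_acc1, int *n_acc5)
{
    for (int k = 0; k < 5; k++)
    {
        if (sorted_indices[k] == label_idx)
        {
            if (k == 0)
            {
                *n_acc1 += 1;
            }
            *n_acc5 += 1;
            break;
        }
    }
}

int main()
{
    // hypothetical ranking: the true label 7 comes second,
    // so it contributes to acc@5 but not to acc@1
    const int indices[5] = {3, 7, 0, 1, 4};
    int n_acc1 = 0, n_acc5 = 0;
    count_topk(indices, 7, &n_acc1, &n_acc5);
    printf("acc@1 hits: %d, acc@5 hits: %d\n", n_acc1, n_acc5); // 0 and 1
    return 0;
}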
From 8c09113e154b8f3a589a47d5780a19e4546c227a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 12 Jul 2023 15:29:46 +0300
Subject: [PATCH 16/18] sync ggml

---
 clip.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/clip.cpp b/clip.cpp
index 7448a11..b77cbba 100644
--- a/clip.cpp
+++ b/clip.cpp
@@ -913,7 +913,6 @@ bool clip_text_encode(
 
     struct ggml_context *ctx0 = ggml_init(params);
     struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;
 
     static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), N);
     static void *scr0 = malloc(scr0_size);
@@ -1064,7 +1063,7 @@ bool clip_text_encode(
 
     // run the computation
     ggml_build_forward_expand(&gf, embeddings);
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
     // print
 #ifdef CLIP_DEBUG
@@ -1168,7 +1167,6 @@ bool clip_image_batch_encode(
 
     struct ggml_context *ctx0 = ggml_init(params);
     struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;
 
     static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions);
     static void *scr0 = malloc(scr0_size);
@@ -1380,7 +1378,7 @@ bool clip_image_batch_encode(
 
     // run the computation
     ggml_build_forward_expand(&gf, output);
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
     // print
 #ifdef CLIP_DEBUG
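The ggml sync above changes how graphs are executed: the per-graph gf.n_threads field is gone, and the thread count is passed when the graph is computed. A minimal standalone illustration of the new entry point, assuming a ggml checkout at or after the submodule commit pinned in the following patch:

// Sketch: compute c = a + b with ggml_graph_compute_with_ctx().
#include <cstdio>
#include "ggml/ggml.h"

int main()
{
    struct ggml_init_params params = {
        .mem_size = 16 * 1024 * 1024,
        .mem_buffer = NULL,
        .no_alloc = false,
    };
    struct ggml_context *ctx0 = ggml_init(params);

    struct ggml_tensor *a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 4);
    struct ggml_tensor *b = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 4);
    ggml_set_f32(a, 2.0f);
    ggml_set_f32(b, 3.0f);

    struct ggml_tensor *c = ggml_add(ctx0, a, b);

    struct ggml_cgraph gf = {};
    ggml_build_forward_expand(&gf, c);         // no gf.n_threads to set anymore
    ggml_graph_compute_with_ctx(ctx0, &gf, 4); // thread count is passed here

    printf("c[0] = %f\n", ggml_get_f32_1d(c, 0)); // expect 5.000000
    ggml_free(ctx0);
    return 0;
}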
From 8ea50bf72716e2268a4dc77854b4300bde50118c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 12 Jul 2023 15:54:12 +0300
Subject: [PATCH 17/18] set n_threads as const

---
 ggml                | 2 +-
 tests/benchmark.cpp | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/ggml b/ggml
index bc721e7..ad18208 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit bc721e70f390eae8294fc775fb9936103e503787
+Subproject commit ad18208ff5c2446ccc2b66c9f3bee6d59c1731f5
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 6e31e29..f23da4c 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -54,6 +54,7 @@ int main(int argc, char **argv)
     }
 
     const size_t batch_size = 4;
+    const size_t n_threads = 4;
 
     const int vec_dim = ctx->text_model.hparams.projection_dim;
 
@@ -70,7 +71,7 @@ int main(int argc, char **argv)
     for (const auto &entry : result)
     {
         auto tokens = clip_tokenize(ctx, entry.first);
-        if (!clip_text_encode(ctx, 4, tokens, txt_vecs + label_idx * vec_dim))
+        if (!clip_text_encode(ctx, n_threads, tokens, txt_vecs + label_idx * vec_dim))
         {
             printf("%s: Could not encode the label at index %d: %s\n", __func__, label_idx, entry.first.c_str());
             return 1;
@@ -120,9 +121,9 @@ int main(int argc, char **argv)
             }
         }
 
-        clip_image_batch_preprocess(ctx, 4, img_inputs, imgs_resized);
+        clip_image_batch_preprocess(ctx, n_threads, img_inputs, imgs_resized);
 
-        clip_image_batch_encode(ctx, 4, imgs_resized, img_vecs);
+        clip_image_batch_encode(ctx, n_threads, imgs_resized, img_vecs);
 
         for (size_t b = 0; b < batch_size; b++)
         {

From 5f4fa2fb48b5a787785876b6087d1016dab829e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 12 Jul 2023 20:52:31 +0300
Subject: [PATCH 18/18] Sync ggml

---
 README.md | 3 ++-
 ggml      | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c378faf..7fc927e 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,8 @@ This repo is aimed at powering useful applications based on such models on compu
 clip.cpp also has a short startup time compared to large ML frameworks, which makes it suitable for serverless deployments where the cold start is an issue.
 
 ## Hot topics
-- 07/04/2023: Batch inference support for image encoding.
+- 07/12/2023: Batch inference support for image encoding.
+- 07/11/2023: Semantic image search [example](examples/image-search/README.md) directly in C++.
 
 ## Note about image preprocessing
 PIL resizes with a two-pass, convolution-based bicubic interpolation and applies antialiasing; in PyTorch, antialiasing is optional. Implementing preprocessing that matches their results numerically takes some extra attention. However, I found that linear interpolation is good enough both for comparing different embeddings from this implementation and for comparing an embedding from this implementation with one from Transformers. So let's use it until we craft a proper bicubic interpolation.
diff --git a/ggml b/ggml
index ad18208..5621652 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit ad18208ff5c2446ccc2b66c9f3bee6d59c1731f5
+Subproject commit 56216523fa1df0c0bc36201dbecd8e0a01668d91
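To make the README's preprocessing note concrete: a bilinear (linear-interpolation) resize over a tightly packed RGB byte buffer, the layout clip_image_u8 uses, can be as small as the sketch below. This illustrates the "linear interpolation is good enough" claim only; it is not the exact kernel in clip.cpp.

// Sketch: bilinear resize of a packed RGB u8 image.
// dst must hold dst_w * dst_h * 3 bytes.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static void resize_bilinear_rgb(const uint8_t *src, int src_w, int src_h,
                                uint8_t *dst, int dst_w, int dst_h)
{
    const float sx = (float)src_w / dst_w;
    const float sy = (float)src_h / dst_h;

    for (int y = 0; y < dst_h; y++)
    {
        float fy = (y + 0.5f) * sy - 0.5f;
        if (fy < 0) fy = 0;
        const int y0 = (int)fy;
        const int y1 = std::min(y0 + 1, src_h - 1);
        const float wy = fy - y0;

        for (int x = 0; x < dst_w; x++)
        {
            float fx = (x + 0.5f) * sx - 0.5f;
            if (fx < 0) fx = 0;
            const int x0 = (int)fx;
            const int x1 = std::min(x0 + 1, src_w - 1);
            const float wx = fx - x0;

            for (int c = 0; c < 3; c++)
            {
                // blend the four neighbours: first along x on both rows, then along y
                const float top = (1 - wx) * src[3 * (y0 * src_w + x0) + c] + wx * src[3 * (y0 * src_w + x1) + c];
                const float bot = (1 - wx) * src[3 * (y1 * src_w + x0) + c] + wx * src[3 * (y1 * src_w + x1) + c];
                dst[3 * (y * dst_w + x) + c] = (uint8_t)((1 - wy) * top + wy * bot + 0.5f);
            }
        }
    }
}

int main()
{
    // shrink a 2x2 black/white checkerboard to 1x1: the result is the average
    const uint8_t src[2 * 2 * 3] = {0, 0, 0, 255, 255, 255, 255, 255, 255, 0, 0, 0};
    uint8_t dst[3];
    resize_bilinear_rgb(src, 2, 2, dst, 1, 1);
    printf("%d %d %d\n", dst[0], dst[1], dst[2]); // expect 128 128 128
    return 0;
}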