From c37aa34a9d89b3d272e14973fdeba3989c1e725e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 29 Jun 2023 23:11:55 +0300 Subject: [PATCH 01/18] WIP: Implement batch inference --- clip.cpp | 310 +++++++++++++++++++++++++++++++++++++++++++++- clip.h | 7 +- examples/main.cpp | 8 +- ggml | 2 +- 4 files changed, 319 insertions(+), 8 deletions(-) diff --git a/clip.cpp b/clip.cpp index b917270..b722abe 100644 --- a/clip.cpp +++ b/clip.cpp @@ -11,7 +11,7 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" -// #define CLIP_DEBUG +#define CLIP_DEBUG // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved // after that, remove this and use the mechanism implemented in GGML directly @@ -23,7 +23,7 @@ size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions) case 397: // base if (n_image_positions == 50) // patch size = 32 { - return 8 * mb; + return 16 * mb; } else // patch size = 16 { @@ -1236,7 +1236,6 @@ bool clip_image_encode( // residual 2 cur = ggml_add(ctx0, embeddings, cur); - // ggml_set_name(cur, "check"); embeddings = cur; } @@ -1477,3 +1476,308 @@ bool image_normalize(clip_image_u8 *img, clip_image_f32 *res) } return true; } + +bool clip_image_batch_encode( + const clip_ctx *ctx, + int n_threads, + const std::vector<clip_image_f32> &imgs, + float *vec) +{ + const auto &model = ctx->vision_model; + const auto &hparams = model.hparams; + + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_positions = num_patches + 1; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const int n_intermediate = hparams.n_intermediate; + const int projection_dim = hparams.projection_dim; + const int batch_size = imgs.size(); + + auto &buf_compute = ctx->buf_compute; + + struct ggml_init_params params = { + .mem_size = buf_compute.size, + .mem_buffer = buf_compute.data, + .no_alloc = false, + }; + + struct ggml_context *ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; + + static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions); + static void *scr0 = malloc(scr0_size); + + struct ggml_tensor *inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); + + { + float *data = (float *)ggml_get_data(inp); + + const int nx = imgs[0].nx; + const int ny = imgs[0].ny; + const int n = nx * ny; + + GGML_ASSERT(nx == image_size && ny == image_size); + + for (int b = 0; b < batch_size; b++) + { + for (int k = 0; k < 3; k++) + { + for (int y = 0; y < ny; y++) + { + for (int x = 0; x < nx; x++) + { + data[(b * k * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; + } + } + } + } + } + + inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp); + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + // concat class_embeddings and patch_embeddings + struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + /* + ggml_set_zero(embeddings); + for (int b = 0; b < batch_size; b++) + { + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1] / batch_size, 
embeddings->nb[2] / batch_size, embeddings->nb[3] / batch_size, b * (ggml_nbytes(embeddings) / batch_size)); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1] / batch_size, embeddings->nb[2] / batch_size, embeddings->nb[3] / batch_size, ggml_element_size(model.class_embedding) * hidden_size); + } + */ + + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); + + struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); + for (int i = 0; i < num_positions; i++) + { + ggml_set_i32_1d(positions, i, i); + } + + embeddings = ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings)); + + // pre-layernorm + { + embeddings = ggml_norm(ctx0, embeddings); + + embeddings = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.pre_ln_w, embeddings), + embeddings), + ggml_repeat(ctx0, model.pre_ln_b, embeddings)); + } + + struct ggml_tensor *temp_w = ggml_new_tensor_4d(ctx0, model.layers[0].q_w->type, hidden_size, hidden_size, batch_size, 1); + struct ggml_tensor *temp_i = ggml_new_tensor_4d(ctx0, model.layers[0].ff_i_w->type, hidden_size, n_intermediate, batch_size, 1); + struct ggml_tensor *temp_o = ggml_new_tensor_4d(ctx0, model.layers[0].ff_o_w->type, n_intermediate, hidden_size, batch_size, 1); + + // loop over layers + for (int il = 0; il < n_layer; il++) + { + struct ggml_tensor *cur = embeddings; // embeddings = residual, cur = hidden_states + + const size_t nb_q_w = model.layers[il].q_w->nb[0]; + + ggml_set_scratch(ctx0, {0, scr0_size, scr0}); + + // layernorm1 + { + cur = ggml_norm(ctx0, cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // self-attention + { + + struct ggml_tensor *Q = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), + ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].q_w, temp_w), + cur)); + + Q = ggml_scale_inplace(ctx0, Q, ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))); + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor *K = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), + ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].k_w, temp_w), + cur)); + + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor *V = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), + ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].v_w, temp_w), + cur)); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + + struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); + KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); + + 
cur = ggml_cpy(ctx0, + KQV, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size)); + } + + // attention output + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].o_b, cur), + ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].o_w, temp_w), + cur)); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // layernorm2 + { + cur = ggml_norm(ctx0, cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + } + + cur = ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_w, temp_i), cur); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), + cur); + + if (ctx->use_gelu) + { + cur = ggml_gelu_inplace(ctx0, cur); + } + else + { + cur = ggml_gelu_quick_inplace(ctx0, cur); + } + + cur = ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_w, temp_o), cur); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), + cur); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // get the output of cls token, e.g., 0th index + struct ggml_tensor *cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch_size); + for (int b = 0; b < batch_size; b++) + { + ggml_set_i32_1d(cls, b, b * num_positions); + } + embeddings = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, embeddings, hidden_size, num_positions * batch_size), cls); + + // post-layernorm + { + embeddings = ggml_norm(ctx0, embeddings); + + embeddings = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.post_ln_w, embeddings), + embeddings), + ggml_repeat(ctx0, model.post_ln_b, embeddings)); + } + + ggml_set_scratch(ctx0, {0, 0, nullptr}); + + // final visual projection + embeddings = ggml_mul_mat(ctx0, model.projection, embeddings); + + // normalize output embeddings + ggml_tensor *length = ggml_sqrt(ctx0, + ggml_sum(ctx0, ggml_sqr(ctx0, embeddings))); + embeddings = ggml_scale_inplace(ctx0, embeddings, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + + ggml_set_name(embeddings, "check"); + + // run the computation + ggml_build_forward_expand(&gf, embeddings); + ggml_graph_compute(ctx0, &gf); + +// print +#ifdef CLIP_DEBUG + { + auto print_t_f32 = [&](struct ggml_tensor *t) + { + float *data = (float *)t->data; + printf("dtype: f32, dims: %jd %jd %jd %jd, nb: %jd %jd %jd %jd\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3], t->nb[0], t->nb[1], t->nb[2], t->nb[3]); + printf("data: "); + for (int i = 0; i < std::min((int)t->ne[0], 20); i++) + { + printf("%f ", data[i]); + } + + // printf("\n\n"); + double sum = 0.0; + for (int i = 0; i < ggml_nelements(t); i++) + { + sum += data[i]; + } + printf("sum: %f\n", sum); + }; + + auto print_t_f16 = [&](struct ggml_tensor *t) + { + ggml_fp16_t *data = (ggml_fp16_t *)t->data; + printf("dtype: f16, dims: %jd %jd %jd %jd\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3]); + printf("data: "); + for (int i = 0; i < std::min((int)t->ne[0], 10); i++) + { + printf("%f ", ggml_fp16_to_fp32(data[i])); + } + printf("\n\n"); + double sum = 0.0; + for (int i = 0; i < ggml_nelements(t); i++) + { + sum += ggml_fp16_to_fp32(data[i]); + } + printf("sum: %f\n", sum); + }; + + auto *t = ggml_get_tensor(ctx0, "check"); + if (t->type == GGML_TYPE_F32) + { + print_t_f32(t); + } + else + { + print_t_f16(t); + } + } + + printf("used_mem = %zu\n", ggml_used_mem(ctx0)); +#endif + + memcpy(vec, 
ggml_get_data_f32(embeddings), sizeof(float) * projection_dim); + + ggml_free(ctx0); + + return true; +} diff --git a/clip.h b/clip.h index 7978019..9aa8360 100644 --- a/clip.h +++ b/clip.h @@ -219,8 +219,11 @@ bool clip_compare_text_and_image(clip_ctx *ctx, int n_threads, std::string &text float clip_similarity_score(float *vec1, float *vec2, int vec_dim); bool softmax_with_sorting(float *arr, int length, float *sorted_scores, int *indices); -// utils for debugging -void write_floats_to_file(float *array, int size, char *filename); +bool clip_image_batch_encode( + const clip_ctx *ctx, + int n_threads, + const std::vector<clip_image_f32> &imgs, + float *vec); // #ifdef __cplusplus // } diff --git a/examples/main.cpp b/examples/main.cpp index e38e047..920b356 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -54,10 +54,14 @@ int main(int argc, char **argv) clip_image_preprocess(ctx, &img0, &img_res); + std::vector<clip_image_f32> imgs; + imgs.push_back(img_res); + // imgs.push_back(img_res); + const int64_t t_image_encode_us = ggml_time_us(); - float img_vec[vec_dim]; - if (!clip_image_encode(ctx, params.n_threads, img_res, img_vec)) + float img_vec[vec_dim * 2]; + if (!clip_image_batch_encode(ctx, params.n_threads, imgs, img_vec)) { return 1; } diff --git a/ggml b/ggml index 93b94a2..d2b23a4 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 93b94a2d41e880cb2abfb708535d5b04ad05b7a5 +Subproject commit d2b23a4d628317e7ab5efbba8d22d178af381369 From 09dff96af030fb3301ac1fcf894ba0d6c584a680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 30 Jun 2023 10:40:14 +0300 Subject: [PATCH 02/18] WIP: use broadcastable mul_mat --- clip.cpp | 31 +++++++++++++------------------ ggml | 2 +- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/clip.cpp b/clip.cpp index b722abe..b7a793f 100644 --- a/clip.cpp +++ b/clip.cpp @@ -23,7 +23,7 @@ size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions) case 397: // base if (n_image_positions == 50) // patch size = 32 { - return 16 * mb; + return 32 * mb; } else // patch size = 16 { @@ -54,7 +54,7 @@ size_t get_scr_buf_req_by_size(const size_t n_tensors, const int n_positions) case 397: if (n_positions <= 50) { - return 16 * mb; + return 512 * mb; } else { @@ -1545,17 +1545,16 @@ bool clip_image_batch_encode( // concat class_embeddings and patch_embeddings struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - /* + ggml_set_zero(embeddings); for (int b = 0; b < batch_size; b++) { - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1] / batch_size, embeddings->nb[2] / batch_size, embeddings->nb[3] / batch_size, b * (ggml_nbytes(embeddings) / batch_size)); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1] / batch_size, embeddings->nb[2] / batch_size, embeddings->nb[3] / batch_size, ggml_element_size(model.class_embedding) * hidden_size); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3] / batch_size, 0); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3] / batch_size, ggml_element_size(model.class_embedding) * hidden_size); } - */ - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 
ggml_element_size(model.class_embedding) * hidden_size); + // embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + // embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); for (int i = 0; i < num_positions; i++) @@ -1576,10 +1575,6 @@ bool clip_image_batch_encode( ggml_repeat(ctx0, model.pre_ln_b, embeddings)); } - struct ggml_tensor *temp_w = ggml_new_tensor_4d(ctx0, model.layers[0].q_w->type, hidden_size, hidden_size, batch_size, 1); - struct ggml_tensor *temp_i = ggml_new_tensor_4d(ctx0, model.layers[0].ff_i_w->type, hidden_size, n_intermediate, batch_size, 1); - struct ggml_tensor *temp_o = ggml_new_tensor_4d(ctx0, model.layers[0].ff_o_w->type, n_intermediate, hidden_size, batch_size, 1); - // loop over layers for (int il = 0; il < n_layer; il++) { @@ -1604,7 +1599,7 @@ bool clip_image_batch_encode( { struct ggml_tensor *Q = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), - ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].q_w, temp_w), + ggml_mul_mat(ctx0, model.layers[il].q_w, cur)); Q = ggml_scale_inplace(ctx0, Q, ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))); @@ -1613,7 +1608,7 @@ bool clip_image_batch_encode( Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); struct ggml_tensor *K = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), - ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].k_w, temp_w), + ggml_mul_mat(ctx0, model.layers[il].k_w, cur)); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); @@ -1621,7 +1616,7 @@ bool clip_image_batch_encode( K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); struct ggml_tensor *V = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), - ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].v_w, temp_w), + ggml_mul_mat(ctx0, model.layers[il].v_w, cur)); V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); @@ -1642,7 +1637,7 @@ bool clip_image_batch_encode( // attention output cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), - ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].o_w, temp_w), + ggml_mul_mat(ctx0, model.layers[il].o_w, cur)); // re-add the layer input, e.g., residual @@ -1661,7 +1656,7 @@ bool clip_image_batch_encode( ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); } - cur = ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_w, temp_i), cur); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), cur); @@ -1675,7 +1670,7 @@ bool clip_image_batch_encode( cur = ggml_gelu_quick_inplace(ctx0, cur); } - cur = ggml_mul_mat(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_w, temp_o), cur); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), cur); diff --git a/ggml b/ggml index d2b23a4..c16c9e5 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit d2b23a4d628317e7ab5efbba8d22d178af381369 +Subproject commit c16c9e56ec48a139fa179f3852f4c154b2272f26 From bfab116fd3b7e09081ea2b8e368d7b4e3fffc05e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 30 Jun 2023 15:00:27 +0300 Subject: [PATCH 03/18] Batched Conv2D is working --- clip.cpp | 22 
++++++++++++---------- examples/main.cpp | 2 +- ggml | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/clip.cpp b/clip.cpp index b7a793f..dccdef5 100644 --- a/clip.cpp +++ b/clip.cpp @@ -1513,10 +1513,10 @@ bool clip_image_batch_encode( static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions); static void *scr0 = malloc(scr0_size); - struct ggml_tensor *inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); + struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); { - float *data = (float *)ggml_get_data(inp); + float *data = (float *)ggml_get_data(inp_raw); const int nx = imgs[0].nx; const int ny = imgs[0].ny; @@ -1532,29 +1532,30 @@ bool clip_image_batch_encode( { for (int x = 0; x < nx; x++) { - data[(b * k * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; + data[(b * 3 * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; } } } } } - inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp); + struct ggml_tensor *inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp_raw); inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + ggml_set_name(inp, "check"); // concat class_embeddings and patch_embeddings struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); ggml_set_zero(embeddings); - for (int b = 0; b < batch_size; b++) + + // TODO: correct thisconcat op + // for (int b = 0; b < batch_size; b++) { - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3] / batch_size, 0); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3] / batch_size, ggml_element_size(model.class_embedding) * hidden_size); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); } - - // embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - // embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); + ggml_set_name(embeddings, "check"); struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); for (int i = 0; i < num_positions; i++) @@ -1757,6 +1758,7 @@ bool clip_image_batch_encode( }; auto *t = ggml_get_tensor(ctx0, "check"); + // auto t = inp_raw; if (t->type == GGML_TYPE_F32) { print_t_f32(t); diff --git a/examples/main.cpp b/examples/main.cpp index 920b356..51b1ae6 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -56,7 +56,7 @@ int main(int argc, char **argv) std::vector imgs; imgs.push_back(img_res); - // imgs.push_back(img_res); + imgs.push_back(img_res); const int64_t t_image_encode_us = ggml_time_us(); diff --git a/ggml b/ggml index c16c9e5..f967ae8 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit c16c9e56ec48a139fa179f3852f4c154b2272f26 +Subproject commit f967ae87c73dbc94201be22e346e35c51aacaa1b From c80a020a25881d3102e34ae88be6d4b6af93f0f2 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 30 Jun 2023 15:06:13 +0300 Subject: [PATCH 04/18] Batched Conv2D is working --- clip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clip.cpp b/clip.cpp index dccdef5..49bc8f0 100644 --- a/clip.cpp +++ b/clip.cpp @@ -1549,7 +1549,7 @@ bool clip_image_batch_encode( ggml_set_zero(embeddings); - // TODO: correct thisconcat op + // TODO: correct this concat op // for (int b = 0; b < batch_size; b++) { embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); From 502a6694f665b23a58c6c1edf8244ec6b95cddf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 30 Jun 2023 19:38:38 +0300 Subject: [PATCH 05/18] Fix concat --- clip.cpp | 19 +++++++------------ examples/main.cpp | 2 +- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/clip.cpp b/clip.cpp index 49bc8f0..e381942 100644 --- a/clip.cpp +++ b/clip.cpp @@ -1542,20 +1542,15 @@ bool clip_image_batch_encode( struct ggml_tensor *inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp_raw); inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - ggml_set_name(inp, "check"); // concat class_embeddings and patch_embeddings struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); ggml_set_zero(embeddings); + struct ggml_tensor *temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size); - // TODO: correct this concat op - // for (int b = 0; b < batch_size; b++) - { - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); - } - ggml_set_name(embeddings, "check"); + embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); for (int i = 0; i < num_positions; i++) @@ -1568,6 +1563,7 @@ bool clip_image_batch_encode( // pre-layernorm { embeddings = ggml_norm(ctx0, embeddings); + ggml_set_name(embeddings, "check"); embeddings = ggml_add(ctx0, ggml_mul(ctx0, @@ -1707,11 +1703,10 @@ bool clip_image_batch_encode( embeddings = ggml_mul_mat(ctx0, model.projection, embeddings); // normalize output embeddings + struct ggml_tensor *embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, 0)); ggml_tensor *length = ggml_sqrt(ctx0, - ggml_sum(ctx0, ggml_sqr(ctx0, embeddings))); - embeddings = ggml_scale_inplace(ctx0, embeddings, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); - - ggml_set_name(embeddings, "check"); + ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); + embeddings = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); // run the computation ggml_build_forward_expand(&gf, embeddings); diff --git a/examples/main.cpp b/examples/main.cpp index 51b1ae6..920b356 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -56,7 +56,7 @@ std::vector<clip_image_f32> imgs; imgs.push_back(img_res); - imgs.push_back(img_res); + // 
imgs.push_back(img_res); const int64_t t_image_encode_us = ggml_time_us(); From 734f6db4a0528b1810729f03b2ef3fd60bdcdf49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Sun, 2 Jul 2023 13:28:08 +0300 Subject: [PATCH 06/18] Batched output normalization --- clip.cpp | 19 +++++++++++++------ examples/main.cpp | 4 ++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/clip.cpp b/clip.cpp index e381942..6086011 100644 --- a/clip.cpp +++ b/clip.cpp @@ -1132,6 +1132,7 @@ bool clip_image_encode( // pre-layernorm { embeddings = ggml_norm(ctx0, embeddings); + ggml_set_name(embeddings, "check"); embeddings = ggml_add(ctx0, ggml_mul(ctx0, @@ -1703,13 +1704,19 @@ bool clip_image_batch_encode( embeddings = ggml_mul_mat(ctx0, model.projection, embeddings); // normalize output embeddings - struct ggml_tensor *embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, 0)); - ggml_tensor *length = ggml_sqrt(ctx0, - ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); - embeddings = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + struct ggml_tensor *output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, projection_dim, batch_size); + + for (int b = 0; b < batch_size; b++) + { + struct ggml_tensor *embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b)); + ggml_tensor *length = ggml_sqrt(ctx0, + ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); + embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding)); + } // run the computation - ggml_build_forward_expand(&gf, embeddings); + ggml_build_forward_expand(&gf, output); ggml_graph_compute(ctx0, &gf); // print @@ -1767,7 +1774,7 @@ bool clip_image_batch_encode( printf("used_mem = %zu\n", ggml_used_mem(ctx0)); #endif - memcpy(vec, ggml_get_data_f32(embeddings), sizeof(float) * projection_dim); + memcpy(vec, ggml_get_data_f32(output), sizeof(float) * projection_dim * batch_size); ggml_free(ctx0); diff --git a/examples/main.cpp b/examples/main.cpp index 920b356..49c817b 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -56,7 +56,7 @@ std::vector<clip_image_f32> imgs; imgs.push_back(img_res); - // imgs.push_back(img_res); + imgs.push_back(img_res); const int64_t t_image_encode_us = ggml_time_us(); @@ -68,7 +68,7 @@ int main(int argc, char **argv) const int64_t t_similarity_score = ggml_time_us(); - float score = clip_similarity_score(txt_vec, img_vec, vec_dim); + float score = clip_similarity_score(txt_vec, img_vec + vec_dim, vec_dim); printf("%s Similarity score = %2.3f\n", __func__, score); const int64_t t_main_end_us = ggml_time_us(); From 236b75537d34ed3467c5e81ebb750dd1f04becc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Tue, 4 Jul 2023 00:17:02 +0300 Subject: [PATCH 07/18] Full batch inference is working --- clip.cpp | 429 +++++++++------------------------------------- examples/main.cpp | 10 +- ggml | 2 +- 3 files changed, 86 insertions(+), 355 deletions(-) diff --git a/clip.cpp b/clip.cpp index 6086011..6a8db54 100644 --- a/clip.cpp +++ b/clip.cpp @@ -11,7 +11,7 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" -#define CLIP_DEBUG +// #define CLIP_DEBUG // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved // after that, remove this and use the mechanism implemented in GGML directly @@ 
-1058,6 +1058,17 @@ bool clip_image_encode( int n_threads, const clip_image_f32 &img, float *vec) +{ + std::vector<clip_image_f32> imgs; + imgs.push_back(img); + return clip_image_batch_encode(ctx, n_threads, imgs, vec); +} + +bool clip_image_batch_encode( + const clip_ctx *ctx, + int n_threads, + const std::vector<clip_image_f32> &imgs, + float *vec) { const auto &model = ctx->vision_model; const auto &hparams = model.hparams; @@ -1072,6 +1083,7 @@ bool clip_image_encode( const int n_layer = hparams.n_layer; const int n_intermediate = hparams.n_intermediate; const int projection_dim = hparams.projection_dim; + int batch_size = imgs.size(); auto &buf_compute = ctx->buf_compute; @@ -1088,38 +1100,48 @@ bool clip_image_encode( static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions); static void *scr0 = malloc(scr0_size); - struct ggml_tensor *inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, 1); + struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); { - float *data = (float *)ggml_get_data(inp); + float *data = (float *)ggml_get_data(inp_raw); - const int nx = img.nx; - const int ny = img.ny; - const int n = nx * ny; + for (int b = 0; b < imgs.size(); b++) + { + const int nx = imgs[b].nx; + const int ny = imgs[b].ny; + GGML_ASSERT(nx == image_size && ny == image_size); - GGML_ASSERT(nx == image_size && ny == image_size); + const int n = nx * ny; - for (int k = 0; k < 3; k++) - { - for (int y = 0; y < ny; y++) + for (int b = 0; b < batch_size; b++) { - for (int x = 0; x < nx; x++) + for (int k = 0; k < 3; k++) { - data[k * n + y * nx + x] = img.data[3 * (y * nx + x) + k]; + for (int y = 0; y < ny; y++) + { + for (int x = 0; x < nx; x++) + { + data[(b * 3 * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; + } + } } } } } - inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp); - inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); - inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); // concat class_embeddings and patch_embeddings - struct ggml_tensor *embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_size, num_positions); + struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_set_zero(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], ggml_element_size(model.class_embedding) * hidden_size); + struct ggml_tensor *temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size); + + embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); for (int i = 0; i < num_positions; i++) @@ -1127,12 +1149,11 @@ bool clip_image_encode( ggml_set_i32_1d(positions, i, i); } - embeddings = ggml_add(ctx0, embeddings, 
ggml_get_rows(ctx0, model.position_embeddings, positions)); + embeddings = ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings)); // pre-layernorm { embeddings = ggml_norm(ctx0, embeddings); - ggml_set_name(embeddings, "check"); embeddings = ggml_add(ctx0, ggml_mul(ctx0, @@ -1146,6 +1167,8 @@ bool clip_image_encode( { struct ggml_tensor *cur = embeddings; // embeddings = residual, cur = hidden_states + const size_t nb_q_w = model.layers[il].q_w->nb[0]; + ggml_set_scratch(ctx0, {0, scr0_size, scr0}); // layernorm1 @@ -1161,44 +1184,48 @@ bool clip_image_encode( // self-attention { + struct ggml_tensor *Q = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), - ggml_mul_mat(ctx0, model.layers[il].q_w, cur)); + ggml_mul_mat(ctx0, model.layers[il].q_w, + cur)); Q = ggml_scale_inplace(ctx0, Q, ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))); - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, 1); + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); - struct ggml_tensor *K = - ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), - ggml_mul_mat(ctx0, model.layers[il].k_w, cur)); + struct ggml_tensor *K = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), + ggml_mul_mat(ctx0, model.layers[il].k_w, + cur)); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, 1); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - struct ggml_tensor *V = - ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), - ggml_mul_mat(ctx0, model.layers[il].v_w, cur)); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, 1); + struct ggml_tensor *V = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), + ggml_mul_mat(ctx0, model.layers[il].v_w, + cur)); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); KQ = ggml_soft_max_inplace(ctx0, KQ); struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, 1); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); cur = ggml_cpy(ctx0, KQV, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_size, num_positions)); + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size)); } // attention output cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), - ggml_mul_mat(ctx0, model.layers[il].o_w, cur)); + ggml_mul_mat(ctx0, model.layers[il].o_w, + cur)); // re-add the layer input, e.g., residual cur = ggml_add(ctx0, cur, embeddings); @@ -1242,8 +1269,12 @@ bool clip_image_encode( } // get the output of cls token, e.g., 0th index - struct ggml_tensor *cls = ggml_new_i32(ctx0, 0); - embeddings = ggml_get_rows(ctx0, embeddings, cls); + struct ggml_tensor *cls = ggml_new_tensor_1d(ctx0, 
GGML_TYPE_I32, batch_size); + for (int b = 0; b < batch_size; b++) + { + ggml_set_i32_1d(cls, b, b * num_positions); + } + embeddings = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, embeddings, hidden_size, num_positions * batch_size), cls); // post-layernorm { @@ -1262,14 +1293,20 @@ bool clip_image_encode( embeddings = ggml_mul_mat(ctx0, model.projection, embeddings); // normalize output embeddings - ggml_tensor *length = ggml_sqrt(ctx0, - ggml_sum(ctx0, ggml_sqr(ctx0, embeddings))); - embeddings = ggml_scale_inplace(ctx0, embeddings, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + struct ggml_tensor *output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, projection_dim, batch_size); - ggml_set_name(embeddings, "check"); + for (int b = 0; b < batch_size; b++) + { + struct ggml_tensor *embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b)); + ggml_tensor *length = ggml_sqrt(ctx0, + ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); + embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding)); + } + ggml_set_name(output, "check"); // run the computation - ggml_build_forward_expand(&gf, embeddings); + ggml_build_forward_expand(&gf, output); ggml_graph_compute(ctx0, &gf); // print @@ -1313,6 +1350,7 @@ bool clip_image_encode( }; auto *t = ggml_get_tensor(ctx0, "check"); + // auto t = inp_raw; if (t->type == GGML_TYPE_F32) { print_t_f32(t); @@ -1326,7 +1364,7 @@ bool clip_image_encode( printf("used_mem = %zu\n", ggml_used_mem(ctx0)); #endif - memcpy(vec, ggml_get_data_f32(embeddings), sizeof(float) * projection_dim); + memcpy(vec, ggml_get_data_f32(output), sizeof(float) * projection_dim * batch_size); ggml_free(ctx0); @@ -1477,306 +1515,3 @@ bool image_normalize(clip_image_u8 *img, clip_image_f32 *res) } return true; } - -bool clip_image_batch_encode( - const clip_ctx *ctx, - int n_threads, - const std::vector<clip_image_f32> &imgs, - float *vec) -{ - const auto &model = ctx->vision_model; - const auto &hparams = model.hparams; - - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); - const int num_positions = num_patches + 1; - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - const int n_intermediate = hparams.n_intermediate; - const int projection_dim = hparams.projection_dim; - const int batch_size = imgs.size(); - - auto &buf_compute = ctx->buf_compute; - - struct ggml_init_params params = { - .mem_size = buf_compute.size, - .mem_buffer = buf_compute.data, - .no_alloc = false, - }; - - struct ggml_context *ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; - - static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions); - static void *scr0 = malloc(scr0_size); - - struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); - - { - float *data = (float *)ggml_get_data(inp_raw); - - const int nx = imgs[0].nx; - const int ny = imgs[0].ny; - const int n = nx * ny; - - GGML_ASSERT(nx == image_size && ny == image_size); - - for (int b = 0; b < batch_size; b++) - { - for (int k = 0; k < 3; k++) - { - for (int y = 0; y < ny; y++) - { - for (int x = 0; x < nx; x++) - { - 
data[(b * 3 * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; - } - } - } - } - } - - struct ggml_tensor *inp = ggml_conv_2d_sk_p0(ctx0, model.patch_embeddings, inp_raw); - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - - // concat class_embeddings and patch_embeddings - struct ggml_tensor *embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - - ggml_set_zero(embeddings); - struct ggml_tensor *temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size); - - embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - - struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); - for (int i = 0; i < num_positions; i++) - { - ggml_set_i32_1d(positions, i, i); - } - - embeddings = ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings)); - - // pre-layernorm - { - embeddings = ggml_norm(ctx0, embeddings); - ggml_set_name(embeddings, "check"); - - embeddings = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.pre_ln_w, embeddings), - embeddings), - ggml_repeat(ctx0, model.pre_ln_b, embeddings)); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) - { - struct ggml_tensor *cur = embeddings; // embeddings = residual, cur = hidden_states - - const size_t nb_q_w = model.layers[il].q_w->nb[0]; - - ggml_set_scratch(ctx0, {0, scr0_size, scr0}); - - // layernorm1 - { - cur = ggml_norm(ctx0, cur); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), - cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); - } - - // self-attention - { - - struct ggml_tensor *Q = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), - ggml_mul_mat(ctx0, model.layers[il].q_w, - cur)); - - Q = ggml_scale_inplace(ctx0, Q, ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))); - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor *K = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), - ggml_mul_mat(ctx0, model.layers[il].k_w, - cur)); - - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor *V = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), - ggml_mul_mat(ctx0, model.layers[il].v_w, - cur)); - - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - - struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_inplace(ctx0, KQ); - struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); - KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); - - cur = ggml_cpy(ctx0, - KQV, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size)); - } - - // attention 
output - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].o_b, cur), - ggml_mul_mat(ctx0, model.layers[il].o_w, - cur)); - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, embeddings); - - embeddings = cur; // embeddings = residual, cur = hidden_states - - // layernorm2 - { - cur = ggml_norm(ctx0, cur); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), - cur), - ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), - cur); - - if (ctx->use_gelu) - { - cur = ggml_gelu_inplace(ctx0, cur); - } - else - { - cur = ggml_gelu_quick_inplace(ctx0, cur); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), - cur); - - // residual 2 - cur = ggml_add(ctx0, embeddings, cur); - - embeddings = cur; - } - - // get the output of cls token, e.g., 0th index - struct ggml_tensor *cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch_size); - for (int b = 0; b < batch_size; b++) - { - ggml_set_i32_1d(cls, b, b * num_positions); - } - embeddings = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, embeddings, hidden_size, num_positions * batch_size), cls); - - // post-layernorm - { - embeddings = ggml_norm(ctx0, embeddings); - - embeddings = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.post_ln_w, embeddings), - embeddings), - ggml_repeat(ctx0, model.post_ln_b, embeddings)); - } - - ggml_set_scratch(ctx0, {0, 0, nullptr}); - - // final visual projection - embeddings = ggml_mul_mat(ctx0, model.projection, embeddings); - - // normalize output embeddings - struct ggml_tensor *output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, projection_dim, batch_size); - - for (int b = 0; b < batch_size; b++) - { - struct ggml_tensor *embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b)); - ggml_tensor *length = ggml_sqrt(ctx0, - ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); - embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); - output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding)); - } - - // run the computation - ggml_build_forward_expand(&gf, output); - ggml_graph_compute(ctx0, &gf); - -// print -#ifdef CLIP_DEBUG - { - auto print_t_f32 = [&](struct ggml_tensor *t) - { - float *data = (float *)t->data; - printf("dtype: f32, dims: %jd %jd %jd %jd, nb: %jd %jd %jd %jd\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3], t->nb[0], t->nb[1], t->nb[2], t->nb[3]); - printf("data: "); - for (int i = 0; i < std::min((int)t->ne[0], 20); i++) - { - printf("%f ", data[i]); - } - - // printf("\n\n"); - double sum = 0.0; - for (int i = 0; i < ggml_nelements(t); i++) - { - sum += data[i]; - } - printf("sum: %f\n", sum); - }; - - auto print_t_f16 = [&](struct ggml_tensor *t) - { - ggml_fp16_t *data = (ggml_fp16_t *)t->data; - printf("dtype: f16, dims: %jd %jd %jd %jd\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3]); - printf("data: "); - for (int i = 0; i < std::min((int)t->ne[0], 10); i++) - { - printf("%f ", ggml_fp16_to_fp32(data[i])); - } - printf("\n\n"); - double sum = 0.0; - for (int i = 0; i < ggml_nelements(t); i++) - { - sum += ggml_fp16_to_fp32(data[i]); - } - printf("sum: %f\n", sum); - }; - - auto *t = ggml_get_tensor(ctx0, "check"); - // auto t = inp_raw; - if (t->type == GGML_TYPE_F32) - { - print_t_f32(t); - } - else - { - 
print_t_f16(t); - } - } - - printf("used_mem = %zu\n", ggml_used_mem(ctx0)); -#endif - - memcpy(vec, ggml_get_data_f32(output), sizeof(float) * projection_dim * batch_size); - - ggml_free(ctx0); - - return true; -} diff --git a/examples/main.cpp b/examples/main.cpp index 49c817b..e38e047 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -54,21 +54,17 @@ int main(int argc, char **argv) clip_image_preprocess(ctx, &img0, &img_res); - std::vector<clip_image_f32> imgs; - imgs.push_back(img_res); - imgs.push_back(img_res); - const int64_t t_image_encode_us = ggml_time_us(); - float img_vec[vec_dim * 2]; - if (!clip_image_batch_encode(ctx, params.n_threads, imgs, img_vec)) + float img_vec[vec_dim]; + if (!clip_image_encode(ctx, params.n_threads, img_res, img_vec)) { return 1; } const int64_t t_similarity_score = ggml_time_us(); - float score = clip_similarity_score(txt_vec, img_vec + vec_dim, vec_dim); + float score = clip_similarity_score(txt_vec, img_vec, vec_dim); printf("%s Similarity score = %2.3f\n", __func__, score); const int64_t t_main_end_us = ggml_time_us(); diff --git a/ggml b/ggml index f967ae8..703c2a6 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit f967ae87c73dbc94201be22e346e35c51aacaa1b +Subproject commit 703c2a69bc93bb4ddac684ee956db324476e3f7f From 53250a52ede32666923d967d6356e252ee4c695e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Tue, 4 Jul 2023 00:19:37 +0300 Subject: [PATCH 08/18] Full batch inference is working --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index a96abb9..1d27ed4 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,9 @@ This repo is aimed at powering useful applications based on such models on compu clip.cpp also has a short startup time compared to large ML frameworks, which makes it suitable for serverless deployments where the cold start is an issue. +## Hot topics +- 06/04/2023: Batch inference support for image encoding. + ## Note about image preprocessing PIL uses a two-pass convolutions-based bicubic interpolation in resizing with antialiasing applied. In Pytorch, antialiasing is optional. It needs some extra attention to implement this preprocessing logic that matches their results numerically. However, I found that linear interpolation is also good enough for both comparison of different embeddings from this implementation and also comparison of an embedding from this implementation and another one from Transformers. So let's use it until we craft a proper bicubic interpolation. 
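A minimal sketch of driving the batched encoder that patches 01-08 converge on, assuming a model loaded with clip_model_load() and two images already run through clip_image_preprocess(); img_res0, img_res1, txt_vec, n_threads and vec_dim are hypothetical placeholders mirroring examples/main.cpp:

// Encode two preprocessed images in one batch. The output buffer holds
// batch_size embeddings of vec_dim floats laid out back to back.
std::vector<clip_image_f32> imgs;
imgs.push_back(img_res0);
imgs.push_back(img_res1);

std::vector<float> img_vecs(vec_dim * imgs.size());
if (!clip_image_batch_encode(ctx, n_threads, imgs, img_vecs.data()))
{
    return 1;
}

// the embedding of image b starts at img_vecs.data() + b * vec_dim
float score0 = clip_similarity_score(txt_vec, img_vecs.data(), vec_dim);
float score1 = clip_similarity_score(txt_vec, img_vecs.data() + vec_dim, vec_dim);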
From 334889648cbad2a108f8fd3f1824ed424476fe58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 6 Jul 2023 14:41:19 +0300 Subject: [PATCH 09/18] Sync ggml --- clip.cpp | 2 +- ggml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clip.cpp b/clip.cpp index 6a8db54..e36af2d 100644 --- a/clip.cpp +++ b/clip.cpp @@ -11,7 +11,7 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" -// #define CLIP_DEBUG +#define CLIP_DEBUG // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved // after that, remove this and use the mechanism implemented in GGML directly diff --git a/ggml b/ggml index 703c2a6..bc721e7 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 703c2a69bc93bb4ddac684ee956db324476e3f7f +Subproject commit bc721e70f390eae8294fc775fb9936103e503787 From b8d0f4318564c99238e4fffbe054f5ed6b9236a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 6 Jul 2023 14:41:58 +0300 Subject: [PATCH 10/18] Sync ggml --- clip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clip.cpp b/clip.cpp index e36af2d..6a8db54 100644 --- a/clip.cpp +++ b/clip.cpp @@ -11,7 +11,7 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" -#define CLIP_DEBUG +// #define CLIP_DEBUG // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved // after that, remove this and use the mechanism implemented in GGML directly From b1c028974d50f4bcf2b5a378be3e942d908c678b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 6 Jul 2023 14:44:13 +0300 Subject: [PATCH 11/18] Sync ggml --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1d27ed4..9179ba9 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ This repo is aimed at powering useful applications based on such models on compu clip.cpp also has a short startup time compared to large ML frameworks, which makes it suitable for serverless deployments where the cold start is an issue. ## Hot topics -- 06/04/2023: Batch inference support for image encoding. +- 07/04/2023: Batch inference support for image encoding. ## Note about image preprocessing PIL uses a two-pass convolutions-based bicubic interpolation in resizing with antialiasing applied. In Pytorch, antialiasing is optional. It needs some extra attention to implement this preprocessing logic that matches their results numerically. However, I found that linear interpolation is also good enough for both comparison of different embeddings from this implementation and also comparison of an embedding from this implementation and another one from Transformers. So let's use it until we craft a proper bicubic interpolation. 
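The output normalization assembled from ggml_sqr, ggml_sum, ggml_sqrt and ggml_scale in the patches above is a per-image L2 normalization of the projected embeddings. A plain C++ sketch of the same math applied to the copied-out vectors (a hypothetical helper, not part of the repo):

#include <cmath>

// Divide each vec_dim-sized embedding by its Euclidean length so that
// dot products between normalized embeddings become cosine similarities.
static void l2_normalize_batch(float *vecs, int batch_size, int vec_dim)
{
    for (int b = 0; b < batch_size; b++)
    {
        float *v = vecs + b * vec_dim;
        double sum_sq = 0.0;
        for (int i = 0; i < vec_dim; i++)
        {
            sum_sq += (double)v[i] * v[i];
        }
        const float inv_len = 1.0f / (float)sqrt(sum_sq);
        for (int i = 0; i < vec_dim; i++)
        {
            v[i] *= inv_len;
        }
    }
}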
From ee6ceca9e033ee04af6bbe87aa7ee08886d98ccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 7 Jul 2023 17:27:46 +0300 Subject: [PATCH 12/18] add multithreaded batched image preprocessing --- clip.cpp | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- clip.h | 1 + 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/clip.cpp b/clip.cpp index 6a8db54..e464e57 100644 --- a/clip.cpp +++ b/clip.cpp @@ -5,6 +5,8 @@ #include #include #include +#include <pthread.h> + #include "ggml/ggml.h" #include "clip.h" @@ -23,7 +25,7 @@ size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions) case 397: // base if (n_image_positions == 50) // patch size = 32 { - return 32 * mb; + return 8 * mb; } else // patch size = 16 { @@ -54,7 +56,7 @@ size_t get_scr_buf_req_by_size(const size_t n_tensors, const int n_positions) case 397: if (n_positions <= 50) { - return 512 * mb; + return 16 * mb; } else { @@ -252,6 +254,77 @@ bool clip_image_preprocess(const clip_ctx *ctx, const clip_image_u8 *img, clip_i return true; } +// Structure to hold the image data as an input to function to be executed for thread +typedef struct +{ + clip_image_u8 *input; + clip_image_f32 *resized; + const clip_ctx *ctx; +} ImageData; + +// Function to preprocess a single image in a thread +void *preprocess_image(void *arg) +{ + ImageData *imageData = static_cast<ImageData *>(arg); + clip_image_u8 *input = imageData->input; + clip_image_f32 *resized = imageData->resized; + const clip_ctx *ctx = imageData->ctx; + + // Call the original preprocess function on the image + clip_image_preprocess(ctx, input, resized); + + pthread_exit(NULL); +} + +// Function to batch-preprocess multiple images i +void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &img_resized, const int n_threads) +{ + GGML_ASSERT(img_inputs.size() == img_resized.size()); + int num_threads = std::min(n_threads, static_cast<int>(img_inputs.size())); + int i, t; + + // Divide the images among the threads + int images_per_thread = img_inputs.size() / num_threads; + + if (num_threads == 1) + { + // Single-threaded case + for (i = 0; i < img_inputs.size(); i++) + { + clip_image_preprocess(ctx, img_inputs[i], &img_resized[i]); + } + } + else + { + // Multi-threaded case + + std::vector<pthread_t> threads(num_threads); + std::vector<ImageData> imageData(img_inputs.size()); + + for (t = 0; t < num_threads; t++) + { + int start_index = t * images_per_thread; + int end_index = (t == num_threads - 1) ? 
img_inputs.size() : start_index + images_per_thread; + + // Create ImageData for each thread + for (i = start_index; i < end_index; i++) + { + imageData[i].input = img_inputs[i]; + imageData[i].resized = &img_resized[i]; + imageData[i].ctx = ctx; + } + + // Create a thread for each batch of images + pthread_create(&threads[t], NULL, preprocess_image, static_cast<void *>(&imageData[start_index])); + } + + // Wait for all threads to finish + for (t = 0; t < num_threads; t++) + { + pthread_join(threads[t], NULL); + } + } +} struct clip_ctx *clip_model_load(const char *fname, const int verbosity = 1) { diff --git a/clip.h b/clip.h index 9aa8360..8bc0dc7 100644 --- a/clip.h +++ b/clip.h @@ -200,6 +200,7 @@ std::vector<clip_vocab::id> clip_tokenize(const clip_ctx *ctx, const std::string bool clip_image_load_from_file(const std::string &fname, clip_image_u8 &img); bool clip_image_preprocess(const clip_ctx *ctx, const clip_image_u8 *img, clip_image_f32 *res); +void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &img_resized, const int n_threads); bool clip_text_encode( const clip_ctx *ctx, From ef3e0c97fa48ed1da0620259a776966370bc0dd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 7 Jul 2023 17:30:09 +0300 Subject: [PATCH 13/18] add multithreaded batched image preprocessing --- clip.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clip.cpp b/clip.cpp index e464e57..2b8d807 100644 --- a/clip.cpp +++ b/clip.cpp @@ -277,9 +277,9 @@ void *preprocess_image(void *arg) } // Function to batch-preprocess multiple images i -void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &img_resized, const int n_threads) +void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &imgs_resized, const int n_threads) { - GGML_ASSERT(img_inputs.size() == img_resized.size()); + GGML_ASSERT(img_inputs.size() == imgs_resized.size()); int num_threads = std::min(n_threads, static_cast<int>(img_inputs.size())); int i, t; @@ -291,7 +291,7 @@ void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector Date: Wed, 12 Jul 2023 03:08:14 +0300 Subject: [PATCH 14/18] Update batch preprocess function signature --- clip.cpp | 14 +++++++------- clip.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/clip.cpp b/clip.cpp index 926ed28..7448a11 100644 --- a/clip.cpp +++ b/clip.cpp @@ -25,7 +25,7 @@ size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions) case 397: // base if (n_image_positions == 50) // patch size = 32 { - return 8 * mb; + return 12 * mb; } else // patch size = 16 { @@ -56,7 +56,7 @@ size_t get_scr_buf_req_by_size(const size_t n_tensors, const int n_positions) case 397: if (n_positions <= 50) { - return 16 * mb; + return 32 * mb; } else { @@ -257,7 +257,7 @@ bool clip_image_preprocess(const clip_ctx *ctx, const clip_image_u8 *img, clip_i // Structure to hold the image data as an input to function to be executed for thread typedef struct { - clip_image_u8 *input; + const clip_image_u8 *input; clip_image_f32 *resized; const clip_ctx *ctx; } ImageData; @@ -266,7 +266,7 @@ typedef struct void *preprocess_image(void *arg) { ImageData *imageData = static_cast<ImageData *>(arg); - clip_image_u8 *input = imageData->input; + const clip_image_u8 *input = imageData->input; clip_image_f32 *resized = imageData->resized; const clip_ctx *ctx = imageData->ctx; @@ -277,7 +277,7 @@ void *preprocess_image(void *arg) } // Function to 
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 12 Jul 2023 03:08:14 +0300
Subject: [PATCH 14/18] Update batch preprocess function signature

---
 clip.cpp | 14 +++++++-------
 clip.h   |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/clip.cpp b/clip.cpp
index 926ed28..7448a11 100644
--- a/clip.cpp
+++ b/clip.cpp
@@ -25,7 +25,7 @@ size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions)
     case 397: // base
         if (n_image_positions == 50) // patch size = 32
         {
-            return 8 * mb;
+            return 12 * mb;
         }
         else // patch size = 16
         {
@@ -56,7 +56,7 @@ size_t get_scr_buf_req_by_size(const size_t n_tensors, const int n_positions)
     case 397:
         if (n_positions <= 50)
         {
-            return 16 * mb;
+            return 32 * mb;
         }
         else
         {
@@ -257,7 +257,7 @@ bool clip_image_preprocess(const clip_ctx *ctx, const clip_image_u8 *img, clip_i
 // Holds the data needed to preprocess one image on a worker thread
 typedef struct
 {
-    clip_image_u8 *input;
+    const clip_image_u8 *input;
     clip_image_f32 *resized;
     const clip_ctx *ctx;
 } ImageData;
@@ -266,7 +266,7 @@ typedef struct
 void *preprocess_image(void *arg)
 {
     ImageData *imageData = static_cast<ImageData *>(arg);
-    clip_image_u8 *input = imageData->input;
+    const clip_image_u8 *input = imageData->input;
     clip_image_f32 *resized = imageData->resized;
     const clip_ctx *ctx = imageData->ctx;
 
@@ -277,7 +277,7 @@ void *preprocess_image(void *arg)
 }
 
 // Function to batch-preprocess multiple images in parallel
-void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &imgs_resized, const int n_threads)
+void clip_image_batch_preprocess(const clip_ctx *ctx, const int n_threads, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &imgs_resized)
 {
     GGML_ASSERT(img_inputs.size() == imgs_resized.size());
     int num_threads = std::min(n_threads, static_cast<int>(img_inputs.size()));
diff --git a/clip.h b/clip.h
--- a/clip.h
+++ b/clip.h
@@ -200,7 +200,7 @@ std::vector<clip_vocab::id> clip_tokenize(const clip_ctx *ctx, const std::string
 bool clip_image_load_from_file(const std::string &fname, clip_image_u8 &img);
 bool clip_image_preprocess(const clip_ctx *ctx, const clip_image_u8 *img, clip_image_f32 *res);
-void clip_image_batch_preprocess(const clip_ctx *ctx, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &img_resized, const int n_threads);
+void clip_image_batch_preprocess(const clip_ctx *ctx, const int n_threads, const std::vector<clip_image_u8 *> &img_inputs, std::vector<clip_image_f32> &img_resized);
 
 bool clip_text_encode(
     const clip_ctx *ctx,

From 0907b408d16ab6b34bf101bad4f4eaebdefc9c4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 12 Jul 2023 03:09:12 +0300
Subject: [PATCH 15/18] Implement batch inference in benchmark util

---
 CMakeLists.txt      |  2 +-
 tests/benchmark.cpp | 79 +++++++++++++++++++++++----------------------
 2 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 50a6538..959a78b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,7 @@ endif()
 
 # general
 option(CLIP_STATIC "CLIP: static link libraries" OFF)
-option(CLIP_BUILD_TEST "CLIP: build tests" ${CLIP_STANDALONE})
+option(CLIP_BUILD_TESTS "CLIP: build tests" ${CLIP_STANDALONE})
 option(CLIP_BUILD_EXAMPLES "CLIP: build examples" ${CLIP_STANDALONE})
 option(CLIP_BUILD_IMAGE_SEARCH "CLIP: build image-search" OFF)
 option(CLIP_NATIVE "CLIP: enable -march=native flag" ON)
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 0b61073..6e31e29 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -44,7 +44,7 @@ int main(int argc, char **argv)
         return 1;
     }
 
-    fprintf(fout, "%s: %d directories found in %s\n\n", __func__, n_labels, dir_path.c_str());
+    fprintf(fout, "%s: %zu directories found in %s\n\n", __func__, n_labels, dir_path.c_str());
 
     auto ctx = clip_model_load(model_path.c_str(), 2);
     if (!ctx)
@@ -53,14 +53,11 @@ int main(int argc, char **argv)
         return 1;
     }
 
+    const size_t batch_size = 4;
+
     const int vec_dim = ctx->text_model.hparams.projection_dim;
 
-    // allocate memory for text vectors
-    float *txt_vecs = (float *)malloc(n_labels * vec_dim * sizeof(float));
-    if (!txt_vecs)
-    {
-        printf("%s: Could not allocate memory for %d vectors of %d dimensions\n", __func__, n_labels, vec_dim);
-    }
+    float txt_vecs[n_labels * vec_dim];
 
     ggml_time_init();
 
@@ -88,12 +85,13 @@ int main(int argc, char **argv)
     int n_total_items = 0;         // total number of images processed
     float total_acc1_score = 0.0f; // total accuracy at 1 for the entire dataset
     float total_acc5_score = 0.0f; // total accuracy at 5 for the entire dataset
-    float img_vec[vec_dim];
+    float img_vecs[vec_dim * batch_size];
+
     float similarities[n_labels];
     float sorted_scores[n_labels];
     int indices[n_labels];
-    clip_image_u8 img;
-    clip_image_f32 img_res;
+    std::vector<clip_image_u8> img_inputs(batch_size);
+    std::vector<clip_image_f32> imgs_resized(batch_size);
 
     // print table headers
     fprintf(fout, "| class name | acc@1 | acc@5 |\n");
@@ -107,56 +105,59 @@ int main(int argc, char **argv)
         int n_acc1 = 0;
         int n_acc5 = 0;
 
-        int64_t t_start_encode_images = ggml_time_us();
+        size_t n_batched = (entry.second.size() / batch_size) * batch_size;
 
-        for (auto &file_path : entry.second)
+        for (size_t i = 0; i < n_batched; i += batch_size)
         {
-            if (!clip_image_load_from_file(file_path, img))
+            for (size_t ib = i; ib < i + batch_size; ib++)
             {
-                printf("%s: cannot load file from %s\n", __func__, file_path.c_str());
-                return 1;
-            }
+                std::string file_path = entry.second[ib];
 
-            if (!clip_image_preprocess(ctx, &img, &img_res))
-            {
-                printf("%s: cannot preprocess image loaded from %s\n", __func__, file_path.c_str());
-                return 1;
+                if (!clip_image_load_from_file(file_path, img_inputs[ib % batch_size]))
+                {
+                    printf("%s: cannot load file from %s\n", __func__, file_path.c_str());
+                    return 1;
+                }
             }
 
-            clip_image_encode(ctx, 4, img_res, img_vec);
-            for (size_t i = 0; i < n_labels; i++)
-            {
-                similarities[i] = clip_similarity_score(img_vec, txt_vecs + i * vec_dim, vec_dim);
-            }
+            clip_image_batch_preprocess(ctx, 4, img_inputs, imgs_resized);
 
-            softmax_with_sorting(similarities, n_labels, sorted_scores, indices);
-            for (int j = 0; j < 5; j++)
+            clip_image_batch_encode(ctx, 4, imgs_resized, img_vecs);
+
+            for (size_t b = 0; b < batch_size; b++)
             {
-                if (j == 0 && indices[j] == label_idx)
+                for (size_t j = 0; j < n_labels; j++)
                 {
-                    n_acc1 += 1;
-                    n_acc5 += 1;
-                    break;
+                    similarities[j] = clip_similarity_score(img_vecs + b * vec_dim, txt_vecs + j * vec_dim, vec_dim);
                 }
-                else if (indices[j] == label_idx)
+                softmax_with_sorting(similarities, n_labels, sorted_scores, indices);
+
+                for (int k = 0; k < 5; k++)
                 {
-                    n_acc5 += 1;
-                    break;
+                    if (k == 0 && indices[k] == label_idx)
+                    {
+                        n_acc1 += 1;
+                        n_acc5 += 1;
+                        break;
+                    }
+                    else if (indices[k] == label_idx)
+                    {
+                        n_acc5 += 1;
+                        break;
+                    }
                 }
-            }
 
-            n_items += 1;
-            n_total_items += 1;
+                n_items += 1;
+                n_total_items += 1;
+            }
         }
 
         float acc1_score = (float)n_acc1 / n_items;
         float acc5_score = (float)n_acc5 / n_items;
         total_acc1_score += acc1_score;
         total_acc5_score += acc5_score;
-        // printf("%s: acc@1 = %2.4f - acc@5 = %2.4f\n", entry.first.c_str(), acc1_score, acc5_score);
 
         fprintf(fout, "| %-*s ", 20, entry.first.c_str());
         fprintf(fout, "| %2.4f | %2.4f |\n", acc1_score, acc5_score);
-
         label_idx += 1;
     }
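The nested branch the benchmark uses for accuracy bookkeeping is ordinary top-k hit counting: once softmax_with_sorting has ordered the class indices by descending score, the true label counts toward acc@1 only at rank 0 and toward acc@5 anywhere in the first five ranks. A condensed, equivalent sketch with made-up inputs:

// Equivalent formulation of the benchmark's acc@1 / acc@5 counting.
#include <cstdio>

static void count_topk(const int *sorted_indices, int label_idx, int *n_acc1, int *n_acc5)
{
    for (int k = 0; k < 5; k++)
    {
        if (sorted_indices[k] == label_idx)
        {
            if (k == 0)
            {
                *n_acc1 += 1;
            }
            *n_acc5 += 1;
            break;
        }
    }
}

int main()
{
    // hypothetical ranking: the true label 7 comes second,
    // so it contributes to acc@5 but not to acc@1
    const int indices[5] = {3, 7, 0, 1, 4};
    int n_acc1 = 0, n_acc5 = 0;
    count_topk(indices, 7, &n_acc1, &n_acc5);
    printf("acc@1 hits: %d, acc@5 hits: %d\n", n_acc1, n_acc5); // 0 and 1
    return 0;
}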
From 8c09113e154b8f3a589a47d5780a19e4546c227a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 12 Jul 2023 15:29:46 +0300
Subject: [PATCH 16/18] sync ggml

---
 clip.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/clip.cpp b/clip.cpp
index 7448a11..b77cbba 100644
--- a/clip.cpp
+++ b/clip.cpp
@@ -913,7 +913,6 @@ bool clip_text_encode(
 
     struct ggml_context *ctx0 = ggml_init(params);
     struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;
 
     static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), N);
     static void *scr0 = malloc(scr0_size);
@@ -1064,7 +1063,7 @@ bool clip_text_encode(
 
     // run the computation
     ggml_build_forward_expand(&gf, embeddings);
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
     // print
 #ifdef CLIP_DEBUG
@@ -1168,7 +1167,6 @@ bool clip_image_batch_encode(
 
     struct ggml_context *ctx0 = ggml_init(params);
     struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;
 
     static size_t scr0_size = get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions);
     static void *scr0 = malloc(scr0_size);
@@ -1380,7 +1378,7 @@ bool clip_image_batch_encode(
 
     // run the computation
     ggml_build_forward_expand(&gf, output);
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
     // print
 #ifdef CLIP_DEBUG
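The ggml sync above changes how graphs are executed: the per-graph gf.n_threads field is gone, and the thread count is passed when the graph is computed. A minimal standalone illustration of the new entry point, assuming a ggml checkout at or after the submodule commit pinned in the following patch:

// Sketch: compute c = a + b with ggml_graph_compute_with_ctx().
#include <cstdio>
#include "ggml/ggml.h"

int main()
{
    struct ggml_init_params params = {
        .mem_size = 16 * 1024 * 1024,
        .mem_buffer = NULL,
        .no_alloc = false,
    };
    struct ggml_context *ctx0 = ggml_init(params);

    struct ggml_tensor *a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 4);
    struct ggml_tensor *b = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 4);
    ggml_set_f32(a, 2.0f);
    ggml_set_f32(b, 3.0f);

    struct ggml_tensor *c = ggml_add(ctx0, a, b);

    struct ggml_cgraph gf = {};
    ggml_build_forward_expand(&gf, c);         // no gf.n_threads to set anymore
    ggml_graph_compute_with_ctx(ctx0, &gf, 4); // thread count is passed here

    printf("c[0] = %f\n", ggml_get_f32_1d(c, 0)); // expect 5.000000
    ggml_free(ctx0);
    return 0;
}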
From 8ea50bf72716e2268a4dc77854b4300bde50118c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 12 Jul 2023 15:54:12 +0300
Subject: [PATCH 17/18] set n_threads as const

---
 ggml                | 2 +-
 tests/benchmark.cpp | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/ggml b/ggml
index bc721e7..ad18208 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit bc721e70f390eae8294fc775fb9936103e503787
+Subproject commit ad18208ff5c2446ccc2b66c9f3bee6d59c1731f5
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 6e31e29..f23da4c 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -54,6 +54,7 @@ int main(int argc, char **argv)
     }
 
     const size_t batch_size = 4;
+    const size_t n_threads = 4;
 
     const int vec_dim = ctx->text_model.hparams.projection_dim;
 
@@ -70,7 +71,7 @@ int main(int argc, char **argv)
     for (const auto &entry : result)
     {
         auto tokens = clip_tokenize(ctx, entry.first);
-        if (!clip_text_encode(ctx, 4, tokens, txt_vecs + label_idx * vec_dim))
+        if (!clip_text_encode(ctx, n_threads, tokens, txt_vecs + label_idx * vec_dim))
         {
             printf("%s: Could not encode the label at index %d: %s\n", __func__, label_idx, entry.first.c_str());
             return 1;
@@ -120,9 +121,9 @@ int main(int argc, char **argv)
             }
         }
 
-        clip_image_batch_preprocess(ctx, 4, img_inputs, imgs_resized);
+        clip_image_batch_preprocess(ctx, n_threads, img_inputs, imgs_resized);
 
-        clip_image_batch_encode(ctx, 4, imgs_resized, img_vecs);
+        clip_image_batch_encode(ctx, n_threads, imgs_resized, img_vecs);
 
         for (size_t b = 0; b < batch_size; b++)
         {

From 5f4fa2fb48b5a787785876b6087d1016dab829e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 12 Jul 2023 20:52:31 +0300
Subject: [PATCH 18/18] Sync ggml

---
 README.md | 3 ++-
 ggml      | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c378faf..7fc927e 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,8 @@ This repo is aimed at powering useful applications based on such models on compu
 clip.cpp also has a short startup time compared to large ML frameworks, which makes it suitable for serverless deployments where the cold start is an issue.
 
 ## Hot topics
-- 07/04/2023: Batch inference support for image encoding.
+- 07/12/2023: Batch inference support for image encoding.
+- 07/11/2023: Semantic image search [example](examples/image-search/README.md) directly in C++.
 
 ## Note about image preprocessing
 PIL resizes with a two-pass, convolution-based bicubic interpolation and applies antialiasing; in PyTorch, antialiasing is optional. Implementing preprocessing that matches their results numerically takes some extra attention. However, I found that linear interpolation is good enough both for comparing different embeddings from this implementation and for comparing an embedding from this implementation with one from Transformers. So let's use it until we craft a proper bicubic interpolation.
diff --git a/ggml b/ggml
index ad18208..5621652 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit ad18208ff5c2446ccc2b66c9f3bee6d59c1731f5
+Subproject commit 56216523fa1df0c0bc36201dbecd8e0a01668d91
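To make the README's preprocessing note concrete: a bilinear (linear-interpolation) resize over a tightly packed RGB byte buffer, the layout clip_image_u8 uses, can be as small as the sketch below. This illustrates the "linear interpolation is good enough" claim only; it is not the exact kernel in clip.cpp.

// Sketch: bilinear resize of a packed RGB u8 image.
// dst must hold dst_w * dst_h * 3 bytes.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static void resize_bilinear_rgb(const uint8_t *src, int src_w, int src_h,
                                uint8_t *dst, int dst_w, int dst_h)
{
    const float sx = (float)src_w / dst_w;
    const float sy = (float)src_h / dst_h;

    for (int y = 0; y < dst_h; y++)
    {
        float fy = (y + 0.5f) * sy - 0.5f;
        if (fy < 0) fy = 0;
        const int y0 = (int)fy;
        const int y1 = std::min(y0 + 1, src_h - 1);
        const float wy = fy - y0;

        for (int x = 0; x < dst_w; x++)
        {
            float fx = (x + 0.5f) * sx - 0.5f;
            if (fx < 0) fx = 0;
            const int x0 = (int)fx;
            const int x1 = std::min(x0 + 1, src_w - 1);
            const float wx = fx - x0;

            for (int c = 0; c < 3; c++)
            {
                // blend the four neighbours: first along x on both rows, then along y
                const float top = (1 - wx) * src[3 * (y0 * src_w + x0) + c] + wx * src[3 * (y0 * src_w + x1) + c];
                const float bot = (1 - wx) * src[3 * (y1 * src_w + x0) + c] + wx * src[3 * (y1 * src_w + x1) + c];
                dst[3 * (y * dst_w + x) + c] = (uint8_t)((1 - wy) * top + wy * bot + 0.5f);
            }
        }
    }
}

int main()
{
    // shrink a 2x2 black/white checkerboard to 1x1: the result is the average
    const uint8_t src[2 * 2 * 3] = {0, 0, 0, 255, 255, 255, 255, 255, 255, 0, 0, 0};
    uint8_t dst[3];
    resize_bilinear_rgb(src, 2, 2, dst, 1, 1);
    printf("%d %d %d\n", dst[0], dst[1], dst[2]); // expect 128 128 128
    return 0;
}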