From 8f39716d8e1fbfb7024b8762447949e294d1d30e Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 05:40:13 +0100
Subject: [PATCH 1/5] Support for Yi-VL, templating fix for mobileVLM

---
 examples/llava/clip.cpp      | 72 ++++++++++++++++++++++++++++++++----
 examples/llava/llava-cli.cpp | 38 ++++++++++++++++++-
 2 files changed, 101 insertions(+), 9 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 6161fd858c29f..e2ba301bd4bca 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -97,6 +97,7 @@ static std::string format(const char * fmt, ...) {
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_UNKNOWN,
 };
@@ -303,10 +304,18 @@ struct clip_vision_model {
     struct ggml_tensor * projection;
 
     // LLaVA projection
-    struct ggml_tensor * mm_0_w;
-    struct ggml_tensor * mm_0_b;
-    struct ggml_tensor * mm_2_w;
-    struct ggml_tensor * mm_2_b;
+    struct ggml_tensor * mm_0_w = NULL;
+    struct ggml_tensor * mm_0_b = NULL;
+    struct ggml_tensor * mm_2_w = NULL;
+    struct ggml_tensor * mm_2_b = NULL;
+
+    // Yi type models with mlp+normalization projection
+    struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
+    struct ggml_tensor * mm_1_b = NULL;
+    struct ggml_tensor * mm_3_w = NULL;
+    struct ggml_tensor * mm_3_b = NULL;
+    struct ggml_tensor * mm_4_w = NULL;
+    struct ggml_tensor * mm_4_b = NULL;
 
     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w;
@@ -459,6 +468,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     // pre-layernorm
     {
         embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "pre_ln");
 
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
     }
@@ -574,6 +584,27 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+        // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+        // First LayerNorm
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+                              model.mm_1_b);
+
+        // GELU activation
+        embeddings = ggml_gelu(ctx0, embeddings);
+
+        // Second linear layer
+        embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+        // Second LayerNorm
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+                              model.mm_4_b);
 
     } else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         // MobileVLM projector
@@ -815,7 +846,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->proj_type = clip_projector_type_from_string(proj_type);
     }
     else {
+        if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
+            new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+        }
+        else
         new_clip->proj_type = PROJECTOR_TYPE_MLP;
+        }
     }
@@ -965,11 +1001,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
 
         // LLaVA projection
-        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
            vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
            vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-            vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-            vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+            try {
+                // Yi-type llava
+                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
+                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
+            } catch (std::runtime_error & e) { }
+            try {
+                // missing in Yi-type llava
+                vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+                vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+            } catch (std::runtime_error & e) { }
+            try {
+                // Yi-type llava
+                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
+                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
+            } catch (std::runtime_error & e) { }
+            try {
+                // Yi-type llava
+                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
+                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
+            } catch (std::runtime_error & e) { }
         } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
             // MobileVLM projection
@@ -1442,6 +1496,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     } else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        return ctx->vision_model.mm_3_b->ne[0];
     } else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
@@ -1460,4 +1516,4 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
-}
+}
\ No newline at end of file
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index d94795fe317d4..8012d3f59d4c8 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -148,10 +148,45 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
+    #if 0
     // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
     eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
     eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+    #else
+    std::string system_prompt, user_prompt;
+    size_t image_pos = prompt.find("<image>");
+    if (image_pos != std::string::npos) {
+        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
+        size_t pos = 0;
+        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+            user_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+            system_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+
+        printf("system_prompt: %s\n", system_prompt.c_str());
+        printf("user_prompt: %s\n", user_prompt.c_str());
+    } else {
+        // llava-1.5 native mode
+        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+        user_prompt = prompt + "\nASSISTANT:";
+    }
+
+
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
+    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+
+    #endif
 
     // generate the response
@@ -162,6 +197,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
 
     for (int i = 0; i < max_tgt_len; i++) {
         const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
         if (strcmp(tmp, "</s>") == 0) break;
+        if (strstr(tmp, "###")) break; // Yi-VL behavior
         printf("%s", tmp);
         fflush(stdout);
@@ -240,7 +276,7 @@ int main(int argc, char ** argv) {
     if (ctx_llava == NULL) {
         fprintf(stderr, "%s: error: failed to init llava\n", __func__);
         return 1;
-    }
+    }
 
     auto image_embed = load_image(ctx_llava, &params);
     if (!image_embed) {

From ffcd8e7fbda174475693e56e383a8f35c30caba8 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 05:52:12 +0100
Subject: [PATCH 2/5] ws

---
 examples/llava/clip.cpp      | 4 ++--
 examples/llava/llava-cli.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e2ba301bd4bca..3f8baade061cc 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -584,7 +584,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
+
     } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
         // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -1516,4 +1516,4 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
-}
\ No newline at end of file
+}
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 8012d3f59d4c8..149fbb23093d3 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -276,7 +276,7 @@ int main(int argc, char ** argv) {
     if (ctx_llava == NULL) {
         fprintf(stderr, "%s: error: failed to init llava\n", __func__);
         return 1;
-    }
+    }
 
     auto image_embed = load_image(ctx_llava, &params);
     if (!image_embed) {

From 51462f1f2343f1640082528a2f5e66357b39ccc4 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 15:26:26 +0100
Subject: [PATCH 3/5] Update examples/llava/clip.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/llava/clip.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3f8baade061cc..47079e06af56e 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -848,10 +848,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     else {
         if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
             new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+        } else {
+            new_clip->proj_type = PROJECTOR_TYPE_MLP;
         }
-        else
-        new_clip->proj_type = PROJECTOR_TYPE_MLP;
-        }
     }

From 0dbd295e39149be17a50bdfd4bf04fafb1bcd0a8 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 15:28:16 +0100
Subject: [PATCH 4/5] Update llava-cli.cpp

---
 examples/llava/llava-cli.cpp | 62 +++++++++++++++----------------------
 1 file changed, 26 insertions(+), 36 deletions(-)

diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 149fbb23093d3..6ac70ba69e281 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -148,45 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
-    #if 0
-    // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
-    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
-    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
-    #else
-    std::string system_prompt, user_prompt;
-    size_t image_pos = prompt.find("<image>");
-    if (image_pos != std::string::npos) {
-        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
-
-        system_prompt = prompt.substr(0, image_pos);
-        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
-        size_t pos = 0;
-        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
-            user_prompt.replace(pos, 2, "\n");
-            pos += 1; // Advance past the replaced newline
-        }
-        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
-            system_prompt.replace(pos, 2, "\n");
-            pos += 1; // Advance past the replaced newline
-        }
-
-        printf("system_prompt: %s\n", system_prompt.c_str());
-        printf("user_prompt: %s\n", user_prompt.c_str());
-    } else {
-        // llava-1.5 native mode
-        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
-        user_prompt = prompt + "\nASSISTANT:";
+    std::string system_prompt, user_prompt;
+    size_t image_pos = prompt.find("<image>");
+    if (image_pos != std::string::npos) {
+        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
+        size_t pos = 0;
+        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+            user_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+            system_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
         }
+        printf("system_prompt: %s\n", system_prompt.c_str());
+        printf("user_prompt: %s\n", user_prompt.c_str());
+    } else {
+        // llava-1.5 native mode
+        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+        user_prompt = prompt + "\nASSISTANT:";
+    }
-
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
-    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
-
-    #endif
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
+    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
 
     // generate the response

From fccab82f5e6f07d3f881e0a049e2b80653eec162 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Wed, 24 Jan 2024 01:43:00 +0100
Subject: [PATCH 5/5] Update clip.cpp

bugfix for new conversions
---
 examples/llava/clip.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 47079e06af56e..3774dcc070a6b 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -846,10 +846,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->proj_type = clip_projector_type_from_string(proj_type);
     }
     else {
+        new_clip->proj_type = PROJECTOR_TYPE_MLP;
+    }
+    if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
         if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
             new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
-        } else {
-            new_clip->proj_type = PROJECTOR_TYPE_MLP;
         }
     }
 }
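
Usage note (illustrative sketch, not part of the patch series): the <image> placeholder mode added in PATCH 1 and cleaned up in PATCH 4 lets the caller pass the complete chat template on the command line. The invocation below assumes a converted Yi-VL GGUF model and mmproj file under models/yi-vl-6b/ (hypothetical paths) and abbreviates the Yi-VL system text; the -m/--mmproj/--image/-p/-e flags and the "###" stop check come from the patched code:

    ./llava-cli -m models/yi-vl-6b/ggml-model-q5_k.gguf \
        --mmproj models/yi-vl-6b/mmproj-model-f16.gguf \
        --image ./example.jpg -e \
        -p "This is a chat between an inquisitive human and an AI assistant.\n### Human: <image>\nDescribe the image in detail.\n### Assistant:"

Everything before <image> is evaluated as the system prompt with add_bos, the image embedding is evaluated in its place, and the remainder becomes the user prompt; without -e the "\\n" replacement in the patch turns literal "\n" sequences into real newlines, and generation stops at "</s>" or at "###", matching Yi-VL's turn separator.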