From 8f39716d8e1fbfb7024b8762447949e294d1d30e Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 05:40:13 +0100
Subject: [PATCH 1/5] Support for Yi-VL, templating fix for mobileVLM

---
 examples/llava/clip.cpp      | 72 ++++++++++++++++++++++++++++++++----
 examples/llava/llava-cli.cpp | 38 ++++++++++++++++++-
 2 files changed, 101 insertions(+), 9 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 6161fd858c29f..e2ba301bd4bca 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -97,6 +97,7 @@ static std::string format(const char * fmt, ...) {
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_UNKNOWN,
 };
@@ -303,10 +304,18 @@ struct clip_vision_model {
     struct ggml_tensor * projection;
 
     // LLaVA projection
-    struct ggml_tensor * mm_0_w;
-    struct ggml_tensor * mm_0_b;
-    struct ggml_tensor * mm_2_w;
-    struct ggml_tensor * mm_2_b;
+    struct ggml_tensor * mm_0_w = NULL;
+    struct ggml_tensor * mm_0_b = NULL;
+    struct ggml_tensor * mm_2_w = NULL;
+    struct ggml_tensor * mm_2_b = NULL;
+
+    // Yi type models with mlp+normalization projection
+    struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
+    struct ggml_tensor * mm_1_b = NULL;
+    struct ggml_tensor * mm_3_w = NULL;
+    struct ggml_tensor * mm_3_b = NULL;
+    struct ggml_tensor * mm_4_w = NULL;
+    struct ggml_tensor * mm_4_b = NULL;
 
     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w;
@@ -459,6 +468,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     // pre-layernorm
     {
         embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "pre_ln");
 
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
     }
@@ -574,6 +584,27 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+        // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+        // First LayerNorm
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+                              model.mm_1_b);
+
+        // GELU activation
+        embeddings = ggml_gelu(ctx0, embeddings);
+
+        // Second linear layer
+        embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+        // Second LayerNorm
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+                              model.mm_4_b);
 
     } else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         // MobileVLM projector
@@ -815,7 +846,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->proj_type = clip_projector_type_from_string(proj_type);
     }
     else {
+        if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
+            new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+        }
+        else
         new_clip->proj_type = PROJECTOR_TYPE_MLP;
+        }
     }
@@ -965,11 +1001,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
 
         // LLaVA projection
-        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
            vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
            vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-            vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-            vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+            try {
+                // Yi-type llava
+                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
+                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
+            } catch (std::runtime_error & e) { }
+            try {
+                // missing in Yi-type llava
+                vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+                vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+            } catch (std::runtime_error & e) { }
+            try {
+                // Yi-type llava
+                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
+                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
+            } catch (std::runtime_error & e) { }
+            try {
+                // Yi-type llava
+                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
+                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
+            } catch (std::runtime_error & e) { }
         } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
             // MobileVLM projection
@@ -1442,6 +1496,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     } else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        return ctx->vision_model.mm_3_b->ne[0];
     } else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
@@ -1460,4 +1516,4 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
-}
+}
\ No newline at end of file
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index d94795fe317d4..8012d3f59d4c8 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -148,10 +148,45 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
+    #if 0
     // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
     eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
     eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+    #else
+    std::string system_prompt, user_prompt;
+    size_t image_pos = prompt.find("<image>");
+    if (image_pos != std::string::npos) {
+        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
+        size_t pos = 0;
+        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+            user_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+            system_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+
+        printf("system_prompt: %s\n", system_prompt.c_str());
+        printf("user_prompt: %s\n", user_prompt.c_str());
+    } else {
+        // llava-1.5 native mode
+        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+        user_prompt = prompt + "\nASSISTANT:";
+    }
+
+
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
+    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+
+    #endif
 
     // generate the response
@@ -162,6 +197,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
 
     for (int i = 0; i < max_tgt_len; i++) {
         const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
         if (strcmp(tmp, "</s>") == 0) break;
+        if (strstr(tmp, "###")) break; // Yi-VL behavior
         printf("%s", tmp);
         fflush(stdout);
@@ -240,7 +276,7 @@ int main(int argc, char ** argv) {
     if (ctx_llava == NULL) {
         fprintf(stderr, "%s: error: failed to init llava\n", __func__);
         return 1;
-    }
+    }
 
     auto image_embed = load_image(ctx_llava, &params);
     if (!image_embed) {

From ffcd8e7fbda174475693e56e383a8f35c30caba8 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 05:52:12 +0100
Subject: [PATCH 2/5] ws

---
 examples/llava/clip.cpp      | 4 ++--
 examples/llava/llava-cli.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e2ba301bd4bca..3f8baade061cc 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -584,7 +584,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
+
     } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
         // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -1516,4 +1516,4 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
-}
\ No newline at end of file
+}
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 8012d3f59d4c8..149fbb23093d3 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -276,7 +276,7 @@ int main(int argc, char ** argv) {
     if (ctx_llava == NULL) {
         fprintf(stderr, "%s: error: failed to init llava\n", __func__);
         return 1;
-    }
+    }
 
     auto image_embed = load_image(ctx_llava, &params);
     if (!image_embed) {

From 51462f1f2343f1640082528a2f5e66357b39ccc4 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 15:26:26 +0100
Subject: [PATCH 3/5] Update examples/llava/clip.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/llava/clip.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3f8baade061cc..47079e06af56e 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -848,10 +848,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     else {
         if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
             new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+        } else {
+            new_clip->proj_type = PROJECTOR_TYPE_MLP;
         }
-        else
-        new_clip->proj_type = PROJECTOR_TYPE_MLP;
-        }
     }

From 0dbd295e39149be17a50bdfd4bf04fafb1bcd0a8 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 15:28:16 +0100
Subject: [PATCH 4/5] Update llava-cli.cpp

---
 examples/llava/llava-cli.cpp | 62 +++++++++++++++----------------------
 1 file changed, 26 insertions(+), 36 deletions(-)

diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 149fbb23093d3..6ac70ba69e281 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -148,45 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
-    #if 0
-    // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
-    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
-    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
-    #else
-    std::string system_prompt, user_prompt;
-    size_t image_pos = prompt.find("<image>");
-    if (image_pos != std::string::npos) {
-        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
-
-        system_prompt = prompt.substr(0, image_pos);
-        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
-        size_t pos = 0;
-        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
-            user_prompt.replace(pos, 2, "\n");
-            pos += 1; // Advance past the replaced newline
-        }
-        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
-            system_prompt.replace(pos, 2, "\n");
-            pos += 1; // Advance past the replaced newline
-        }
-
-        printf("system_prompt: %s\n", system_prompt.c_str());
-        printf("user_prompt: %s\n", user_prompt.c_str());
-    } else {
-        // llava-1.5 native mode
-        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
-        user_prompt = prompt + "\nASSISTANT:";
+    std::string system_prompt, user_prompt;
+    size_t image_pos = prompt.find("<image>");
+    if (image_pos != std::string::npos) {
+        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
+        size_t pos = 0;
+        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+            user_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+            system_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
         }
+        printf("system_prompt: %s\n", system_prompt.c_str());
+        printf("user_prompt: %s\n", user_prompt.c_str());
+    } else {
+        // llava-1.5 native mode
+        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+        user_prompt = prompt + "\nASSISTANT:";
+    }
-
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
-    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
-
-    #endif
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
+    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
 
     // generate the response

From fccab82f5e6f07d3f881e0a049e2b80653eec162 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Wed, 24 Jan 2024 01:43:00 +0100
Subject: [PATCH 5/5] Update clip.cpp

bugfix for new conversions
---
 examples/llava/clip.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 47079e06af56e..3774dcc070a6b 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -846,10 +846,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->proj_type = clip_projector_type_from_string(proj_type);
     }
     else {
+        new_clip->proj_type = PROJECTOR_TYPE_MLP;
+    }
+    if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
         if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
             new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
-        } else {
-            new_clip->proj_type = PROJECTOR_TYPE_MLP;
         }
     }
 }
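
Usage note (illustrative sketch, not part of the patch series): the <image> placeholder mode added in PATCH 1 and cleaned up in PATCH 4 lets the caller pass the complete chat template on the command line. The invocation below assumes a converted Yi-VL GGUF model and mmproj file under models/yi-vl-6b/ (hypothetical paths) and abbreviates the Yi-VL system text; the -m/--mmproj/--image/-p/-e flags and the "###" stop check come from the patched code:

    ./llava-cli -m models/yi-vl-6b/ggml-model-q5_k.gguf \
        --mmproj models/yi-vl-6b/mmproj-model-f16.gguf \
        --image ./example.jpg -e \
        -p "This is a chat between an inquisitive human and an AI assistant.\n### Human: <image>\nDescribe the image in detail.\n### Assistant:"

Everything before <image> is evaluated as the system prompt with add_bos, the image embedding is evaluated in its place, and the remainder becomes the user prompt; without -e the "\\n" replacement in the patch turns literal "\n" sequences into real newlines, and generation stops at "</s>" or at "###", matching Yi-VL's turn separator.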