From 8f0350578d14047d33304dd9aecde07fd3f355c7 Mon Sep 17 00:00:00 2001
From: caitianchi <caitianchi@modelbest.cn>
Date: Tue, 25 Jun 2024 18:51:06 +0800
Subject: [PATCH] fix quality problem in pr code

---
 .gitignore               |  1 +
 Makefile                 |  2 +-
 examples/llava/clip.cpp  |  4 ++--
 examples/llava/llava.cpp | 21 ++++++++++++++++++++-
 examples/llava/llava.h   |  1 +
 5 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6c3a0abf09264..409c0659323f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -117,6 +117,7 @@ poetry.toml
 /tests/test-tokenizer-0
 /tests/test-tokenizer-1-bpe
 /tests/test-tokenizer-1-spm
+/openbmb
 
 # Scripts
 !/scripts/install-oneapi.bat
diff --git a/Makefile b/Makefile
index 2ffee6338d6ac..1be0154ee9733 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ BUILD_TARGETS = \
 	llama-imatrix \
 	llama-infill \
 	llama-llava-cli \
-  llama-minicpmv-cli\
+    llama-minicpmv-cli\
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 9353f5a02283f..5b6f7aef31917 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -647,7 +647,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // loop over layers
-    for (int il = 0; il < n_layer - 1; il++) {
+    for (int il = 0; il < n_layer; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
 
         //const size_t nb_q_w = model.layers[il].q_w->nb[0];
@@ -2077,7 +2077,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // build the inference graph
-    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, load_image_size);
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
 
     // set inputs
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 0d7324037833c..93a5b0ea4f424 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -657,7 +657,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
         for (size_t j = 0; j < imgs[i].size(); ++j) {
             float* image_embed = NULL;
             int n_image_pos = 0;
-            bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos);
+            bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos);
             if (!image_embed_result) {
                 LOG_TEE("%s: coulnd't embed the image\n", __func__);
                 return NULL;
@@ -672,6 +672,25 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
     return results;
 }
 
+bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
+    // Encode img via encode_image_with_clip_uhd; on success the malloc'd buffer is handed to the caller through image_embd_out.
+    float * embd_buf = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
+    if (embd_buf == NULL) {
+        LOG_TEE("Unable to allocate memory for image embeddings\n");
+        return false;
+    }
+    int n_pos;
+    const bool encoded = encode_image_with_clip_uhd(ctx_clip, n_threads, img, embd_buf, &n_pos);
+    if (!encoded) {
+        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
+        free(embd_buf); // the output pointers are left untouched on failure
+        return false;
+    }
+    *image_embd_out = embd_buf;
+    *n_img_pos_out = n_pos;
+    return true;
+}
+
 bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
     auto embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img);
     auto image_embed_slices = embeds->image_embeds;
diff --git a/examples/llava/llava.h b/examples/llava/llava.h
index 420ae15d641f2..5f29f02c55a66 100644
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -47,6 +47,7 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** build an image embed from image file bytes */
 LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
 /** build an image embed from a path to an image filename */
+LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
 LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
 LLAVA_API struct uhd_image_embed *  llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
 LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed *  embed);