Skip to content

Upgrade llama.cpp to b3265, support gemma2, remove beam parameter (https://github.com/ggerganov/llama.cpp/pull/7985) #71

Merged: 24 commits (Aug 7, 2024)
18 changes: 14 additions & 4 deletions .github/workflows/ci.yml
@@ -22,11 +22,16 @@ jobs:
# cmake should figure out OS and ARCH automatically when running build.sh (but we need mvn compile for it)
run: |
mvn compile
.github/build.sh
.github/build.sh -DLLAMA_VERBOSE=ON
- name: Download model
run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
- name: Run tests
run: mvn test
- if: failure()
uses: actions/upload-artifact@v3
with:
path: ${{ github.workspace }}/hs_err_pid*.log
if-no-files-found: warn

build-and-test-macos:
name: ${{ matrix.target.runner }}
@@ -37,11 +42,11 @@
target:
- {
runner: macos-13,
cmake: '-DLLAMA_METAL=OFF'
cmake: '-DLLAMA_METAL=OFF -DLLAMA_VERBOSE=ON'
}
- {
runner: macos-14,
cmake: '-DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_METAL=OFF'
cmake: '-DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_METAL=OFF -DLLAMA_VERBOSE=ON'
}
steps:
- uses: actions/checkout@v4
@@ -70,8 +75,13 @@ jobs:
- name: Build libraries
run: |
mvn compile
.github\build.bat
.github\build.bat -DLLAMA_VERBOSE=ON
- name: Download model
run: curl -L $env:MODEL_URL --create-dirs -o models/$env:MODEL_NAME
- name: Run tests
run: mvn test
- if: failure()
uses: actions/upload-artifact@v3
with:
path: ${{ github.workspace }}\hs_err_pid*.log
if-no-files-found: warn
37 changes: 18 additions & 19 deletions .github/workflows/release.yaml
@@ -137,25 +137,24 @@ jobs:
- name: Run tests
run: mvn test

# disabled for now, we don't have access to a macos arm64 runner and testing on x86_64 doesn't work
# test-macos:
# name: Test Mac
# needs: build-macos-native
# runs-on: macos-latest
# steps:
# - uses: actions/checkout@v4
# - uses: actions/download-artifact@v3
# with:
# name: artifacts
# path: ${{ github.workspace }}/src/main/resources/de/kherud/llama/
# - name: Download model
# run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
# - uses: actions/setup-java@v4
# with:
# distribution: 'zulu'
# java-version: '11'
# - name: Run tests
# run: mvn test
test-macos:
name: Test Mac
needs: build-macos-native
runs-on: macos-14
steps:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v3
with:
name: artifacts
path: ${{ github.workspace }}/src/main/resources/de/kherud/llama/
- name: Download model
run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
- uses: actions/setup-java@v4
with:
distribution: 'zulu'
java-version: '11'
- name: Run tests
run: mvn test


test-windows:
8 changes: 4 additions & 4 deletions CMakeLists.txt
@@ -5,7 +5,6 @@ project(jllama CXX)
include(FetchContent)

set(BUILD_SHARED_LIBS ON)
set(LLAMA_STATIC OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

option(LLAMA_VERBOSE "llama: verbose output" OFF)
@@ -24,7 +23,7 @@ FetchContent_MakeAvailable(json)
FetchContent_Declare(
llama.cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
GIT_TAG b3008
GIT_TAG b3534
)
FetchContent_MakeAvailable(llama.cpp)

@@ -98,11 +97,12 @@ target_compile_definitions(jllama PRIVATE
)

if(OS_NAME STREQUAL "Windows")
set_target_properties(jllama llama PROPERTIES
set_target_properties(jllama llama ggml PROPERTIES
RUNTIME_OUTPUT_DIRECTORY_DEBUG ${JLLAMA_DIR}
RUNTIME_OUTPUT_DIRECTORY_RELEASE ${JLLAMA_DIR}
)
else()
set_target_properties(jllama llama PROPERTIES
set_target_properties(jllama llama ggml PROPERTIES
LIBRARY_OUTPUT_DIRECTORY ${JLLAMA_DIR}
)
endif()
90 changes: 30 additions & 60 deletions README.md
@@ -3,8 +3,7 @@

# Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)

The main goal of llama.cpp is to run the LLaMA model using 4-bit integer quantization on a MacBook.
This repository provides Java bindings for the C++ library.
Inference of Meta's LLaMA model (and others) in pure C/C++.

**You are welcome to contribute**

@@ -32,97 +31,68 @@ Access this library via Maven:
</dependency>
```

There are multiple [examples](src/test/java/examples):
There are multiple [examples](src/test/java/examples).

### No Setup required

We support CPU inference for the following platforms out of the box:

- Linux x86-64, aarch64
- MacOS x86-64, aarch64 (M1)
- MacOS x86-64, aarch64 (M-series)
- Windows x86-64, x64, arm (32 bit)

If any of these match your platform, you can include the Maven dependency and get started.

### Setup required

If none of the above listed platforms matches yours, currently you have to compile the library yourself (also if you
want GPU acceleration, see below).
want GPU acceleration).

This requires:
This consists of two steps: 1) Compiling the libraries and 2) putting them in the right location.

- Git
- A C++11 conforming compiler
- The [cmake](https://www.cmake.org/) build system
- Java, Maven, and setting [JAVA_HOME](https://www.baeldung.com/java-home-on-windows-7-8-10-mac-os-x-linux)
##### Library Compilation

Make sure everything works by running

```
g++ -v # depending on your compiler
java -version
mvn -v
echo $JAVA_HOME # for linux/macos
echo %JAVA_HOME% # for windows
```

Then, checkout [llama.cpp](https://github.com/ggerganov/llama.cpp) to know which build arguments to use (e.g. for CUDA support).
Finally, you have to run following commands in the directory of this repository (java-llama.cpp).
Remember to add your build arguments in the fourth line (`cmake ..`):
First, have a look at [llama.cpp](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to know which build arguments to use (e.g. for CUDA support).
Any build option of llama.cpp works equivalently for this project.
You then have to run the following commands in the directory of this repository (java-llama.cpp):

```shell
mvn compile
mkdir build
cd build
cmake .. # add any other arguments for your backend
cmake --build . --config Release
mvn compile # don't forget this line
cmake -B build # add any other arguments for your backend, e.g. -DGGML_CUDA=ON
cmake --build build --config Release
```

> [!TIP]
> Use `-DLLAMA_CURL=ON` to download models via Java code using `ModelParameters#setModelUrl(String)`.
> Use `-DGGML_CURL=ON` to download models via Java code using `ModelParameters#setModelUrl(String)`.
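
For illustration only, here is a minimal sketch of how downloading via `ModelParameters#setModelUrl(String)` might look from Java. The URL is a placeholder, and the `LlamaModel` constructor and `close()` usage are assumptions about the bindings rather than something this PR specifies; it also only works if the native libraries were built with `-DGGML_CURL=ON`.

```java
import de.kherud.llama.LlamaModel;
import de.kherud.llama.ModelParameters;

public class ModelUrlSketch {
    public static void main(String[] args) {
        // Placeholder URL: any reachable GGUF file would do.
        ModelParameters params = new ModelParameters();
        params.setModelUrl("https://example.com/models/some-model.gguf");

        // Assumes LlamaModel accepts ModelParameters and frees native
        // resources via close(); requires a -DGGML_CURL=ON build.
        LlamaModel model = new LlamaModel(params);
        // ... run inference ...
        model.close();
    }
}
```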

All required files will be put in a resources directory matching your platform, which will appear in the cmake output. For example something like:
All compiled libraries will be put in a resources directory matching your platform, which will appear in the cmake output. For example, something like:

```shell
-- Installing files to /java-llama.cpp/src/main/resources/de/kherud/llama/Linux/x86_64
```

This includes:

- Linux: `libllama.so`, `libjllama.so`
- MacOS: `libllama.dylib`, `libjllama.dylib`, `ggml-metal.metal`
- Windows: `llama.dll`, `jllama.dll`

If you then compile your own JAR from this directory, you are ready to go. Otherwise, if you still want to use the library
as a Maven dependency, see below how to set the necessary paths in order for Java to find your compiled libraries.
#### Library Location

### Custom llama.cpp Setup (GPU acceleration)
This project has to load three shared libraries:

This repository provides default support for CPU based inference. You can compile `llama.cpp` any way you want, however (see [Setup Required](#setup-required)).
In order to use your self-compiled library, set either of the [JVM options](https://www.jetbrains.com/help/idea/tuning-the-ide.html#configure-jvm-options):
- ggml
- llama
- jllama

- `de.kherud.llama.lib.path`, for example `-Dde.kherud.llama.lib.path=/directory/containing/lib`
- `java.library.path`, for example `-Djava.library.path=/directory/containing/lib`
Note that the file names vary between operating systems, e.g., `ggml.dll` on Windows, `libggml.so` on Linux, and `libggml.dylib` on macOS.
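
As a quick illustration of that mapping, the standard `System.mapLibraryName` API produces exactly these platform-specific file names:

```java
public class LibraryNames {
    public static void main(String[] args) {
        // Prints ggml.dll / llama.dll / jllama.dll on Windows,
        // libggml.so / libllama.so / libjllama.so on Linux,
        // and libggml.dylib / libllama.dylib / libjllama.dylib on macOS.
        System.out.println(System.mapLibraryName("ggml"));
        System.out.println(System.mapLibraryName("llama"));
        System.out.println(System.mapLibraryName("jllama"));
    }
}
```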

This repository uses [`System#mapLibraryName`](https://docs.oracle.com/javase%2F7%2Fdocs%2Fapi%2F%2F/java/lang/System.html) to determine the name of the shared library for your platform.
If for any reason your library has a different name, you can set it with

- `de.kherud.llama.lib.name`, for example `-Dde.kherud.llama.lib.name=myname.so`

For compiling `llama.cpp`, refer to the official [readme](https://github.com/ggerganov/llama.cpp#build) for details.
The library can be built with the `llama.cpp` project:

```shell
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=ON # add any other arguments for your backend
cmake --build . --config Release
```
The application will search the following locations, in the following order:

Look for the shared library in `build`.
- In **de.kherud.llama.lib.path**: Use this option if you want a custom location for your shared libraries, i.e., set VM option `-Dde.kherud.llama.lib.path=/path/to/directory`.
- In **java.library.path**: These are predefined locations for each OS, e.g., `/usr/java/packages/lib:/usr/lib64:/lib64:/lib:/usr/lib` on Linux.
You can find out the locations using `System.out.println(System.getProperty("java.library.path"))`.
Use this option if you want to install the shared libraries as system libraries.
- From the **JAR**: If any of the libraries weren't found yet, the application will try to use a prebuilt shared library.
This of course only works for the [supported platforms](#no-setup-required).

> [!IMPORTANT]
> If you are running MacOS with Metal, you have to put the file `ggml-metal.metal` from `build/bin` in the same directory as the shared library.
Not all libraries have to be in the same location.
For example, if you already have a llama.cpp and ggml build, you can install them as system libraries and rely on the jllama library from the JAR.
This way, you don't have to compile anything.
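
For illustration, a minimal sketch of inspecting and overriding that search order from Java. The property names come from the list above; the directory path is a placeholder, and the assumption is that setting the property programmatically only takes effect if it happens before the first `de.kherud.llama` class triggers library loading.

```java
public class LibraryPathSketch {
    public static void main(String[] args) {
        // The default OS-specific search path used in the second step above.
        System.out.println(System.getProperty("java.library.path"));

        // Placeholder directory; equivalent to launching the JVM with
        // -Dde.kherud.llama.lib.path=/path/to/directory. Must run before
        // any de.kherud.llama class is loaded to have an effect.
        System.setProperty("de.kherud.llama.lib.path", "/path/to/directory");
    }
}
```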

## Documentation

2 changes: 1 addition & 1 deletion pom.xml
@@ -4,7 +4,7 @@

<groupId>de.kherud</groupId>
<artifactId>llama</artifactId>
<version>3.2.1</version>
<version>3.3.0</version>
<packaging>jar</packaging>

<name>${project.groupId}:${project.artifactId}</name>
46 changes: 26 additions & 20 deletions src/main/cpp/jllama.cpp
@@ -355,13 +355,12 @@ JNIEXPORT void JNICALL JNI_OnUnload(JavaVM *vm, void *reserved)
JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jobject obj, jstring jparams)
{
gpt_params params;
server_params sparams;

auto *ctx_server = new server_context();

std::string c_params = parse_jstring(env, jparams);
json json_params = json::parse(c_params);
server_params_parse(json_params, sparams, params);
server_params_parse(json_params, params);

if (json_value(json_params, "disable_log", false))
{
@@ -372,9 +371,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
log_enable();
}

if (!sparams.system_prompt.empty())
if (!params.system_prompt.empty())
{
ctx_server->system_prompt_set(sparams.system_prompt);
ctx_server->system_prompt_set(params.system_prompt);
}

if (params.model_alias == "unknown")
@@ -395,6 +394,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo

std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};

// Necessary similarity of prompt for slot selection
ctx_server->slot_prompt_similarity = params.slot_prompt_similarity;

// load the model
if (!ctx_server->load_model(params))
{
@@ -411,32 +413,36 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
const auto model_meta = ctx_server->model_meta();

// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (sparams.chat_template.empty())
if (params.chat_template.empty())
{
if (!ctx_server->validate_model_chat_template())
{
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This "
"may cause the model to output suboptimal responses",
{});
sparams.chat_template = "chatml";
params.chat_template = "chatml";
}
}
ctx_server->chat_template = sparams.chat_template;

// print sample chat example to make it clear which template is used
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty())
{
json chat;
chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
chat.push_back({{"role", "user"}, {"content", "Hello"}});
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
chat.push_back({{"role", "user"}, {"content", "How are you?"}});

const std::string chat_example = format_chat(ctx_server->model, sparams.chat_template, chat);
if (!ctx_server->validate_model_chat_template())
{
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This "
"may cause the model to output suboptimal responses",
{});
params.chat_template = "chatml";
}
}

LOG_INFO("chat template", {
{"chat_example", chat_example},
{"built_in", sparams.chat_template.empty()},
});
// print sample chat example to make it clear which template is used
{
LOG_INFO("chat template",
{
{"chat_example", llama_chat_format_example(ctx_server->model, params.chat_template)},
{"built_in", params.chat_template.empty()},
});
}

ctx_server->queue_tasks.on_new_task(
@@ -480,7 +486,7 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv
json chat;
chat.push_back({{"role", "system"}, {"content", ctx_server->system_prompt}});
chat.push_back({{"role", "user"}, {"content", json_params["prompt"]}});
json_params["prompt"] = format_chat(ctx_server->model, ctx_server->chat_template, chat);
json_params["prompt"] = format_chat(ctx_server->model, ctx_server->params.chat_template, chat);
}

const int id_task = ctx_server->queue_tasks.get_new_id();