Skip to content

Upgrade llama.cpp to b3265, support gemma2, remove beam parameter (https://github.com/ggerganov/llama.cpp/pull/7985) #71

Merged: 24 commits (Aug 7, 2024)
18 changes: 14 additions & 4 deletions .github/workflows/ci.yml
@@ -22,11 +22,16 @@ jobs:
# cmake should figure out OS and ARCH automatically when running build.sh (but we need mvn compile for it)
run: |
mvn compile
.github/build.sh
.github/build.sh -DLLAMA_VERBOSE=ON
- name: Download model
run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
- name: Run tests
run: mvn test
- if: failure()
uses: actions/upload-artifact@v3
with:
path: ${{ github.workspace }}/hs_err_pid*.log
if-no-files-found: warn

build-and-test-macos:
name: ${{ matrix.target.runner }}
@@ -37,11 +42,11 @@
target:
- {
runner: macos-13,
cmake: '-DLLAMA_METAL=OFF'
cmake: '-DLLAMA_METAL=OFF -DLLAMA_VERBOSE=ON'
}
- {
runner: macos-14,
cmake: '-DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_METAL=OFF'
cmake: '-DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_METAL=OFF -DLLAMA_VERBOSE=ON'
}
steps:
- uses: actions/checkout@v4
@@ -70,8 +75,13 @@ jobs:
- name: Build libraries
run: |
mvn compile
.github\build.bat
.github\build.bat -DLLAMA_VERBOSE=ON
- name: Download model
run: curl -L $env:MODEL_URL --create-dirs -o models/$env:MODEL_NAME
- name: Run tests
run: mvn test
- if: failure()
uses: actions/upload-artifact@v3
with:
path: ${{ github.workspace }}\hs_err_pid*.log
if-no-files-found: warn
37 changes: 18 additions & 19 deletions .github/workflows/release.yaml
@@ -137,25 +137,24 @@ jobs:
- name: Run tests
run: mvn test

# disabled for now, we don't have access to a macos arm64 runner and testing on x86_64 doesn't work
# test-macos:
# name: Test Mac
# needs: build-macos-native
# runs-on: macos-latest
# steps:
# - uses: actions/checkout@v4
# - uses: actions/download-artifact@v3
# with:
# name: artifacts
# path: ${{ github.workspace }}/src/main/resources/de/kherud/llama/
# - name: Download model
# run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
# - uses: actions/setup-java@v4
# with:
# distribution: 'zulu'
# java-version: '11'
# - name: Run tests
# run: mvn test
test-macos:
name: Test Mac
needs: build-macos-native
runs-on: macos-14
steps:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v3
with:
name: artifacts
path: ${{ github.workspace }}/src/main/resources/de/kherud/llama/
- name: Download model
run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
- uses: actions/setup-java@v4
with:
distribution: 'zulu'
java-version: '11'
- name: Run tests
run: mvn test


test-windows:
8 changes: 4 additions & 4 deletions CMakeLists.txt
@@ -5,7 +5,6 @@ project(jllama CXX)
include(FetchContent)

set(BUILD_SHARED_LIBS ON)
set(LLAMA_STATIC OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

option(LLAMA_VERBOSE "llama: verbose output" OFF)
@@ -24,7 +23,7 @@ FetchContent_MakeAvailable(json)
FetchContent_Declare(
llama.cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
GIT_TAG b3008
GIT_TAG b3534
)
FetchContent_MakeAvailable(llama.cpp)

@@ -98,11 +97,12 @@ target_compile_definitions(jllama PRIVATE
)

if(OS_NAME STREQUAL "Windows")
set_target_properties(jllama llama PROPERTIES
set_target_properties(jllama llama ggml PROPERTIES
RUNTIME_OUTPUT_DIRECTORY_DEBUG ${JLLAMA_DIR}
RUNTIME_OUTPUT_DIRECTORY_RELEASE ${JLLAMA_DIR}
)
else()
set_target_properties(jllama llama PROPERTIES
set_target_properties(jllama llama ggml PROPERTIES
LIBRARY_OUTPUT_DIRECTORY ${JLLAMA_DIR}
)
endif()
90 changes: 30 additions & 60 deletions README.md
@@ -3,8 +3,7 @@

# Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)

The main goal of llama.cpp is to run the LLaMA model using 4-bit integer quantization on a MacBook.
This repository provides Java bindings for the C++ library.
Inference of Meta's LLaMA model (and others) in pure C/C++.

**You are welcome to contribute**

@@ -32,97 +31,68 @@ Access this library via Maven:
</dependency>
```

There are multiple [examples](src/test/java/examples):
There are multiple [examples](src/test/java/examples).

### No Setup required

We support CPU inference for the following platforms out of the box:

- Linux x86-64, aarch64
- MacOS x86-64, aarch64 (M1)
- MacOS x86-64, aarch64 (M-series)
- Windows x86-64, x64, arm (32 bit)

If any of these match your platform, you can include the Maven dependency and get started.

### Setup required

If none of the above listed platforms matches yours, currently you have to compile the library yourself (also if you
want GPU acceleration, see below).
want GPU acceleration).

This requires:
This consists of two steps: 1) Compiling the libraries and 2) putting them in the right location.

- Git
- A C++11 conforming compiler
- The [cmake](https://www.cmake.org/) build system
- Java, Maven, and setting [JAVA_HOME](https://www.baeldung.com/java-home-on-windows-7-8-10-mac-os-x-linux)
##### Library Compilation

Make sure everything works by running

```
g++ -v # depending on your compiler
java -version
mvn -v
echo $JAVA_HOME # for linux/macos
echo %JAVA_HOME% # for windows
```

Then, checkout [llama.cpp](https://github.com/ggerganov/llama.cpp) to know which build arguments to use (e.g. for CUDA support).
Finally, you have to run following commands in the directory of this repository (java-llama.cpp).
Remember to add your build arguments in the fourth line (`cmake ..`):
First, have a look at [llama.cpp](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to know which build arguments to use (e.g. for CUDA support).
Any build option of llama.cpp works equivalently for this project.
You then have to run the following commands in the directory of this repository (java-llama.cpp):

```shell
mvn compile
mkdir build
cd build
cmake .. # add any other arguments for your backend
cmake --build . --config Release
mvn compile # don't forget this line
cmake -B build # add any other arguments for your backend, e.g. -DGGML_CUDA=ON
cmake --build build --config Release
```

> [!TIP]
> Use `-DLLAMA_CURL=ON` to download models via Java code using `ModelParameters#setModelUrl(String)`.
> Use `-DGGML_CURL=ON` to download models via Java code using `ModelParameters#setModelUrl(String)`.
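
For illustration only, here is a minimal sketch of how downloading via `ModelParameters#setModelUrl(String)` might look from Java. The URL is a placeholder, and the `LlamaModel` constructor and `close()` usage are assumptions about the bindings rather than something this PR specifies; it also only works if the native libraries were built with `-DGGML_CURL=ON`.

```java
import de.kherud.llama.LlamaModel;
import de.kherud.llama.ModelParameters;

public class ModelUrlSketch {
    public static void main(String[] args) {
        // Placeholder URL: any reachable GGUF file would do.
        ModelParameters params = new ModelParameters();
        params.setModelUrl("https://example.com/models/some-model.gguf");

        // Assumes LlamaModel accepts ModelParameters and frees native
        // resources via close(); requires a -DGGML_CURL=ON build.
        LlamaModel model = new LlamaModel(params);
        // ... run inference ...
        model.close();
    }
}
```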

All required files will be put in a resources directory matching your platform, which will appear in the cmake output. For example something like:
All compiled libraries will be put in a resources directory matching your platform, which will appear in the cmake output. For example, something like:

```shell
-- Installing files to /java-llama.cpp/src/main/resources/de/kherud/llama/Linux/x86_64
```

This includes:

- Linux: `libllama.so`, `libjllama.so`
- MacOS: `libllama.dylib`, `libjllama.dylib`, `ggml-metal.metal`
- Windows: `llama.dll`, `jllama.dll`

If you then compile your own JAR from this directory, you are ready to go. Otherwise, if you still want to use the library
as a Maven dependency, see below how to set the necessary paths in order for Java to find your compiled libraries.
#### Library Location

### Custom llama.cpp Setup (GPU acceleration)
This project has to load three shared libraries:

This repository provides default support for CPU based inference. You can compile `llama.cpp` any way you want, however (see [Setup Required](#setup-required)).
In order to use your self-compiled library, set either of the [JVM options](https://www.jetbrains.com/help/idea/tuning-the-ide.html#configure-jvm-options):
- ggml
- llama
- jllama

- `de.kherud.llama.lib.path`, for example `-Dde.kherud.llama.lib.path=/directory/containing/lib`
- `java.library.path`, for example `-Djava.library.path=/directory/containing/lib`
Note that the file names vary between operating systems, e.g., `ggml.dll` on Windows, `libggml.so` on Linux, and `libggml.dylib` on macOS.
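
As a quick illustration of that mapping, the standard `System.mapLibraryName` API produces exactly these platform-specific file names:

```java
public class LibraryNames {
    public static void main(String[] args) {
        // Prints ggml.dll / llama.dll / jllama.dll on Windows,
        // libggml.so / libllama.so / libjllama.so on Linux,
        // and libggml.dylib / libllama.dylib / libjllama.dylib on macOS.
        System.out.println(System.mapLibraryName("ggml"));
        System.out.println(System.mapLibraryName("llama"));
        System.out.println(System.mapLibraryName("jllama"));
    }
}
```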

This repository uses [`System#mapLibraryName`](https://docs.oracle.com/javase%2F7%2Fdocs%2Fapi%2F%2F/java/lang/System.html) to determine the name of the shared library for your platform.
If for any reason your library has a different name, you can set it with

- `de.kherud.llama.lib.name`, for example `-Dde.kherud.llama.lib.name=myname.so`

For compiling `llama.cpp`, refer to the official [readme](https://github.com/ggerganov/llama.cpp#build) for details.
The library can be built with the `llama.cpp` project:

```shell
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=ON # add any other arguments for your backend
cmake --build . --config Release
```
The application will search the following locations, in the following order:

Look for the shared library in `build`.
- In **de.kherud.llama.lib.path**: Use this option if you want a custom location for your shared libraries, i.e., set VM option `-Dde.kherud.llama.lib.path=/path/to/directory`.
- In **java.library.path**: These are predefined locations for each OS, e.g., `/usr/java/packages/lib:/usr/lib64:/lib64:/lib:/usr/lib` on Linux.
You can find out the locations using `System.out.println(System.getProperty("java.library.path"))`.
Use this option if you want to install the shared libraries as system libraries.
- From the **JAR**: If any of the libraries weren't found yet, the application will try to use a prebuilt shared library.
This of course only works for the [supported platforms](#no-setup-required).

> [!IMPORTANT]
> If you are running MacOS with Metal, you have to put the file `ggml-metal.metal` from `build/bin` in the same directory as the shared library.
Not all libraries have to be in the same location.
For example, if you already have a llama.cpp and ggml build, you can install them as system libraries and rely on the jllama library from the JAR.
This way, you don't have to compile anything.
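
For illustration, a minimal sketch of inspecting and overriding that search order from Java. The property names come from the list above; the directory path is a placeholder, and the assumption is that setting the property programmatically only takes effect if it happens before the first `de.kherud.llama` class triggers library loading.

```java
public class LibraryPathSketch {
    public static void main(String[] args) {
        // The default OS-specific search path used in the second step above.
        System.out.println(System.getProperty("java.library.path"));

        // Placeholder directory; equivalent to launching the JVM with
        // -Dde.kherud.llama.lib.path=/path/to/directory. Must run before
        // any de.kherud.llama class is loaded to have an effect.
        System.setProperty("de.kherud.llama.lib.path", "/path/to/directory");
    }
}
```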

## Documentation

2 changes: 1 addition & 1 deletion pom.xml
@@ -4,7 +4,7 @@

<groupId>de.kherud</groupId>
<artifactId>llama</artifactId>
<version>3.2.1</version>
<version>3.3.0</version>
<packaging>jar</packaging>

<name>${project.groupId}:${project.artifactId}</name>
46 changes: 26 additions & 20 deletions src/main/cpp/jllama.cpp
@@ -355,13 +355,12 @@ JNIEXPORT void JNICALL JNI_OnUnload(JavaVM *vm, void *reserved)
JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jobject obj, jstring jparams)
{
gpt_params params;
server_params sparams;

auto *ctx_server = new server_context();

std::string c_params = parse_jstring(env, jparams);
json json_params = json::parse(c_params);
server_params_parse(json_params, sparams, params);
server_params_parse(json_params, params);

if (json_value(json_params, "disable_log", false))
{
@@ -372,9 +371,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
log_enable();
}

if (!sparams.system_prompt.empty())
if (!params.system_prompt.empty())
{
ctx_server->system_prompt_set(sparams.system_prompt);
ctx_server->system_prompt_set(params.system_prompt);
}

if (params.model_alias == "unknown")
@@ -395,6 +394,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo

std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};

// Necessary similarity of prompt for slot selection
ctx_server->slot_prompt_similarity = params.slot_prompt_similarity;

// load the model
if (!ctx_server->load_model(params))
{
@@ -411,32 +413,36 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
const auto model_meta = ctx_server->model_meta();

// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (sparams.chat_template.empty())
if (params.chat_template.empty())
{
if (!ctx_server->validate_model_chat_template())
{
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This "
"may cause the model to output suboptimal responses",
{});
sparams.chat_template = "chatml";
params.chat_template = "chatml";
}
}
ctx_server->chat_template = sparams.chat_template;

// print sample chat example to make it clear which template is used
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty())
{
json chat;
chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
chat.push_back({{"role", "user"}, {"content", "Hello"}});
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
chat.push_back({{"role", "user"}, {"content", "How are you?"}});

const std::string chat_example = format_chat(ctx_server->model, sparams.chat_template, chat);
if (!ctx_server->validate_model_chat_template())
{
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This "
"may cause the model to output suboptimal responses",
{});
params.chat_template = "chatml";
}
}

LOG_INFO("chat template", {
{"chat_example", chat_example},
{"built_in", sparams.chat_template.empty()},
});
// print sample chat example to make it clear which template is used
{
LOG_INFO("chat template",
{
{"chat_example", llama_chat_format_example(ctx_server->model, params.chat_template)},
{"built_in", params.chat_template.empty()},
});
}

ctx_server->queue_tasks.on_new_task(
@@ -480,7 +486,7 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv
json chat;
chat.push_back({{"role", "system"}, {"content", ctx_server->system_prompt}});
chat.push_back({{"role", "user"}, {"content", json_params["prompt"]}});
json_params["prompt"] = format_chat(ctx_server->model, ctx_server->chat_template, chat);
json_params["prompt"] = format_chat(ctx_server->model, ctx_server->params.chat_template, chat);
}

const int id_task = ctx_server->queue_tasks.get_new_id();