
Commit 35a6031

Merge 'origin/master' into hipblas
2 parents: df7346c + 66a2555

22 files changed: +452 -211 lines changed


README.md

Lines changed: 12 additions & 2 deletions
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 **Hot topics:**
 
+- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
 - p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
 - Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
 
@@ -29,6 +30,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
   <li><a href="#quantization">Quantization</a></li>
   <li><a href="#interactive-mode">Interactive mode</a></li>
   <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
+  <li><a href="#using-openllama">Using OpenLLaMA</a></li>
   <li><a href="#using-gpt4all">Using GPT4All</a></li>
   <li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
   <li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
@@ -543,6 +545,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 >
 ```
 
+### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)
+
+OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
+
+- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
+- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>`
+
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
 
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
@@ -672,12 +681,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled
 ```
 GGML_OPENCL_PLATFORM=0
 GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
-./main (...)
+export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
 ```
 
 For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
 
+Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
+
 ### Docker
 
 #### Prerequisites

build.zig

Lines changed: 44 additions & 47 deletions
@@ -1,61 +1,58 @@
 const std = @import("std");
 
+// Zig Version: 0.11.0-dev.3379+629f0d23b
 pub fn build(b: *std.build.Builder) void {
     const target = b.standardTargetOptions(.{});
-    const optimize = b.standardReleaseOptions();
-    const want_lto = b.option(bool, "lto", "Want -fLTO");
-
-    const lib = b.addStaticLibrary("llama", null);
-    lib.want_lto = want_lto;
-    lib.setTarget(target);
-    lib.setBuildMode(optimize);
+    const optimize = b.standardOptimizeOption(.{});
+    const lib = b.addStaticLibrary(.{
+        .name = "llama",
+        .target = target,
+        .optimize = optimize,
+    });
+    lib.linkLibC();
     lib.linkLibCpp();
     lib.addIncludePath(".");
-    lib.addIncludePath("examples");
+    lib.addIncludePath("./examples");
     lib.addCSourceFiles(&.{
         "ggml.c",
     }, &.{"-std=c11"});
     lib.addCSourceFiles(&.{
         "llama.cpp",
     }, &.{"-std=c++11"});
-    lib.install();
-
-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
-
-    const exe = build_example("main", build_args);
-    _ = build_example("quantize", build_args);
-    _ = build_example("perplexity", build_args);
-    _ = build_example("embedding", build_args);
-
-    // create "zig build run" command for ./main
-
-    const run_cmd = exe.run();
-    run_cmd.step.dependOn(b.getInstallStep());
-    if (b.args) |args| {
-        run_cmd.addArgs(args);
+    b.installArtifact(lib);
+
+    const examples = .{
+        "main",
+        "baby-llama",
+        "embedding",
+        // "metal",
+        "perplexity",
+        "quantize",
+        "quantize-stats",
+        "save-load-state",
+        // "server",
+        "simple",
+        "train-text-from-scratch",
+    };
+
+    inline for (examples) |example_name| {
+        const exe = b.addExecutable(.{
+            .name = example_name,
+            .target = target,
+            .optimize = optimize,
+        });
+        exe.addIncludePath(".");
+        exe.addIncludePath("./examples");
+        exe.addCSourceFiles(&.{
+            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
+            "examples/common.cpp",
+        }, &.{"-std=c++11"});
+        exe.linkLibrary(lib);
+        b.installArtifact(exe);
+        const run_cmd = b.addRunArtifact(exe);
+        run_cmd.step.dependOn(b.getInstallStep());
+        if (b.args) |args| run_cmd.addArgs(args);
+        const run_step = b.step("run_" ++ example_name, "Run the app");
+        run_step.dependOn(&run_cmd.step);
     }
-
-    const run_step = b.step("run", "Run the app");
-    run_step.dependOn(&run_cmd.step);
-}
-
-fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
-    const b = args.b;
-    const lib = args.lib;
-    const want_lto = args.want_lto;
-
-    const exe = b.addExecutable(name, null);
-    exe.want_lto = want_lto;
-    lib.setTarget(args.target);
-    lib.setBuildMode(args.optimize);
-    exe.addIncludePath(".");
-    exe.addIncludePath("examples");
-    exe.addCSourceFiles(&.{
-        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
-        "examples/common.cpp",
-    }, &.{"-std=c++11"});
-    exe.linkLibrary(lib);
-    exe.install();
-
-    return exe;
 }

convert.py

Lines changed: 2 additions & 2 deletions
@@ -998,9 +998,9 @@ def write_vocab(self, vocab: Vocab) -> None:
 def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
     of = OutputFile(fname_out)
     params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
-                    n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
+                    n_head=1, n_layer=0)
     of = OutputFile(fname_out)
-    of.write_file_header(params)
+    of.write_file_header(params, file_type=GGMLFileType.AllF32)
     of.write_vocab(vocab)
     of.fout.close()

examples/common.cpp

Lines changed: 15 additions & 7 deletions
@@ -536,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }
 
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx = params.n_ctx;
@@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;
 
-    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+        return std::make_tuple(nullptr, nullptr);
+    }
 
+    llama_context * lctx = llama_new_context_with_model(model, lparams);
     if (lctx == NULL) {
-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return NULL;
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return std::make_tuple(nullptr, nullptr);
     }
 
     if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(lctx,
+        int err = llama_model_apply_lora_from_file(model,
                                              params.lora_adapter.c_str(),
                                              params.lora_base.empty() ? NULL : params.lora_base.c_str(),
                                              params.n_threads);
        if (err != 0) {
            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return NULL;
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
        }
    }
 
-    return lctx;
+    return std::make_tuple(model, lctx);
 }
 
 void console_init(console_state & con_st) {
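
With this change, every caller of `llama_init_from_gpt_params` receives both handles and must release them separately, as the updated example programs below do. A minimal caller might look like the following sketch; it assumes `params` has already been filled in by the usual argument parsing, and the surrounding `run` function is illustrative rather than code from this commit.

// Minimal caller sketch for the new tuple-returning helper
// (hypothetical wrapper; gpt_params is assumed to be populated elsewhere).
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <tuple>

int run(const gpt_params & params) {
    llama_init_backend();

    llama_model   * model;
    llama_context * ctx;

    // The helper now returns both the model and the context; failure is
    // signalled by a pair of null pointers.
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model '%s'\n", params.model.c_str());
        return 1;
    }

    // ... evaluate tokens with ctx ...

    // Tear down in reverse order: context first, then the model.
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}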

examples/common.h

Lines changed: 2 additions & 1 deletion
@@ -9,6 +9,7 @@
 #include <random>
 #include <thread>
 #include <unordered_map>
+#include <tuple>
 
 #if !defined (_WIN32)
 #include <stdio.h>
@@ -95,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 // Model utils
 //
 
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
 
 //
 // Console utils

examples/embedding/embedding.cpp

Lines changed: 4 additions & 2 deletions
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
 
     // load the model
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }

examples/main/main.cpp

Lines changed: 6 additions & 2 deletions
@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
     g_ctx = &ctx;
 
     // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -139,6 +140,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
     if (params.export_cgraph) {
         llama_eval_export(ctx, "llama.ggml");
         llama_free(ctx);
+        llama_free_model(model);
 
         return 0;
     }
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }

examples/perplexity/perplexity.cpp

Lines changed: 4 additions & 2 deletions
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
 
     // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }

examples/quantize-stats/quantize-stats.cpp

Lines changed: 13 additions & 2 deletions
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "Loading model\n");
 
     const int64_t t_main_start_us = ggml_time_us();
+    llama_model * model;
     llama_context * ctx;
 
     {
@@ -330,12 +331,20 @@
         lparams.f16_kv = false;
         lparams.use_mlock = false;
 
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        model = llama_load_model_from_file(params.model.c_str(), lparams);
 
-        if (ctx == NULL) {
+        if (model == NULL) {
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
             return 1;
         }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+            llama_free_model(model);
+            return 1;
+        }
     }
 
     const auto &tensors = llama_internal_get_tensor_map(ctx);
@@ -357,6 +366,7 @@
             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                 "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
             llama_free(ctx);
+            llama_free_model(model);
             return 1;
         }
         included_layers++;
@@ -415,6 +425,7 @@
 
 
     llama_free(ctx);
+    llama_free_model(model);
     // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();
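
Tools such as this one, which bypass the `gpt_params` helper, now perform the two-step initialization directly: load the model, create a context from it, and free both on exit. Below is a rough standalone sketch of that pattern with default context parameters and an illustrative model path; it is not code from this commit.

// Sketch of the low-level two-step initialization and teardown
// (the model path here is illustrative only).
#include "llama.h"

#include <cstdio>

int main() {
    llama_context_params lparams = llama_context_default_params();

    // Step 1: load the model weights.
    llama_model * model = llama_load_model_from_file("models/7B/ggml-model-f16.bin", lparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Step 2: create an inference context that shares the loaded model.
    llama_context * ctx = llama_new_context_with_model(model, lparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    // ... run evaluation / statistics here ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}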
