Skip to content

Commit b8b61f0

Browse files
ngxsonarthw
authored and committed
clip : bring back GPU support (ggml-org#12322)
* clip : bring back GPU support * use n_gpu_layers param * fix double free * ggml_backend_init_by_type * clean up
1 parent 1989578 commit b8b61f0

File tree

3 files changed

+89
-77
lines changed

3 files changed

+89
-77
lines changed

examples/llava/clip.cpp

Lines changed: 75 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -4,31 +4,12 @@
44
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
55
#include "clip.h"
66
#include "ggml.h"
7+
#include "ggml-cpp.h"
78
#include "ggml-cpu.h"
89
#include "ggml-alloc.h"
910
#include "ggml-backend.h"
1011
#include "gguf.h"
1112

12-
//#ifdef GGML_USE_CUDA
13-
//#include "ggml-cuda.h"
14-
//#endif
15-
//
16-
//#ifdef GGML_USE_SYCL
17-
//#include "ggml-sycl.h"
18-
//#endif
19-
//
20-
//#ifdef GGML_USE_METAL
21-
//#include "ggml-metal.h"
22-
//#endif
23-
//
24-
//#ifdef GGML_USE_CANN
25-
//#include "ggml-cann.h"
26-
//#endif
27-
//
28-
//#ifdef GGML_USE_VULKAN
29-
//#include "ggml-vulkan.h"
30-
//#endif
31-
3213
#define STB_IMAGE_IMPLEMENTATION
3314
#include "stb_image.h"
3415

@@ -600,18 +581,54 @@ struct clip_ctx {
600581
bool has_post_norm = false;
601582
bool has_patch_bias = false;
602583

603-
struct gguf_context * ctx_gguf;
604-
struct ggml_context * ctx_data;
584+
struct gguf_context * ctx_gguf = nullptr;
585+
struct ggml_context * ctx_data = nullptr;
605586

606587
std::vector<uint8_t> buf_compute_meta;
607588

608-
// memory buffers to evaluate the model
609-
ggml_backend_buffer_t params_buffer = NULL;
589+
std::vector<ggml_backend_t> backend_ptrs;
590+
std::vector<ggml_backend_buffer_type_t> backend_buft;
591+
592+
ggml_backend_t backend = nullptr;
593+
ggml_backend_t backend_cpu = nullptr;
594+
ggml_backend_buffer_t buf = nullptr;
610595

611-
ggml_backend_t backend = NULL;
612-
ggml_gallocr_t compute_alloc = NULL;
596+
ggml_backend_sched_ptr sched;
613597

614598
struct clip_image_size * load_image_size;
599+
600+
clip_ctx(clip_context_params & ctx_params) {
601+
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
602+
backend = ctx_params.use_gpu
603+
? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
604+
: nullptr;
605+
606+
if (backend) {
607+
LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
608+
backend_ptrs.push_back(backend);
609+
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
610+
} else {
611+
backend = backend_cpu;
612+
LOG_INF("%s: CLIP using CPU backend\n", __func__);
613+
}
614+
615+
backend_ptrs.push_back(backend_cpu);
616+
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
617+
618+
sched.reset(
619+
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
620+
);
621+
}
622+
623+
~clip_ctx() {
624+
ggml_free(ctx_data);
625+
gguf_free(ctx_gguf);
626+
ggml_backend_buffer_free(buf);
627+
ggml_backend_free(backend);
628+
if (backend_cpu != backend) {
629+
ggml_backend_free(backend_cpu);
630+
}
631+
}
615632
};
616633

617634
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
@@ -1184,6 +1201,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
11841201

11851202
// read and create ggml_context containing the tensors and their data
11861203
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1204+
return clip_init(fname, clip_context_params{
1205+
/* use_gpu */ true,
1206+
/* verbosity */ verbosity,
1207+
});
1208+
}
1209+
1210+
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
1211+
int verbosity = ctx_params.verbosity;
11871212
struct ggml_context * meta = NULL;
11881213

11891214
struct gguf_init_params params = {
@@ -1277,7 +1302,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
12771302
}
12781303
}
12791304

1280-
clip_ctx * new_clip = new clip_ctx{};
1305+
clip_ctx * new_clip = new clip_ctx(ctx_params);
12811306

12821307
// update projector type
12831308
{
@@ -1296,36 +1321,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
12961321
}
12971322
}
12981323

1299-
//#ifdef GGML_USE_CUDA
1300-
// new_clip->backend = ggml_backend_cuda_init(0);
1301-
// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
1302-
//#endif
1303-
//
1304-
//#ifdef GGML_USE_METAL
1305-
// new_clip->backend = ggml_backend_metal_init();
1306-
// LOG_INF("%s: CLIP using Metal backend\n", __func__);
1307-
//#endif
1308-
//
1309-
//#ifdef GGML_USE_CANN
1310-
// new_clip->backend = ggml_backend_cann_init(0);
1311-
// LOG_INF("%s: CLIP using CANN backend\n", __func__);
1312-
//#endif
1313-
//
1314-
//#ifdef GGML_USE_VULKAN
1315-
// new_clip->backend = ggml_backend_vk_init(0);
1316-
// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
1317-
//#endif
1318-
//
1319-
//#ifdef GGML_USE_SYCL
1320-
// new_clip->backend = ggml_backend_sycl_init(0);
1321-
// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
1322-
//#endif
1323-
1324-
if (!new_clip->backend) {
1325-
new_clip->backend = ggml_backend_cpu_init();
1326-
LOG_INF("%s: CLIP using CPU backend\n", __func__);
1327-
}
1328-
13291324
// model size and capabilities
13301325
{
13311326
int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
@@ -1421,7 +1416,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
14211416
}
14221417

14231418
// alloc memory and offload data
1424-
new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
1419+
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
1420+
new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
1421+
ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
14251422
for (int i = 0; i < n_tensors; ++i) {
14261423
const char * name = gguf_get_tensor_name(ctx, i);
14271424
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
@@ -1434,7 +1431,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
14341431
return nullptr;
14351432
}
14361433
int num_bytes = ggml_nbytes(cur);
1437-
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
1434+
if (ggml_backend_buft_is_host(buft)) {
14381435
// for the CPU and Metal backend, we can read directly into the tensor
14391436
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
14401437
} else {
@@ -1720,14 +1717,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
17201717
// measure mem requirement and allocate
17211718
{
17221719
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
1723-
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
17241720
clip_image_f32_batch batch;
17251721
batch.size = 1;
17261722
batch.data = nullptr;
17271723
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
1728-
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
1729-
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
1730-
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
1724+
ggml_backend_sched_reserve(new_clip->sched.get(), gf);
1725+
for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
1726+
ggml_backend_t backend = new_clip->backend_ptrs[i];
1727+
ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
1728+
size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
1729+
if (size > 1) {
1730+
LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
1731+
ggml_backend_buft_name(buft),
1732+
size / 1024.0 / 1024.0);
1733+
}
1734+
}
17311735
}
17321736

17331737
return new_clip;
@@ -2408,12 +2412,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
24082412
}
24092413

24102414
void clip_free(clip_ctx * ctx) {
2411-
ggml_free(ctx->ctx_data);
2412-
gguf_free(ctx->ctx_gguf);
2413-
2414-
ggml_backend_buffer_free(ctx->params_buffer);
2415-
ggml_backend_free(ctx->backend);
2416-
ggml_gallocr_free(ctx->compute_alloc);
24172415
delete ctx;
24182416
}
24192417

@@ -2609,8 +2607,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
26092607
}
26102608

26112609
// build the inference graph
2610+
ggml_backend_sched_reset(ctx->sched.get());
26122611
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
2613-
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
2612+
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
26142613

26152614
// set inputs
26162615
const auto & model = ctx->vision_model;
@@ -2775,11 +2774,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
27752774
}
27762775
}
27772776

2778-
if (ggml_backend_is_cpu(ctx->backend)) {
2779-
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
2780-
}
2777+
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
27812778

2782-
ggml_backend_graph_compute(ctx->backend, gf);
2779+
auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
2780+
if (status != GGML_STATUS_SUCCESS) {
2781+
LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
2782+
return false;
2783+
}
27832784

27842785
// the last node is the embedding tensor
27852786
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);

examples/llava/clip.h

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,15 @@ struct clip_image_f32_batch {
3939
size_t size;
4040
};
4141

42-
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
43-
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
42+
struct clip_context_params {
43+
bool use_gpu;
44+
int verbosity;
45+
};
46+
47+
// deprecated, use clip_init
48+
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
49+
50+
CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
4451

4552
CLIP_API void clip_free(struct clip_ctx * ctx);
4653

examples/llava/minicpmv-cli.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) {
8686
if (prompt.empty()) {
8787
prompt = "describe the image in detail.";
8888
}
89-
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
89+
struct clip_context_params clip_params = {
90+
/* use_gpu */ params->n_gpu_layers != 0,
91+
/* verbosity */ params->verbosity,
92+
};
93+
auto * ctx_clip = clip_init(clip_path, clip_params);
9094
return ctx_clip;
9195
}
9296

0 commit comments

Comments
 (0)