 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 #include "ggml.h"
+#include "ggml-cpp.h"
 #include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "gguf.h"

-// #ifdef GGML_USE_CUDA
-// #include "ggml-cuda.h"
-// #endif
-//
-// #ifdef GGML_USE_SYCL
-// #include "ggml-sycl.h"
-// #endif
-//
-// #ifdef GGML_USE_METAL
-// #include "ggml-metal.h"
-// #endif
-//
-// #ifdef GGML_USE_CANN
-// #include "ggml-cann.h"
-// #endif
-//
-// #ifdef GGML_USE_VULKAN
-// #include "ggml-vulkan.h"
-// #endif
-
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

@@ -600,18 +581,54 @@ struct clip_ctx {
     bool has_post_norm = false;
     bool has_patch_bias = false;

-    struct gguf_context * ctx_gguf;
-    struct ggml_context * ctx_data;
+    struct gguf_context * ctx_gguf = nullptr;
+    struct ggml_context * ctx_data = nullptr;

     std::vector<uint8_t> buf_compute_meta;

-    // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = NULL;
+    std::vector<ggml_backend_t> backend_ptrs;
+    std::vector<ggml_backend_buffer_type_t> backend_buft;
+
+    ggml_backend_t backend = nullptr;
+    ggml_backend_t backend_cpu = nullptr;
+    ggml_backend_buffer_t buf = nullptr;

-    ggml_backend_t backend = NULL;
-    ggml_gallocr_t compute_alloc = NULL;
+    ggml_backend_sched_ptr sched;

     struct clip_image_size * load_image_size;
+
+    clip_ctx(clip_context_params & ctx_params) {
+        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        backend = ctx_params.use_gpu
+                    ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
+                    : nullptr;
+
+        if (backend) {
+            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
+            backend_ptrs.push_back(backend);
+            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
+        } else {
+            backend = backend_cpu;
+            LOG_INF("%s: CLIP using CPU backend\n", __func__);
+        }
+
+        backend_ptrs.push_back(backend_cpu);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
+
+        sched.reset(
+            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
+        );
+    }
+
+    ~clip_ctx() {
+        ggml_free(ctx_data);
+        gguf_free(ctx_gguf);
+        ggml_backend_buffer_free(buf);
+        ggml_backend_free(backend);
+        if (backend_cpu != backend) {
+            ggml_backend_free(backend_cpu);
+        }
+    }
 };

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
@@ -1184,6 +1201,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

 // read and create ggml_context containing the tensors and their data
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+    return clip_init(fname, clip_context_params{
+        /* use_gpu */   true,
+        /* verbosity */ verbosity,
+    });
+}
+
+struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
+    int verbosity = ctx_params.verbosity;
     struct ggml_context * meta = NULL;

     struct gguf_init_params params = {
@@ -1277,7 +1302,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }

-    clip_ctx * new_clip = new clip_ctx{};
+    clip_ctx * new_clip = new clip_ctx(ctx_params);

     // update projector type
     {
@@ -1296,36 +1321,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }

-// #ifdef GGML_USE_CUDA
-//     new_clip->backend = ggml_backend_cuda_init(0);
-//     LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_METAL
-//     new_clip->backend = ggml_backend_metal_init();
-//     LOG_INF("%s: CLIP using Metal backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_CANN
-//     new_clip->backend = ggml_backend_cann_init(0);
-//     LOG_INF("%s: CLIP using CANN backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_VULKAN
-//     new_clip->backend = ggml_backend_vk_init(0);
-//     LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_SYCL
-//     new_clip->backend = ggml_backend_sycl_init(0);
-//     LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-// #endif
-
-    if (!new_clip->backend) {
-        new_clip->backend = ggml_backend_cpu_init();
-        LOG_INF("%s: CLIP using CPU backend\n", __func__);
-    }
-
     // model size and capabilities
     {
         int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
@@ -1421,7 +1416,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }

     // alloc memory and offload data
-    new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
+    new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
+    ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
@@ -1434,7 +1431,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             return nullptr;
         }
         int num_bytes = ggml_nbytes(cur);
-        if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
+        if (ggml_backend_buft_is_host(buft)) {
             // for the CPU and Metal backend, we can read directly into the tensor
             fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
         } else {
@@ -1720,14 +1717,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     // measure mem requirement and allocate
     {
         new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
         batch.data = nullptr;
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
-        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
-        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size / 1024.0 / 1024.0);
+        ggml_backend_sched_reserve(new_clip->sched.get(), gf);
+        for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = new_clip->backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
+            size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
+            if (size > 1) {
+                LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                        ggml_backend_buft_name(buft),
+                        size / 1024.0 / 1024.0);
+            }
+        }
     }

     return new_clip;
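
For orientation, a minimal sketch of the ggml_backend_sched lifecycle that the calls above and in clip_image_batch_encode() rely on, assuming a prebuilt worst-case graph; the function name sched_lifecycle_sketch and the single-backend setup are illustrative only, not part of the patch.

#include "ggml.h"
#include "ggml-backend.h"
#include <vector>

// Sketch: create a scheduler over a backend list, reserve compute buffers once
// from a worst-case graph, then reset/allocate/compute per inference.
static void sched_lifecycle_sketch(ggml_cgraph * gf) {
    // the CPU backend is always present and acts as the fallback
    ggml_backend_t cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);

    std::vector<ggml_backend_t> backends = { cpu };
    std::vector<ggml_backend_buffer_type_t> bufts = { ggml_backend_get_default_buffer_type(cpu) };

    // same parameters as the patch: graph size 8192, no parallel evaluation
    ggml_backend_sched_t sched =
        ggml_backend_sched_new(backends.data(), bufts.data(), backends.size(), 8192, false);

    // reserve compute buffers once, from the worst-case graph
    ggml_backend_sched_reserve(sched, gf);

    // per inference: reset, allocate the concrete graph, compute
    ggml_backend_sched_reset(sched);
    ggml_backend_sched_alloc_graph(sched, gf);
    ggml_backend_sched_graph_compute(sched, gf);

    ggml_backend_sched_free(sched);
    ggml_backend_free(cpu);
}

In the patch itself the free calls are not needed: sched is held in a ggml_backend_sched_ptr, and the backends are released in ~clip_ctx().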
@@ -2408,12 +2412,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }

 void clip_free(clip_ctx * ctx) {
-    ggml_free(ctx->ctx_data);
-    gguf_free(ctx->ctx_gguf);
-
-    ggml_backend_buffer_free(ctx->params_buffer);
-    ggml_backend_free(ctx->backend);
-    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }

@@ -2609,8 +2607,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }

     // build the inference graph
+    ggml_backend_sched_reset(ctx->sched.get());
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
-    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

     // set inputs
     const auto & model = ctx->vision_model;
@@ -2775,11 +2774,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
     }

-    if (ggml_backend_is_cpu(ctx->backend)) {
-        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
-    }
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);

-    ggml_backend_graph_compute(ctx->backend, gf);
+    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
+        return false;
+    }

     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
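
A minimal usage sketch of the new clip_init() entry point, assuming only the two clip_context_params fields shown in the patch (use_gpu, verbosity); the model path is a placeholder.

#include "clip.h"

int main() {
    // use_gpu = false keeps everything on the CPU backend; clip_model_load() defaults to true
    clip_ctx * ctx = clip_init("mmproj-model-f16.gguf", clip_context_params{
        /* use_gpu */   false,
        /* verbosity */ 1,
    });
    if (ctx == nullptr) {
        return 1;
    }

    // ... encode images with clip_image_encode() / clip_image_batch_encode() as before ...

    clip_free(ctx);
    return 0;
}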