Description
Hi, I'm new to ggml, so apologies if I'm missing something obvious.
I wrote a simple program to add two float32 tensors in ggml using CUDA, and that works fine.
But when I changed the two tensor types to GGML_TYPE_F16
and tried to add them, I got a GGML assertion error:
ggml-cuda\binbcast.cu:297: GGML_ASSERT(src1->type == GGML_TYPE_F32) failed
Key snippets (and I've included the complete program at the bottom):
struct ggml_tensor* a = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3);
struct ggml_tensor* b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3);
struct ggml_tensor* result = ggml_add(ctx, a, b);
printf("Computing graph...\n");
ggml_backend_graph_compute(backend, gf); // <---- fails here
printf("Finished computing\n");
I'm uploading the data as float16 (converted with ggml_fp32_to_fp16), so the data matches the tensor type, but that doesn't seem to matter.
I have an NVIDIA 3060 12 GB, with compute capability 8.6. PyTorch works just fine in float16 for me.
Digging into the code, it looks like a lot of operations assert that the second tensor (src1) must be F32 (add, sub, mul, div, etc.).
Am I missing something, and if not, why can't we add two float16 tensors using ggml?
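The only workaround I can think of is to keep the second operand in F32, e.g. cast b before the add. Just a sketch of what I'm planning to try, assuming ggml_cast is the right tool here (I haven't verified this is the intended approach):

struct ggml_tensor* b_f32  = ggml_cast(ctx, b, GGML_TYPE_F32); // upcast src1 to F32
struct ggml_tensor* result = ggml_add(ctx, a, b_f32);          // a stays F16

Is that the expected way to handle this, or is there a better option?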
Thanks for your help! :)
Complete program
#include "ggml.h"
#include "ggml-cpu.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#include <vector>
#include <iostream>
ggml_backend_t backend = NULL;
ggml_gallocr_t allocr = NULL;
void init_backend() {
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#endif
if (!backend) {
backend = ggml_backend_cpu_init();
}
}
void init_mem_allocator() {
    allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
}
void predict() {
    // create a context
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context* ctx = ggml_init(params);

    // 1. Define the tensor variables
    struct ggml_tensor* a = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3);
    struct ggml_tensor* b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3);

    // 2. Define the computation graph
    struct ggml_tensor* result = ggml_add(ctx, a, b);
    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, result);

    // 3. Allocate memory for the tensor variables, and assign the data
    ggml_gallocr_alloc_graph(allocr, gf);

    std::vector<float> a_data_f32 = {1, 2, 3};
    std::vector<float> b_data_f32 = {10, 20, 30};

    // Convert float data to ggml_fp16_t
    std::vector<ggml_fp16_t> a_data(a_data_f32.size());
    std::vector<ggml_fp16_t> b_data(b_data_f32.size());
    for (size_t i = 0; i < a_data_f32.size(); ++i) {
        a_data[i] = ggml_fp32_to_fp16(a_data_f32[i]);
        b_data[i] = ggml_fp32_to_fp16(b_data_f32[i]);
    }

    ggml_backend_tensor_set(a, a_data.data(), 0, ggml_nbytes(a));
    ggml_backend_tensor_set(b, b_data.data(), 0, ggml_nbytes(b));

    // 4. Run the computation, and read the result
    printf("Computing graph...\n");
    ggml_backend_graph_compute(backend, gf);
    printf("Finished computing\n");

    struct ggml_tensor* result_node = ggml_graph_node(gf, -1); // get the last node in the graph

    int n = ggml_nelements(result_node); // create an array to store the result data
    std::vector<ggml_fp16_t> result_data(n);
    // copy the data from the backend memory into the result array
    ggml_backend_tensor_get(result_node, result_data.data(), 0, ggml_nbytes(result_node));

    // print the data
    for (int i = 0; i < n; i++) {
        std::cout << ggml_fp16_to_fp32(result_data[i]) << ", ";
    }
    std::cout << std::endl;

    // free the resources
    ggml_free(ctx);
}
int main(int argc, char* argv[]) {
    init_backend();
    init_mem_allocator();

    predict();

    // free the resources
    ggml_gallocr_free(allocr);
    ggml_backend_free(backend);
    return 0;
}