
Track and free temporary ggml_tensor_extra_gpu struct #2195


Closed · wants to merge 1 commit
ggml-cuda.cu (43 changes: 41 additions & 2 deletions)
@@ -239,6 +239,11 @@ struct ggml_tensor_extra_gpu {
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
};

// Wrapper used while evaluating: chains per-eval extras into a singly-linked list
struct ggml_tensor_extra_gpu_tmp {
    struct ggml_tensor_extra_gpu main; // the extra actually handed to the tensor
    struct ggml_tensor_extra_gpu_tmp * next; // previously allocated entry, or NULL
};

static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -2080,6 +2085,9 @@ struct cuda_buffer {
    size_t size = 0;
};

// TODO: Make this TLS?
static struct ggml_tensor_extra_gpu_tmp * g_eval_extras = NULL; // head of the per-eval extras list
static bool g_evaluating = false; // true between ggml_cuda_begin_eval() and ggml_cuda_end_eval()
static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
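
The TODO above flags that this bookkeeping is process-global: two threads evaluating at once would trip the GGML_ASSERT in ggml_cuda_begin_eval or interleave their lists. One possible direction, assuming the build allows C++11 thread_local (a sketch, not part of this patch):

// Hypothetical thread-local variant of the two globals above (not in this commit):
static thread_local struct ggml_tensor_extra_gpu_tmp * g_eval_extras = nullptr;
static thread_local bool g_evaluating = false;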

@@ -3260,6 +3268,38 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
    delete extra;
}

void ggml_cuda_begin_eval() {
    GGML_ASSERT(!g_evaluating);
    g_evaluating = true;
}

struct ggml_tensor_extra_gpu * ggml_cuda_alloc_extra() {
    if (g_evaluating) {
        // During an eval, allocate through the wrapper so ggml_cuda_end_eval()
        // can find and free this extra later.
        struct ggml_tensor_extra_gpu_tmp * extra = new ggml_tensor_extra_gpu_tmp;
        memset(extra, 0, sizeof(*extra));
        extra->next = g_eval_extras; // push onto the head of the list
        g_eval_extras = extra;
        return &extra->main;
    } else {
        // Outside an eval: plain allocation, owned by the caller as before.
        struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
        memset(extra, 0, sizeof(*extra));
        return extra;
    }
}

void ggml_cuda_end_eval() {
    GGML_ASSERT(g_evaluating);

    // Sweep the list: free every extra allocated during this eval.
    for (struct ggml_tensor_extra_gpu_tmp * i = g_eval_extras; i != nullptr;) {
        struct ggml_tensor_extra_gpu_tmp * next = i->next;
        delete i;
        i = next;
    }

    g_eval_extras = nullptr;
    g_evaluating = false;
}

void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
    if (scratch && g_scratch_size == 0) {
        return;
@@ -3277,8 +3317,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
    }

    tensor->backend = GGML_BACKEND_GPU;
-   struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-   memset(extra, 0, sizeof(*extra));
+   struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_extra();

    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
        tensor->op == GGML_OP_VIEW ||
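Taken out of context, the lifecycle the new functions implement is easier to see in a self-contained sketch. Everything below is illustrative only (simplified names, no CUDA, not part of the patch): allocations made between begin and end are pushed onto an intrusive singly-linked list and swept in one pass.

#include <cassert>
#include <cstdio>

struct extra {              // stands in for ggml_tensor_extra_gpu
    void * data = nullptr;
};

struct extra_tmp {          // stands in for ggml_tensor_extra_gpu_tmp
    extra       main;       // the struct actually handed to callers
    extra_tmp * next;       // previously allocated entry, or nullptr
};

static extra_tmp * g_list = nullptr;
static bool g_eval = false;

void begin_eval() { assert(!g_eval); g_eval = true; }

extra * alloc_extra() {
    if (g_eval) {
        extra_tmp * e = new extra_tmp{};
        e->next = g_list;   // push onto the head of the list
        g_list  = e;
        return &e->main;    // the caller never sees the wrapper
    }
    return new extra{};     // outside an eval: caller owns it
}

void end_eval() {
    assert(g_eval);
    int freed = 0;
    while (g_list != nullptr) {       // sweep: free every tracked wrapper
        extra_tmp * next = g_list->next;
        delete g_list;
        g_list = next;
        freed++;
    }
    g_eval = false;
    printf("freed %d extras\n", freed);
}

int main() {
    begin_eval();
    for (int i = 0; i < 3; i++) {
        alloc_extra();      // tracked; freed by end_eval()
    }
    end_eval();             // prints "freed 3 extras"
}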
ggml-cuda.h (2 changes: 2 additions & 0 deletions)
@@ -30,6 +30,8 @@ void ggml_cuda_set_main_device(int main_device);
void ggml_cuda_set_scratch_size(size_t scratch_size);
void ggml_cuda_free_scratch(void);
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
void ggml_cuda_begin_eval(void);
void ggml_cuda_end_eval(void);

#ifdef __cplusplus
}
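The header now exposes the begin/end pair; the intended call pattern, as a hedged sketch (the real caller is the llama_eval_internal hunk below, and eval_with_tracked_extras is a hypothetical stand-in):

// Bracket a graph evaluation so every extra handed out in between is reclaimed.
static void eval_with_tracked_extras() {
    ggml_cuda_begin_eval();  // start tracking extras
    // ... build and evaluate the graph; ggml_cuda_assign_buffers() now
    // allocates extras through ggml_cuda_alloc_extra(), which records them ...
    ggml_cuda_end_eval();    // frees every extra recorded since begin_eval()
}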
llama.cpp (6 changes: 6 additions & 0 deletions)
@@ -1379,6 +1379,8 @@ static bool llama_eval_internal(
    offload_func_t offload_func_v = llama_nop;

#ifdef GGML_USE_CUBLAS
    ggml_cuda_begin_eval();

    if (n_gpu_layers > n_layer) {
        offload_func_nr = ggml_cuda_assign_buffers;
    }
@@ -1721,6 +1723,10 @@ static bool llama_eval_internal(
            lctx.get_buf_max_mem(1)/1024.0/1024.0);
#endif

#ifdef GGML_USE_CUBLAS
    ggml_cuda_end_eval();
#endif

    ggml_free(ctx0);

    // measure the performance only for the single-token evals