
Commit 2a773ce

ggml_graph_compute: deprecate using ggml_context, try to resolve issue #287
1 parent 0be54f7 commit 2a773ce

3 files changed: +86 −31 lines changed

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 0 additions & 2 deletions
@@ -1426,11 +1426,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
 
     gf->n_nodes = 0;
     gf->n_leafs = 0;
-    gf->work_size = 0;
     gf->perf_runs = 0;
     gf->perf_cycles = 0;
     gf->perf_time_us = 0;
-    gf->work = NULL;
 
     const auto & hparams = model->hparams;
     //const int n_ctx = hparams.n_ctx;

ggml.c

Lines changed: 63 additions & 26 deletions
@@ -16568,8 +16568,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
         /*.n_nodes   =*/ 0,
         /*.n_leafs   =*/ 0,
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS,
-        /*.work_size =*/ 0,
-        /*.work      =*/ NULL,
         /*.nodes     =*/ { NULL },
         /*.grads     =*/ { NULL },
         /*.leafs     =*/ { NULL },
@@ -16740,6 +16738,7 @@ void clear_numa_thread_affinity(void) {}
 
 struct ggml_compute_state_shared {
     struct ggml_cgraph * cgraph;
+    struct ggml_cgraph_context * cgraph_ctx;
 
     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;
@@ -16769,6 +16768,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
     struct ggml_cgraph * cgraph = state->shared->cgraph;
+    struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx;
 
     const int n_threads = state->shared->n_threads;
     set_numa_thread_affinity(state->ith, n_threads);
@@ -16783,8 +16783,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            /*.type  =*/ GGML_TASK_FINALIZE,
            /*.ith   =*/ 0,
            /*.nth   =*/ 0,
-           /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-           /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+           /*.wsize =*/ ctx->work_size,
+           /*.wdata =*/ ctx->work_data,
        };
 
        if (node_n != -1) {
@@ -16844,8 +16844,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                /*.type  =*/ GGML_TASK_COMPUTE,
                /*.ith   =*/ state->ith,
                /*.nth   =*/ node->n_tasks,
-               /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-               /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+               /*.wsize =*/ ctx->work_size,
+               /*.wdata =*/ ctx->work_data,
            };
 
            if (state->ith < node->n_tasks) {
@@ -16856,23 +16856,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     return 0;
 }
 
-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    const int n_threads = cgraph->n_threads;
+// Prepare for graph computing.
+// Will set: node->n_tasks, ctx->{work_size, planned}
+void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(ctx);
+    // This function is reentrant, but duplicate calls are unnecessary.
+    GGML_ASSERT(ctx->work_size == 0);
+    GGML_ASSERT(ctx->work_data == NULL);
+    GGML_ASSERT(!ctx->planned);
 
-    struct ggml_compute_state_shared state_shared = {
-        /*.cgraph                  =*/ cgraph,
-        /*.perf_node_start_cycles  =*/ 0,
-        /*.perf_node_start_time_us =*/ 0,
-        /*.n_threads               =*/ n_threads,
-        /*.n_active                =*/ n_threads,
-        /*.node_n                  =*/ -1,
-    };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+    int n_threads = cgraph->n_threads;
+    size_t work_size = 0;
 
     // initialize tasks + work buffer
     {
-        size_t work_size = 0;
-
         // thread scheduling for the different operations
         for (int i = 0; i < cgraph->n_nodes; i++) {
             struct ggml_tensor * node = cgraph->nodes[i];
@@ -17202,19 +17199,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                } break;
            }
        }
+    }
 
-    if (cgraph->work != NULL && work_size > cgraph->work_size) {
-        GGML_ASSERT(false); // TODO: better handling
-    }
+    if (work_size > 0) {
+        work_size += CACHE_LINE_SIZE*(n_threads - 1);
+    }
+
+    ctx->work_size = work_size;
+    ctx->work_data = NULL;
+    ctx->planned   = true;
+}
 
-    if (work_size > 0 && cgraph->work == NULL) {
-        cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1);
+void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
+    if (ctx == NULL) {
+        ctx = alloca(sizeof(struct ggml_cgraph_context));
+        GGML_ASSERT(ctx);
+        ctx->work_size = 0;
+        ctx->work_data = NULL;
+        ctx->planned   = false;
+    } else {
+        // work_size and work_data MAY keep their default values even after planning
+        // (no work buffer is needed when work_size is 0).
+        if (ctx->work_size > 0) {
+            GGML_ASSERT(ctx->work_data);
+        }
+    }
 
-        GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
-        cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
+    if (!ctx->planned) {
+        ggml_graph_compute_plan(ctx, cgraph);
+        if (ctx->work_size > 0) {
+            ctx->work_data = malloc(ctx->work_size);
+            GGML_ASSERT(ctx->work_data);
+            GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, ctx->work_size);
         }
     }
 
+    const int n_threads = cgraph->n_threads;
+
+    struct ggml_compute_state_shared state_shared = {
+        /*.cgraph                  =*/ cgraph,
+        /*.cgraph_ctx              =*/ ctx,
+        /*.perf_node_start_cycles  =*/ 0,
+        /*.perf_node_start_time_us =*/ 0,
+        /*.n_threads               =*/ n_threads,
+        /*.n_active                =*/ n_threads,
+        /*.node_n                  =*/ -1,
+    };
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+
     // create thread pool
     if (n_threads > 1) {
         for (int j = 1; j < n_threads; ++j) {
@@ -17266,6 +17297,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     }
 }
 
+// Deprecated, kept only for backward compatibility.
+void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
+    UNUSED(ctx);
+    ggml_graph_compute_v2(NULL, cgraph);
+}
+
 void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * grad = cgraph->grads[i];
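
For callers that relied on the old one-call behaviour, the deprecated wrapper above shows the intended migration: passing a NULL context to `ggml_graph_compute_v2()` makes it plan the graph and allocate the work buffer internally. A minimal sketch, assuming a graph `gf` built with `ggml_build_forward()`; `ctx0` is an illustrative ggml context variable:

    // old, deprecated entry point -- the ggml_context argument is now ignored
    ggml_graph_compute(ctx0, &gf);

    // equivalent call after this commit: with a NULL ggml_cgraph_context,
    // ggml_graph_compute_v2() plans the graph and allocates the work buffer itself
    ggml_graph_compute_v2(NULL, &gf);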

ggml.h

Lines changed: 23 additions & 3 deletions
@@ -409,15 +409,23 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // graph compute context
+    struct ggml_cgraph_context {
+        // After a call to `ggml_graph_compute_plan()`, `planned` is set to true and
+        // `work_size` is updated to a non-zero value when a work buffer is required.
+        // In that case the caller MUST allocate memory for `work_data`.
+        // See https://github.com/ggerganov/ggml/issues/287
+        size_t work_size;
+        void * work_data;
+        bool planned; // true means ready to compute graph nodes.
+    };
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
         int n_threads;
 
-        size_t work_size;
-        struct ggml_tensor * work;
-
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
         struct ggml_tensor * leafs[GGML_MAX_NODES];
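
Since `work_size` and `work` are removed from `struct ggml_cgraph`, code that used to read them now has to consult a planned `ggml_cgraph_context` instead. A minimal sketch, assuming an already-built graph `gf` (variable names are illustrative):

    struct ggml_cgraph_context cctx = {
        /*.work_size =*/ 0,
        /*.work_data =*/ NULL,
        /*.planned   =*/ false,
    };

    ggml_graph_compute_plan(&cctx, &gf);

    // the required buffer size is now reported here instead of in cgraph->work_size
    size_t required = cctx.work_size;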
@@ -1270,6 +1278,18 @@ extern "C" {
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
+    // Since https://github.com/ggerganov/ggml/issues/287
+    GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
+    // Since https://github.com/ggerganov/ggml/issues/287
+    // When `ctx` is NULL, `ggml_graph_compute_v2()` calculates `work_size` and allocates memory for `work_data` itself.
+    // Alternatively, allocate the buffer explicitly:
+    // - call `ggml_graph_compute_plan()`;
+    // - allocate memory for `ctx->work_data`;
+    // - finally, call `ggml_graph_compute_v2()`.
+    // NOTE: don't set `ctx->planned` manually.
+    GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
+    // Deprecated, `ctx` is no longer required. Use `ggml_graph_compute_v2` instead.
+    // See https://github.com/ggerganov/ggml/issues/287
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
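
The comment on `ggml_graph_compute_v2()` above lists the explicit-buffer flow; a minimal sketch of that sequence, assuming a previously built graph `gf` (only the new API calls come from this commit, the rest is illustrative):

    #include <stdlib.h>

    struct ggml_cgraph_context cctx = {
        /*.work_size =*/ 0,
        /*.work_data =*/ NULL,
        /*.planned   =*/ false,
    };

    // 1) plan: fills node->n_tasks and cctx.work_size, then sets cctx.planned
    ggml_graph_compute_plan(&cctx, &gf);

    // 2) allocate the work buffer ourselves when one is required (caller-owned)
    if (cctx.work_size > 0) {
        cctx.work_data = malloc(cctx.work_size);
    }

    // 3) compute the graph with the prepared context
    ggml_graph_compute_v2(&cctx, &gf);

    free(cctx.work_data);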
