
Commit e3c2202

threadpool: move all pause/resume logic into ggml
1 parent 5d4c0a1 commit e3c2202

File tree

6 files changed (+19 -88 lines)

examples/llama-bench/llama-bench.cpp (+1 -1)

@@ -1537,7 +1537,7 @@ int main(int argc, char ** argv) {
         exit(1);
     }
 
-    llama_attach_threadpool(ctx, threadpool);
+    llama_attach_threadpool(ctx, threadpool, NULL);
 
     // warmup run
     if (t.n_prompt > 0) {
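
Note on the call above: with the new signature (see include/llama.h below), a caller that only creates a single threadpool passes NULL for the batch slot, and llama.cpp falls back to the same pool for batch processing. A minimal sketch of that pattern, assuming a pool created elsewhere with this PR's ggml threadpool API; the helper name is illustrative, not part of the change:

#include "llama.h"

// Illustrative helper (not part of this commit): attach one threadpool for
// both single-token and batch decoding. Passing NULL for the batch slot makes
// llama.cpp reuse `tp` for batches (see the src/llama.cpp hunk below).
static void attach_single_pool(struct llama_context * ctx, ggml_compute_threadpool_t tp) {
    llama_attach_threadpool(ctx, tp, /*threadpool_batch =*/ NULL);
}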

examples/main/main.cpp (+2 -7)

@@ -240,11 +240,6 @@ int main(int argc, char ** argv) {
             exit(1);
         }
 
-        llama_attach_batch_threadpool(ctx, threadpool_batch);
-        if (ctx_guidance) {
-            llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
-        }
-
         // Start the non-batch threadpool in the paused state
         tpp.paused = true;
     }
@@ -255,9 +250,9 @@ int main(int argc, char ** argv) {
         exit(1);
     }
 
-    llama_attach_threadpool(ctx, threadpool);
+    llama_attach_threadpool(ctx, threadpool, threadpool_batch);
     if (ctx_guidance) {
-        llama_attach_threadpool(ctx_guidance, threadpool);
+        llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
     }
 
     const int n_ctx_train = llama_n_ctx_train(model);

ggml/src/ggml-backend.c (+5)

@@ -910,6 +910,11 @@ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) {
     GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
 
     struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+    if (ctx->threadpool && ctx->threadpool != threadpool) {
+        // already had a different threadpool, pause/suspend it before switching
+        ggml_pause_threadpool(ctx->threadpool);
+    }
     ctx->threadpool = threadpool;
 }
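
This is the hook that lets llama.cpp drop its own swap/pause logic: setting a new threadpool on the CPU backend now pauses the previously attached one inside ggml. A minimal sketch of the caller's side, using only functions visible in this diff; the header name and helper are assumptions for illustration:

#include "ggml-backend.h"   // assumed location of ggml_backend_cpu_set_threadpool

// Illustrative helper (not part of this commit): switch the CPU backend to a
// different threadpool. After this change, the previously attached pool (if
// any, and different) is paused inside ggml_backend_cpu_set_threadpool, so the
// caller no longer needs an explicit ggml_pause_threadpool(old_pool) first.
static void switch_cpu_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t new_pool) {
    ggml_backend_cpu_set_threadpool(backend_cpu, new_pool);
}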

ggml/src/ggml.c (-3)

@@ -19198,9 +19198,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
             state->pending = false;
 
             ggml_graph_compute_thread(state);
-            if (state->threadpool->ec != GGML_STATUS_SUCCESS) {
-                break;
-            }
         }
     }

include/llama.h (+2 -9)

@@ -431,16 +431,9 @@ extern "C" {
     // Optional: an auto threadpool gets created in ggml if not passed explicitly
     LLAMA_API void llama_attach_threadpool(
             struct llama_context * ctx,
-            ggml_compute_threadpool_t threadpool);
-    LLAMA_API void llama_attach_batch_threadpool(
-            struct llama_context * ctx,
-            ggml_compute_threadpool_t threadpool);
+            ggml_compute_threadpool_t threadpool,
+            ggml_compute_threadpool_t threadpool_batch);
     LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
-    LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx);
-    LLAMA_API void llama_detach_threadpools(struct llama_context * ctx);
-
-    // Pauses all attached threadpools
-    LLAMA_API void llama_pause_threadpools(struct llama_context * ctx);
 
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
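
The two attach entry points collapse into one, and a single detach now clears both slots. A minimal usage sketch of the new API surface, mirroring what examples/main/main.cpp does above; the pools are assumed to be created elsewhere with this PR's ggml threadpool API, and the helper names are illustrative:

#include "llama.h"

// Illustrative helpers (not part of this commit).

// Attach one pool for single-token decoding and another (typically larger)
// pool for batch/prompt processing. Passing NULL as the batch pool is also
// valid and reuses `tp` for batches.
static void attach_pools(struct llama_context * ctx,
                         ggml_compute_threadpool_t tp,
                         ggml_compute_threadpool_t tp_batch) {
    llama_attach_threadpool(ctx, tp, tp_batch);
}

// A single detach now clears both the regular and the batch threadpool.
static void release_pools(struct llama_context * ctx) {
    llama_detach_threadpool(ctx);
}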

src/llama.cpp (+9 -68)

@@ -15523,39 +15523,6 @@ static void llama_graph_compute(
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
 }
 
-// Optionally swaps the batch and single-tok threadpools.
-// Returns the number of threads, and if a valid threadpool exists, returns it too.
-static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools(
-        llama_context & lctx,
-        int32_t n_tokens) {
-
-    const auto & cparams = lctx.cparams;
-    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-
-    ggml_compute_threadpool_t threadpool = nullptr; // nullptr -> disposable threadpool
-
-    // A batch threadpool without a non-batch threadpool isn't supported.
-    GGML_ASSERT(!lctx.threadpool_batch || lctx.threadpool);
-
-    if (lctx.threadpool_batch && lctx.threadpool) {
-        // Switch between the 2 threadpools as needed
-        if (n_tokens > 1) {
-            ggml_pause_threadpool(lctx.threadpool);
-            threadpool = lctx.threadpool_batch;
-            n_threads = cparams.n_threads_batch;
-        } else {
-            ggml_pause_threadpool(lctx.threadpool_batch);
-            threadpool = lctx.threadpool;
-            n_threads = cparams.n_threads;
-        }
-    } else if (lctx.threadpool) {
-        threadpool = lctx.threadpool;
-        n_threads = cparams.n_threads;
-    }
-    return std::make_pair(n_threads, threadpool);
-}
-
-
 // decode a batch of tokens by evaluating the transformer
 //
 // - lctx: llama context
@@ -15662,11 +15629,8 @@ static int llama_decode_internal(
         lctx.n_outputs = n_outputs_new;
     }
 
-    std::pair<int32_t, ggml_compute_threadpool_t> threads =
-        llama_swap_threadpools(lctx, n_tokens);
-
-    int n_threads = threads.first;
-    ggml_compute_threadpool_t threadpool = threads.second;
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    ggml_compute_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
 
     GGML_ASSERT(n_threads > 0);
 
@@ -15906,11 +15870,9 @@ static int llama_encode_internal(
     lctx.inp_embd_enc = NULL;
     lctx.n_outputs = n_tokens;
 
-    std::pair<int32_t, ggml_compute_threadpool_t> threads =
-        llama_swap_threadpools(lctx, n_tokens);
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    ggml_compute_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
 
-    int n_threads = threads.first;
-    ggml_compute_threadpool_t threadpool = threads.second;
     GGML_ASSERT(n_threads > 0);
 
     ggml_backend_sched_reset(lctx.sched);
@@ -17500,36 +17462,15 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 
 void llama_attach_threadpool(
         struct llama_context * ctx,
-        ggml_compute_threadpool_t threadpool) {
-    ctx->threadpool = threadpool;
-}
-
-void llama_attach_batch_threadpool(
-        struct llama_context * ctx,
+        ggml_compute_threadpool_t threadpool,
         ggml_compute_threadpool_t threadpool_batch) {
-    ctx->threadpool_batch = threadpool_batch;
+    ctx->threadpool = threadpool;
+    ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
 }
 
 void llama_detach_threadpool(struct llama_context * ctx) {
-    ctx->threadpool = nullptr;
-}
-
-void llama_detach_batch_threadpool(struct llama_context * ctx) {
-    ctx->threadpool = nullptr;
-}
-
-void llama_detach_threadpools(struct llama_context * ctx) {
-    llama_detach_threadpool(ctx);
-    llama_detach_batch_threadpool(ctx);
-}
-
-void llama_pause_threadpools(struct llama_context * ctx) {
-    if (ctx->threadpool) {
-        ggml_pause_threadpool(ctx->threadpool);
-    }
-    if (ctx->threadpool_batch) {
-        ggml_pause_threadpool(ctx->threadpool_batch);
-    }
+    ctx->threadpool = nullptr;
+    ctx->threadpool_batch = nullptr;
 }
 
 void llama_backend_free(void) {
