Commit e35e377 (parent 59aa825)

--main-gpu CLI option

9 files changed: +79 -18 lines
examples/common.cpp (+13 -1)

@@ -295,6 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #else
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
+        } else if (arg == "--main-gpu" || arg == "-mg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            params.main_gpu = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
 #endif
         } else if (arg == "--tensor-split" || arg == "-ts") {
             if (++i >= argc) {
@@ -318,7 +328,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 }
             }
 #else
-            fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
@@ -465,6 +475,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " number of layers to store in VRAM\n");
     fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
@@ -512,6 +523,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.n_ctx = params.n_ctx;
     lparams.n_batch = params.n_batch;
     lparams.n_gpu_layers = params.n_gpu_layers;
+    lparams.main_gpu = params.main_gpu;
     memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;

examples/common.h (+7 -6)

@@ -21,13 +21,14 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    int32_t seed          = -1;  // RNG seed
+    int32_t seed          = -1;   // RNG seed
     int32_t n_threads     = get_num_physical_cores();
-    int32_t n_predict     = -1;  // new tokens to predict
-    int32_t n_ctx         = 512; // context size
-    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
+    int32_t n_predict     = -1;   // new tokens to predict
+    int32_t n_ctx         = 512;  // context size
+    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers  = 0;    // number of layers to store in VRAM
+    int32_t main_gpu      = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
 
     // sampling parameters

examples/main/README.md (+2 -0)

@@ -286,5 +286,7 @@ These options provide extra functionality and customization when running the LLa
 - `--verbose-prompt`: Print the prompt before generating text.
 - `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
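
As a concrete illustration of the two options documented above, a hypothetical invocation (model path, layer count, and split values are placeholders, not taken from the commit) could look like:

./main -m models/7B/ggml-model.bin -ngl 40 -mg 1 -ts 3,1

With these settings GPU 1 keeps the scratch buffer for small tensors while large tensors are split roughly 75%/25% between GPU 0 and GPU 1.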

examples/server/README.md (+2 -0)

@@ -287,6 +287,8 @@ Test();
 - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
 - `--port`: Set the port to listen. Default: `8080`.
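
The same two flags are accepted by the server example once this commit is applied; a hypothetical launch (values are placeholders) would be:

./server -m models/7B/ggml-model.bin -ngl 40 -mg 0 -ts 3,1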

examples/server/server.cpp (+15 -0)

@@ -403,6 +403,8 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
     fprintf(stderr, " number of layers to store in VRAM\n");
     fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
 #endif
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
@@ -536,6 +538,19 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
 #else
             fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
+        }
+        else if (arg == "--main-gpu" || arg == "-mg")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            params.main_gpu = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+#endif
         }
         else
         {

ggml-cuda.cu (+30 -7)

@@ -561,15 +561,15 @@ void ggml_init_cublas() {
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
         fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
-        for (int i = 0; i < g_device_count; ++i) {
+        for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
-            CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
-            fprintf(stderr, " %d. %s\n", i+1, prop.name);
-            g_tensor_split[i] = total_vram;
+            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
+            fprintf(stderr, " Device %d: %s\n", id, prop.name);
+            g_tensor_split[id] = total_vram;
             total_vram += prop.totalGlobalMem;
         }
-        for (int i = 0; i < g_device_count; ++i) {
-            g_tensor_split[i] /= total_vram;
+        for (int id = 0; id < g_device_count; ++id) {
+            g_tensor_split[id] /= total_vram;
         }
 
         for (int id = 0; id < g_device_count; ++id) {
@@ -835,7 +835,10 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // ldc == nrows of the matrix that cuBLAS writes into
+    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
 
     CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
     CUBLAS_CHECK(
@@ -1041,6 +1044,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             dst_ddf_i -= (row_low % ne0)*ne1;
         }
 
+        // the main device memory buffer can be on VRAM scratch, with space for all partial results
+        // in that case an offset on dst_ddf_i is needed
+        if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+            dst_ddf_i += i01_low; // offset is 0 if no tensor split
+        }
+
         // copy src0, src1 to device if necessary
         if (use_src1) {
             if (src1->backend == GGML_BACKEND_CPU) {
@@ -1322,6 +1331,20 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
     tensor->extra = extra;
 }
 
+void ggml_cuda_set_main_device(int main_device) {
+    if (main_device > g_device_count) {
+        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
+                main_device, g_device_count, g_main_device);
+        return;
+    }
+    g_main_device = main_device;
+    if (g_device_count > 1) {
+        cudaDeviceProp prop;
+        CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
+        fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
+    }
+}
+
 void ggml_cuda_set_scratch_size(size_t scratch_size) {
     g_scratch_size = scratch_size;
 }
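
For context on the loop above that fills g_tensor_split: the default split is a prefix sum of each device's VRAM divided by the total, so entry id marks the fraction at which device id's share begins. A standalone C++ sketch of that computation, with made-up device sizes (not part of the commit):

#include <cstdio>

int main() {
    // hypothetical VRAM per device, e.g. a 24 GB and an 8 GB card
    const double vram[] = {24e9, 8e9};
    const int n_devices = 2;

    double split[2];
    double total_vram = 0.0;
    for (int id = 0; id < n_devices; ++id) {
        split[id]   = total_vram;  // cumulative VRAM of the devices before this one
        total_vram += vram[id];
    }
    for (int id = 0; id < n_devices; ++id) {
        split[id] /= total_vram;   // normalize to a fraction of the total
        printf("g_tensor_split[%d] = %.3f\n", id, split[id]);
    }
    return 0;
}

This prints 0.000 and 0.750, i.e. device 0's share starts at fraction 0 and device 1's at 0.75, which matches the README's statement that by default the data is split in proportion to VRAM.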

ggml-cuda.h (+1 -0)

@@ -27,6 +27,7 @@ void ggml_cuda_host_free(void * ptr);
 void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

llama.cpp (+8 -4)

@@ -857,6 +857,7 @@ struct llama_context_params llama_context_default_params() {
        /*.n_ctx =*/ 512,
        /*.n_batch =*/ 512,
        /*.gpu_layers =*/ 0,
+       /*.main_gpu =*/ 0,
        /*.tensor_split =*/ {0},
        /*.seed =*/ -1,
        /*.f16_kv =*/ true,
@@ -943,6 +944,7 @@ static void llama_model_load_internal(
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
+        int main_gpu,
         const float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
@@ -1039,6 +1041,7 @@ static void llama_model_load_internal(
 
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1225,6 +1228,7 @@ static bool llama_model_load(
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
+        int main_gpu,
         float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
@@ -1233,8 +1237,8 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, tensor_split, memory_type, use_mmap,
-                                  use_mlock, vocab_only, progress_callback, progress_callback_user_data);
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
         fprintf(stderr, "error loading model: %s\n", err.c_str());
@@ -2400,8 +2404,8 @@ struct llama_context * llama_init_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
-            params.tensor_split, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
-            params.progress_callback, params.progress_callback_user_data)) {
+            params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+            params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
         return nullptr;

llama.h (+1 -0)

@@ -75,6 +75,7 @@ extern "C" {
         int n_ctx; // text context
         int n_batch; // prompt processing batch size
         int n_gpu_layers; // number of layers to store in VRAM
+        int main_gpu; // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
         int seed; // RNG seed, -1 for random
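
A minimal sketch of how a caller could drive the new field through the C API, mirroring `-mg 1 -ts 3,1` on the command line (model path, layer count, and split values are illustrative assumptions, not part of the commit):

#include <cstdio>

#include "llama.h"

int main() {
    llama_context_params lparams = llama_context_default_params();

    lparams.n_gpu_layers    = 40;  // hypothetical number of layers to offload
    lparams.main_gpu        = 1;   // GPU 1 holds the scratch buffer for small tensors
    lparams.tensor_split[0] = 3;   // large tensors: ~75% on GPU 0 ...
    lparams.tensor_split[1] = 1;   // ... and ~25% on GPU 1

    llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", lparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free(ctx);
    return 0;
}

This is the same forwarding that llama_init_from_gpt_params in examples/common.cpp performs for the command-line programs (lparams.main_gpu = params.main_gpu).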
