Distributed inference via MPI #2099

Merged on Jul 10, 2023 (29 commits)

Changes from 8 commits

Commits
f85785f
MPI support, first cut
evanmiller Jul 4, 2023
d05ca74
fix warnings, update README
evanmiller Jul 4, 2023
668ba5f
fixes
evanmiller Jul 4, 2023
042c5b2
wrap includes
evanmiller Jul 4, 2023
32deabf
Merge branch 'master' into mpi
evanmiller Jul 6, 2023
06a2393
PR comments
evanmiller Jul 7, 2023
1f0a2cf
Update CMakeLists.txt
evanmiller Jul 7, 2023
55207ba
Add GH workflow, fix test
evanmiller Jul 7, 2023
ef61acf
Add info to README
evanmiller Jul 7, 2023
3232db6
mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099)
ggerganov Jul 9, 2023
e339d35
mpi : add names for layer inputs + prep ggml_mpi_graph_compute()
ggerganov Jul 9, 2023
01abb3b
mpi : move all MPI logic into ggml-mpi
ggerganov Jul 9, 2023
c717c51
mpi : various fixes - communication now works but results are wrong
ggerganov Jul 9, 2023
ef37dd1
mpi : fix output tensor after MPI compute (still not working)
ggerganov Jul 9, 2023
beadbf3
mpi : fix inference
ggerganov Jul 9, 2023
9da9d26
mpi : minor
ggerganov Jul 9, 2023
0f557c2
Merge branch 'master' into mpi
evanmiller Jul 9, 2023
4a9a474
Add OpenMPI to GH action
evanmiller Jul 9, 2023
03cc12b
[mpi] continue-on-error: true
evanmiller Jul 9, 2023
81c5ddd
Merge branch 'mpi' into refactor-mpi
ggerganov Jul 9, 2023
1c3a15c
Merge pull request #1 from ggerganov/refactor-mpi
evanmiller Jul 9, 2023
166db36
mpi : fix after master merge
ggerganov Jul 9, 2023
f085a57
[mpi] Link MPI C++ libraries to fix OpenMPI
evanmiller Jul 9, 2023
00b8aa1
tests : fix new llama_backend API
ggerganov Jul 9, 2023
666a15a
Merge remote-tracking branch 'refs/remotes/origin/mpi' into mpi
evanmiller Jul 9, 2023
b18e4ad
Merge branch 'mpi' of github.com:evanmiller/llama.cpp into mpi
evanmiller Jul 9, 2023
ada1a2a
[mpi] use MPI_INT32_T
evanmiller Jul 9, 2023
c3c3ef1
mpi : factor out recv / send in functions and reuse
ggerganov Jul 10, 2023
eaef2d0
mpi : extend API to allow usage with outer backends (e.g. Metal)
ggerganov Jul 10, 2023
28 changes: 28 additions & 0 deletions .github/workflows/build.yml
@@ -101,6 +101,34 @@ jobs:
          cd build
          ctest --verbose

  ubuntu-latest-cmake-mpi:
    runs-on: ubuntu-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential mpich

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake -DLLAMA_MPI=ON ..
          cmake --build . --config Release

      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest --verbose

  macOS-latest-make:
    runs-on: macos-latest

19 changes: 19 additions & 0 deletions CMakeLists.txt
@@ -75,6 +75,7 @@ option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -305,6 +306,23 @@ if (LLAMA_METAL)
        )
endif()

if (LLAMA_MPI)
    cmake_minimum_required(VERSION 3.10)
    find_package(MPI)
    if (MPI_C_FOUND)
        message(STATUS "MPI found")
        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
        add_compile_definitions(GGML_USE_MPI)
        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
        set(c_flags ${c_flags} -Wno-cast-qual)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
    else()
        message(WARNING "MPI not found")
    endif()
endif()

if (LLAMA_CLBLAST)
    find_package(CLBlast)
    if (CLBlast_FOUND)
@@ -473,6 +491,7 @@ add_library(ggml OBJECT
            ${GGML_SOURCES_CUDA}
            ${GGML_SOURCES_OPENCL}
            ${GGML_SOURCES_METAL}
            ${GGML_SOURCES_MPI}
            ${GGML_SOURCES_EXTRA}
            )

9 changes: 9 additions & 0 deletions Makefile
@@ -147,6 +147,15 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_MPI
	CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
	CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
	OBJS += ggml-mpi.o

ggml-mpi.o: ggml-mpi.c ggml-mpi.h
	$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI

ifdef LLAMA_OPENBLAS
	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
	LDFLAGS += -lopenblas
29 changes: 29 additions & 0 deletions README.md
@@ -268,6 +268,35 @@ Any value larger than 0 will offload the computation to the GPU. For example:
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
```

### MPI Build

MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.

First, build llama.cpp and download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines. You will need to build llama.cpp with an MPI-capable compiler, for example,

```bash
make CC=mpicc CXX=mpicxx LLAMA_MPI=1
```
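
Alternatively, the `LLAMA_MPI` CMake option added by this PR should work as well — a sketch mirroring the CI workflow above, where CMake locates the system MPI through `find_package(MPI)`:

```bash
mkdir build
cd build
cmake -DLLAMA_MPI=ON ..
cmake --build . --config Release
```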

Once the programs are built and the weights are downloaded on all machines, ensure password-less SSH access to each machine from the primary host.
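
For example, key-based access could be set up roughly as follows (a sketch; the `user` account name is a placeholder, and the hosts are the ones from the example hostfile below):

```bash
# generate a key pair on the primary host, if one does not already exist
ssh-keygen -t ed25519
# copy the public key to every machine in the cluster
ssh-copy-id user@192.168.0.1
ssh-copy-id user@malvolio.local
```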

Next, create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".

Here is an example hostfile:

```
192.168.0.1:2
malvolio.local:1
```

The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.

Finally, you're ready to run a computation using `mpirun`:

```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```
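
The same launch pattern should carry over to the other examples touched by this PR; for instance, a perplexity run over the same cluster might look like this (the evaluation file path is a placeholder):

```bash
mpirun -hostfile hostfile -n 3 ./perplexity -m ./models/7B/ggml-model-q4_0.bin -f /path/to/eval-text.txt
```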

### BLAS Build

Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
2 changes: 2 additions & 0 deletions examples/main/main.cpp
@@ -671,5 +671,7 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);

    llama_finalize_backend();

    return 0;
}
2 changes: 2 additions & 0 deletions examples/perplexity/perplexity.cpp
@@ -172,5 +172,7 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);

    llama_finalize_backend();

    return 0;
}
2 changes: 2 additions & 0 deletions examples/simple/simple.cpp
@@ -173,6 +173,8 @@ int main(int argc, char ** argv)
    llama_free( ctx );
    llama_free_model( model );

    llama_finalize_backend();

    return 0;
}

81 changes: 81 additions & 0 deletions ggml-mpi.c
@@ -0,0 +1,81 @@
#include "ggml-mpi.h"

#include "ggml.h"

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define UNUSED GGML_UNUSED

struct ggml_mpi_tensor_info {
    int rank;
};

// ggml_compute_forward_send

static void ggml_mpi_compute_forward_send(
        struct ggml_tensor * src,
        const struct ggml_tensor * orig) {
    UNUSED(orig);
    GGML_ASSERT(src->type == GGML_TYPE_F32);

    int my_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    int dst_rank = ((struct ggml_mpi_tensor_info *)src->extra)->rank;
    // fprintf(stderr, "(%d) Sending to (%d)\n", my_rank, (int)dst->extra);
    int retval = MPI_Send(src->data, ggml_nelements(src), MPI_FLOAT, dst_rank, 0, MPI_COMM_WORLD);
    // fprintf(stderr, "(%d) Sent to (%d)\n", my_rank, (int)dst->extra);
    GGML_ASSERT(retval == MPI_SUCCESS);
}

// ggml_compute_forward_recv

static void ggml_mpi_compute_forward_recv(
        struct ggml_tensor * dst,
        const struct ggml_tensor * orig,
        const struct ggml_tensor * parent) {
    UNUSED(parent);
    UNUSED(orig);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    MPI_Status status;

    int my_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    int src_rank = ((struct ggml_mpi_tensor_info *)dst->extra)->rank;
    // fprintf(stderr, "(%d) Receiving from (%d)\n", my_rank, src_extra);
    int retval = MPI_Recv(dst->data, ggml_nelements(dst), MPI_FLOAT, src_rank, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
    // fprintf(stderr, "(%d) Received from (%d)\n", my_rank, src_extra);
    GGML_ASSERT(retval == MPI_SUCCESS);
}

struct ggml_tensor * ggml_mpi_send_tensor(
        struct ggml_context * ctx,
        struct ggml_tensor *src,
        int dst_rank) {

    struct ggml_tensor * result = ggml_map_custom1_inplace_f32(ctx, src, ggml_mpi_compute_forward_send);

    // TODO how/when to free this struct?
    struct ggml_mpi_tensor_info *info = calloc(1, sizeof(struct ggml_mpi_tensor_info));
    info->rank = dst_rank;
    result->extra = info;

    return result;
}

struct ggml_tensor * ggml_mpi_recv_tensor(
        struct ggml_context * ctx,
        struct ggml_tensor *parent,
        struct ggml_tensor *dst,
        int src_rank) {
    struct ggml_tensor * result = ggml_map_custom2_inplace_f32(ctx, dst, parent, ggml_mpi_compute_forward_recv);

    // TODO how/when to free this struct?
    struct ggml_mpi_tensor_info *info = calloc(1, sizeof(struct ggml_mpi_tensor_info));
    info->rank = src_rank;
    result->extra = info;

    return result;
}
22 changes: 22 additions & 0 deletions ggml-mpi.h
@@ -0,0 +1,22 @@
#pragma once

struct ggml_context;
struct ggml_tensor;

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_tensor * ggml_mpi_send_tensor(
        struct ggml_context * ctx,
        struct ggml_tensor *src,
        int dst_rank);
struct ggml_tensor * ggml_mpi_recv_tensor(
        struct ggml_context * ctx,
        struct ggml_tensor *parent,
        struct ggml_tensor *dst,
        int src_rank);

#ifdef __cplusplus
}
#endif
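
For readers new to the API above, here is a hypothetical usage sketch (not part of this diff) showing how the two helpers might hand an activation tensor from rank 0 to rank 1 while a graph is being built; the function name `pass_activation` and the 1-D receive buffer are illustrative assumptions only:

```c
#include <mpi.h>
#include "ggml.h"
#include "ggml-mpi.h"

// Wrap `cur` so that evaluating the graph moves its data from rank 0 to rank 1.
// Rank 0 gets a "send" node; rank 1 gets a "recv" node that writes into a fresh
// buffer of the same size and is ordered after `cur` via the `parent` argument.
static struct ggml_tensor * pass_activation(struct ggml_context * ctx, struct ggml_tensor * cur) {
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        // evaluating this node calls MPI_Send on cur->data (see ggml-mpi.c)
        cur = ggml_mpi_send_tensor(ctx, cur, /*dst_rank=*/1);
    } else {
        // the receive buffer must match the number of elements being sent
        struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(cur));
        cur = ggml_mpi_recv_tensor(ctx, /*parent=*/cur, buf, /*src_rank=*/0);
    }

    return cur; // include this node in the graph passed to the usual ggml compute call
}
```

The sketch only illustrates the call pattern; the example programs in this PR do not call these helpers directly (they only gain a `llama_finalize_backend()` call).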