Distributed inference via MPI #2099

Merged on Jul 10, 2023 (29 commits)

Changes from 8 commits

Commits
f85785f
MPI support, first cut
evanmiller Jul 4, 2023
d05ca74
fix warnings, update README
evanmiller Jul 4, 2023
668ba5f
fixes
evanmiller Jul 4, 2023
042c5b2
wrap includes
evanmiller Jul 4, 2023
32deabf
Merge branch 'master' into mpi
evanmiller Jul 6, 2023
06a2393
PR comments
evanmiller Jul 7, 2023
1f0a2cf
Update CMakeLists.txt
evanmiller Jul 7, 2023
55207ba
Add GH workflow, fix test
evanmiller Jul 7, 2023
ef61acf
Add info to README
evanmiller Jul 7, 2023
3232db6
mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099)
ggerganov Jul 9, 2023
e339d35
mpi : add names for layer inputs + prep ggml_mpi_graph_compute()
ggerganov Jul 9, 2023
01abb3b
mpi : move all MPI logic into ggml-mpi
ggerganov Jul 9, 2023
c717c51
mpi : various fixes - communication now works but results are wrong
ggerganov Jul 9, 2023
ef37dd1
mpi : fix output tensor after MPI compute (still not working)
ggerganov Jul 9, 2023
beadbf3
mpi : fix inference
ggerganov Jul 9, 2023
9da9d26
mpi : minor
ggerganov Jul 9, 2023
0f557c2
Merge branch 'master' into mpi
evanmiller Jul 9, 2023
4a9a474
Add OpenMPI to GH action
evanmiller Jul 9, 2023
03cc12b
[mpi] continue-on-error: true
evanmiller Jul 9, 2023
81c5ddd
Merge branch 'mpi' into refactor-mpi
ggerganov Jul 9, 2023
1c3a15c
Merge pull request #1 from ggerganov/refactor-mpi
evanmiller Jul 9, 2023
166db36
mpi : fix after master merge
ggerganov Jul 9, 2023
f085a57
[mpi] Link MPI C++ libraries to fix OpenMPI
evanmiller Jul 9, 2023
00b8aa1
tests : fix new llama_backend API
ggerganov Jul 9, 2023
666a15a
Merge remote-tracking branch 'refs/remotes/origin/mpi' into mpi
evanmiller Jul 9, 2023
b18e4ad
Merge branch 'mpi' of github.com:evanmiller/llama.cpp into mpi
evanmiller Jul 9, 2023
ada1a2a
[mpi] use MPI_INT32_T
evanmiller Jul 9, 2023
c3c3ef1
mpi : factor out recv / send in functions and reuse
ggerganov Jul 10, 2023
eaef2d0
mpi : extend API to allow usage with outer backends (e.g. Metal)
ggerganov Jul 10, 2023
28 changes: 28 additions & 0 deletions .github/workflows/build.yml
@@ -101,6 +101,34 @@ jobs:
          cd build
          ctest --verbose

  ubuntu-latest-cmake-mpi:
    runs-on: ubuntu-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential mpich

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake -DLLAMA_MPI=ON ..
          cmake --build . --config Release

      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest --verbose

  macOS-latest-make:
    runs-on: macos-latest

19 changes: 19 additions & 0 deletions CMakeLists.txt
@@ -75,6 +75,7 @@ option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -305,6 +306,23 @@ if (LLAMA_METAL)
        )
endif()

if (LLAMA_MPI)
    cmake_minimum_required(VERSION 3.10)
    find_package(MPI)
    if (MPI_C_FOUND)
        message(STATUS "MPI found")
        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
        add_compile_definitions(GGML_USE_MPI)
        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
        set(c_flags ${c_flags} -Wno-cast-qual)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
    else()
        message(WARNING "MPI not found")
    endif()
endif()

if (LLAMA_CLBLAST)
    find_package(CLBlast)
    if (CLBlast_FOUND)
@@ -473,6 +491,7 @@ add_library(ggml OBJECT
            ${GGML_SOURCES_CUDA}
            ${GGML_SOURCES_OPENCL}
            ${GGML_SOURCES_METAL}
            ${GGML_SOURCES_MPI}
            ${GGML_SOURCES_EXTRA}
            )

9 changes: 9 additions & 0 deletions Makefile
@@ -147,6 +147,15 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_MPI
	CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
	CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
	OBJS += ggml-mpi.o

ggml-mpi.o: ggml-mpi.c ggml-mpi.h
	$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI

ifdef LLAMA_OPENBLAS
	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
	LDFLAGS += -lopenblas
29 changes: 29 additions & 0 deletions README.md
@@ -268,6 +268,35 @@ Any value larger than 0 will offload the computation to the GPU. For example:
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
```

### MPI Build

MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.

First, build llama.cpp and download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines. You will need to build llama.cpp with an MPI-capable compiler, for example,

```bash
make CC=mpicc CXX=mpicxx LLAMA_MPI=1
```
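
Alternatively, the `LLAMA_MPI` CMake option added by this PR should work as well — a sketch mirroring the CI workflow above, where CMake locates the system MPI through `find_package(MPI)`:

```bash
mkdir build
cd build
cmake -DLLAMA_MPI=ON ..
cmake --build . --config Release
```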

Once the programs are built and the weights are downloaded on all machines, ensure password-less SSH access to each machine from the primary host.
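
For example, key-based access could be set up roughly as follows (a sketch; the `user` account name is a placeholder, and the hosts are the ones from the example hostfile below):

```bash
# generate a key pair on the primary host, if one does not already exist
ssh-keygen -t ed25519
# copy the public key to every machine in the cluster
ssh-copy-id user@192.168.0.1
ssh-copy-id user@malvolio.local
```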

Next, create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".

Here is an example hostfile:

```
192.168.0.1:2
malvolio.local:1
```

The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.

Finally, you're ready to run a computation using `mpirun`:

```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```
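
The same launch pattern should carry over to the other examples touched by this PR; for instance, a perplexity run over the same cluster might look like this (the evaluation file path is a placeholder):

```bash
mpirun -hostfile hostfile -n 3 ./perplexity -m ./models/7B/ggml-model-q4_0.bin -f /path/to/eval-text.txt
```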

### BLAS Build

Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
2 changes: 2 additions & 0 deletions examples/main/main.cpp
@@ -671,5 +671,7 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);

    llama_finalize_backend();

    return 0;
}
2 changes: 2 additions & 0 deletions examples/perplexity/perplexity.cpp
@@ -172,5 +172,7 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);

    llama_finalize_backend();

    return 0;
}
2 changes: 2 additions & 0 deletions examples/simple/simple.cpp
@@ -173,6 +173,8 @@ int main(int argc, char ** argv)
    llama_free( ctx );
    llama_free_model( model );

    llama_finalize_backend();

    return 0;
}

81 changes: 81 additions & 0 deletions ggml-mpi.c
@@ -0,0 +1,81 @@
#include "ggml-mpi.h"

#include "ggml.h"

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define UNUSED GGML_UNUSED

struct ggml_mpi_tensor_info {
    int rank;
};

// ggml_compute_forward_send

static void ggml_mpi_compute_forward_send(
        struct ggml_tensor * src,
        const struct ggml_tensor * orig) {
    UNUSED(orig);
    GGML_ASSERT(src->type == GGML_TYPE_F32);

    int my_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    int dst_rank = ((struct ggml_mpi_tensor_info *)src->extra)->rank;
    // fprintf(stderr, "(%d) Sending to (%d)\n", my_rank, (int)dst->extra);
    int retval = MPI_Send(src->data, ggml_nelements(src), MPI_FLOAT, dst_rank, 0, MPI_COMM_WORLD);
    // fprintf(stderr, "(%d) Sent to (%d)\n", my_rank, (int)dst->extra);
    GGML_ASSERT(retval == MPI_SUCCESS);
}

// ggml_compute_forward_recv

static void ggml_mpi_compute_forward_recv(
        struct ggml_tensor * dst,
        const struct ggml_tensor * orig,
        const struct ggml_tensor * parent) {
    UNUSED(parent);
    UNUSED(orig);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    MPI_Status status;

    int my_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    int src_rank = ((struct ggml_mpi_tensor_info *)dst->extra)->rank;
    // fprintf(stderr, "(%d) Receiving from (%d)\n", my_rank, src_extra);
    int retval = MPI_Recv(dst->data, ggml_nelements(dst), MPI_FLOAT, src_rank, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
    // fprintf(stderr, "(%d) Received from (%d)\n", my_rank, src_extra);
    GGML_ASSERT(retval == MPI_SUCCESS);
}

struct ggml_tensor * ggml_mpi_send_tensor(
        struct ggml_context * ctx,
        struct ggml_tensor *src,
        int dst_rank) {

    struct ggml_tensor * result = ggml_map_custom1_inplace_f32(ctx, src, ggml_mpi_compute_forward_send);

    // TODO how/when to free this struct?
    struct ggml_mpi_tensor_info *info = calloc(1, sizeof(struct ggml_mpi_tensor_info));
    info->rank = dst_rank;
    result->extra = info;

    return result;
}

struct ggml_tensor * ggml_mpi_recv_tensor(
        struct ggml_context * ctx,
        struct ggml_tensor *parent,
        struct ggml_tensor *dst,
        int src_rank) {
    struct ggml_tensor * result = ggml_map_custom2_inplace_f32(ctx, dst, parent, ggml_mpi_compute_forward_recv);

    // TODO how/when to free this struct?
    struct ggml_mpi_tensor_info *info = calloc(1, sizeof(struct ggml_mpi_tensor_info));
    info->rank = src_rank;
    result->extra = info;

    return result;
}
22 changes: 22 additions & 0 deletions ggml-mpi.h
@@ -0,0 +1,22 @@
#pragma once

struct ggml_context;
struct ggml_tensor;

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_tensor * ggml_mpi_send_tensor(
        struct ggml_context * ctx,
        struct ggml_tensor *src,
        int dst_rank);
struct ggml_tensor * ggml_mpi_recv_tensor(
        struct ggml_context * ctx,
        struct ggml_tensor *parent,
        struct ggml_tensor *dst,
        int src_rank);

#ifdef __cplusplus
}
#endif
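
For readers new to the API above, here is a hypothetical usage sketch (not part of this diff) showing how the two helpers might hand an activation tensor from rank 0 to rank 1 while a graph is being built; the function name `pass_activation` and the 1-D receive buffer are illustrative assumptions only:

```c
#include <mpi.h>
#include "ggml.h"
#include "ggml-mpi.h"

// Wrap `cur` so that evaluating the graph moves its data from rank 0 to rank 1.
// Rank 0 gets a "send" node; rank 1 gets a "recv" node that writes into a fresh
// buffer of the same size and is ordered after `cur` via the `parent` argument.
static struct ggml_tensor * pass_activation(struct ggml_context * ctx, struct ggml_tensor * cur) {
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        // evaluating this node calls MPI_Send on cur->data (see ggml-mpi.c)
        cur = ggml_mpi_send_tensor(ctx, cur, /*dst_rank=*/1);
    } else {
        // the receive buffer must match the number of elements being sent
        struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(cur));
        cur = ggml_mpi_recv_tensor(ctx, /*parent=*/cur, buf, /*src_rank=*/0);
    }

    return cur; // include this node in the graph passed to the usual ggml compute call
}
```

The sketch only illustrates the call pattern; the example programs in this PR do not call these helpers directly (they only gain a `llama_finalize_backend()` call).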