
Commit ecb217d

llama : Metal inference (#1642)
* mtl : export the LLaMA computation graph
* ci : disable temporary
* mtl : adapt the MNIST example as starter
* mtl : no need for mtl-export tool, add cli arg for main instead
* mtl : export just a small part of the graph for now to make it easier
* mtl : move MSL code into separate file for easy editing
* mtl : initial get_rows_q4_0 kernel
* mtl : confirmed get_rows_q4_0 is working correctly
* mtl : add rms_norm kernel + confirm working
* mtl : add mul kernel + confirm working
* mtl : initial mul_mat Q4 kernel (wrong results)
* mtl : mul_mat fixes (still wrong)
* mtl : another mul_mat Q4 (still does not work)
* mtl : working mul_mat q4
* ggml : fix handling of "view" ops in ggml_graph_import()
* mtl : add rope kernel
* mtl : add reshape and transpose handling
* ggml : store offset as opt arg for ggml_view_xd() operators
* mtl : add cpy kernel + handle view ops
* mtl : confirm f16 x f32 attention mul mat
* mtl : add scale kernel
* mtl : add diag_mask_inf kernel
* mtl : fix soft_max kernel
* ggml : update ggml_nbytes() to handle non-contiguous tensors
* mtl : verify V tensor contents
* mtl : add f32 -> f32 cpy kernel
* mtl : add silu kernel
* mtl : add non-broadcast mul kernel
* mtl : full GPU inference of the computation graph
* mtl : optimize rms_norm and soft_max kernels
* mtl : add f16 mat x f32 vec multiplication kernel
* mtl : fix bug in f16 x f32 mul mat + speed-up computation
* mtl : faster mul_mat_q4_0_f32 kernel
* mtl : fix kernel signature + roll inner loop
* mtl : more threads for rms_norm + better timing
* mtl : remove printfs from inner loop
* mtl : simplify implementation
* mtl : add save/load vocab to ggml file
* mtl : plug Metal inference into llama.cpp (very quick-n-dirty)
* mtl : make it work with main example

  Lots of hacks but at least now it generates text

* mtl : preparing for merge
* mtl : clean-up ggml mtl interface + suport scratch / inplace
* mtl : remove temp / debug code
* metal : final refactoring and simplification
* Revert "ci : disable temporary"

  This reverts commit 98c267f.

* metal : add comments
* metal : clean-up stuff, fix typos
* readme : add Metal instructions
* readme : add example for main
1 parent dcb2ed4 commit ecb217d

17 files changed, with 1,676 additions and 93 deletions.
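Taken together, the changes below add an optional Metal backend: a build flag in both build systems, an `--export` flag for dumping the computation graph, and GPU offloading in the `main` example. A minimal sketch of the resulting workflow, assuming a quantized 7B model at the path used in the README example below:

```bash
# build with Metal support (Makefile path; the CMake option is shown further down)
LLAMA_METAL=1 make

# run inference with the computation offloaded to the GPU
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
```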

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ build-release/
 build-static/
 build-cublas/
 build-opencl/
+build-metal/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

CMakeLists.txt

Lines changed: 47 additions & 15 deletions
@@ -64,13 +64,14 @@ if (NOT MSVC)
 endif()

 # 3rd party libs
-option(LLAMA_ACCELERATE             "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS                   "llama: use BLAS"                    OFF)
+option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"  ON)
+option(LLAMA_BLAS                   "llama: use BLAS"                     OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS                 "llama: use cuBLAS"                  OFF)
-set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
-set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
-option(LLAMA_CLBLAST                "llama: use CLBlast"                 OFF)
+option(LLAMA_CUBLAS                 "llama: use cuBLAS"                   OFF)
+set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
+set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+option(LLAMA_CLBLAST                "llama: use CLBlast"                  OFF)
+option(LLAMA_METAL                  "llama: use Metal"                    OFF)

 option(LLAMA_BUILD_TESTS            "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
@@ -183,7 +184,7 @@ if (LLAMA_CUBLAS)

     enable_language(CUDA)

-    set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+    set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

     add_compile_definitions(GGML_USE_CUBLAS)
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
@@ -200,12 +201,37 @@ if (LLAMA_CUBLAS)
     endif()
 endif()

+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
+
+    add_compile_definitions(GGML_USE_METAL)
+    add_compile_definitions(GGML_METAL_NDEBUG)
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-metal.metal to bin directory
+    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        ${METALPERFORMANCE_FRAMEWORK}
+        )
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")

-        set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)

         add_compile_definitions(GGML_USE_CLBLAST)

@@ -370,8 +396,10 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
-            ${GGML_CUDA_SOURCES}
-            ${GGML_OPENCL_SOURCES})
+            ${GGML_SOURCES_CUDA}
+            ${GGML_SOURCES_OPENCL}
+            ${GGML_SOURCES_METAL}
+            )

 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
@@ -384,21 +412,25 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
-            llama-util.h)
+            llama-util.h
+            )

 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
-target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
+target_link_libraries(llama PRIVATE
+    ggml
+    ${LLAMA_EXTRA_LIBS}
+    )

 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
 endif()

-if (GGML_CUDA_SOURCES)
+if (GGML_SOURCES_CUDA)
     message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
-    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+    set_property(TARGET ggml  PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET ggml  PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
     set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
 endif()

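For reference, a minimal out-of-tree configure with the new `LLAMA_METAL` option might look as follows; note that `configure_file()` copies `ggml-metal.metal` into `bin/`, so the shader source ends up next to the built binaries (directory names are illustrative):

```bash
mkdir build-metal && cd build-metal
cmake -DLLAMA_METAL=ON ..
cmake --build . --config Release

# the Metal shader source copied by configure_file():
ls bin/ggml-metal.metal
```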

Makefile

Lines changed: 27 additions & 6 deletions
@@ -105,6 +105,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	#CFLAGS   += -mfma -mf16c -mavx
 	#CXXFLAGS += -mfma -mf16c -mavx
 endif
+
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -116,26 +117,30 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
 	endif
 endif
+
 ifndef LLAMA_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework.
 	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
 	ifeq ($(UNAME_S),Darwin)
 		CFLAGS  += -DGGML_USE_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
-endif
+endif # LLAMA_NO_ACCELERATE
+
 ifdef LLAMA_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
 	ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
 		LDFLAGS += -lopenblas -lcblas
 	else
 		LDFLAGS += -lopenblas
 	endif
-endif
+endif # LLAMA_OPENBLAS
+
 ifdef LLAMA_BLIS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
 	LDFLAGS += -lblis -L/usr/local/lib
-endif
+endif # LLAMA_BLIS
+
 ifdef LLAMA_CUBLAS
 	CFLAGS   += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
@@ -156,33 +161,49 @@ endif # LLAMA_CUDA_DMMV_Y
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
+
 ifdef LLAMA_CLBLAST
-	CFLAGS  += -DGGML_USE_CLBLAST
-	CXXFLAGS += -DGGML_USE_CLBLAST
+	CFLAGS   += -DGGML_USE_CLBLAST
+	CXXFLAGS += -DGGML_USE_CLBLAST
 	# Mac provides OpenCL as a framework
 	ifeq ($(UNAME_S),Darwin)
 		LDFLAGS += -lclblast -framework OpenCL
 	else
 		LDFLAGS += -lclblast -lOpenCL
 	endif
 	OBJS += ggml-opencl.o
+
 ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif
+endif # LLAMA_CLBLAST
+
+ifdef LLAMA_METAL
+	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
+	CXXFLAGS += -DGGML_USE_METAL
+	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+	OBJS     += ggml-metal.o
+
+ggml-metal.o: ggml-metal.m ggml-metal.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_METAL
+
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	# Apple M1, M2, etc.
 	# Raspberry Pi 3, 4, Zero 2 (64-bit)
 	CFLAGS   += -mcpu=native
 	CXXFLAGS += -mcpu=native
 endif
+
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# Raspberry Pi 1, Zero
 	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif
+
 ifneq ($(filter armv7%,$(UNAME_M)),)
 	# Raspberry Pi 2
 	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif
+
 ifneq ($(filter armv8%,$(UNAME_M)),)
 	# Raspberry Pi 3, 4, Zero 2 (32-bit)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
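On the Makefile side, Metal is opt-in via the `LLAMA_METAL` variable; when it is unset, none of the Metal flags, frameworks, or objects are added. A sketch of the two build variants:

```bash
# Metal-enabled build: defines GGML_USE_METAL / GGML_METAL_NDEBUG,
# links the Apple frameworks, and compiles ggml-metal.o
LLAMA_METAL=1 make

# default CPU-only build for comparison
make
```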

README.md

Lines changed: 28 additions & 3 deletions
@@ -51,11 +51,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook

 - Plain C/C++ implementation without dependencies
-- Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
+- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
 - 4-bit, 5-bit and 8-bit integer quantization support
-- Runs on the CPU
 - Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
 - cuBLAS and CLBlast support

@@ -236,6 +235,32 @@ In order to build llama.cpp you have three different options.
     zig build -Drelease-fast
     ```

+### Metal Build
+
+Using Metal allows the computation to be executed on the GPU for Apple devices:
+
+- Using `make`:
+
+  ```bash
+  LLAMA_METAL=1 make
+  ```
+
+- Using `CMake`:
+
+    ```bash
+    mkdir build-metal
+    cd build-metal
+    cmake -DLLAMA_METAL=ON ..
+    cmake --build . --config Release
+    ```
+
+When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
+Any value larger than 0 will offload the computation to the GPU. For example:
+
+```bash
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+```
+
 ### BLAS Build

 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
@@ -369,7 +394,7 @@ Building the program with BLAS support may lead to some performance improvements

 Running:

-The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.

 To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
 The selection can be a number (starting from 0) or a text string to search:
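The README context above is cut off just before its example block; purely as an illustration of the selection syntax it describes (platform names and the trailing arguments are placeholders, not taken from this diff):

```bash
# select the OpenCL platform/device by index ...
GGML_OPENCL_PLATFORM=1 ./main ...
GGML_OPENCL_PLATFORM=0 GGML_OPENCL_DEVICE=0 ./main ...

# ... or by a substring of the platform name
GGML_OPENCL_PLATFORM=Intel ./main ...
```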

examples/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
@@ -37,7 +37,10 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
-    if(LLAMA_BUILD_SERVER)
+    if (LLAMA_METAL)
+        add_subdirectory(metal)
+    endif()
+    if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
 endif()

examples/common.cpp

Lines changed: 3 additions & 0 deletions
@@ -299,6 +299,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
+        } else if (arg == "--export") {
+            params.export_cgraph = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -438,6 +440,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "                        number of layers to store in VRAM\n");
 #endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
+    fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
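The new flag is consumed in the `main` example below: when `--export` is given, the graph is written to `llama.ggml` and the program exits. A sketch of the intended invocation (the model path is illustrative):

```bash
# export the computation graph to 'llama.ggml' and exit
./main -m ./models/7B/ggml-model-q4_0.bin --export
```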

examples/common.h

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ struct gpt_params {
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
     bool mem_test          = false; // compute maximum memory usage
+    bool export_cgraph     = false; // export the computation graph
     bool verbose_prompt    = false; // print prompt tokens before generation
 };


examples/main/main.cpp

Lines changed: 7 additions & 0 deletions
@@ -134,6 +134,13 @@ int main(int argc, char ** argv) {
         return 0;
     }

+    // export the cgraph and exit
+    if (params.export_cgraph) {
+        llama_eval_export(ctx, "llama.ggml");
+        llama_free(ctx);
+
+        return 0;
+    }

     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;

examples/metal/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+set(TEST_TARGET metal)
+add_executable(${TEST_TARGET} metal.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
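The new `metal` example target links only against `ggml`; `metal.cpp` itself is not part of the diffs shown here, so its command-line interface is an assumption. Given the `--export` flag above, it presumably consumes an exported graph, e.g.:

```bash
# hypothetical invocation of the standalone Metal example on an exported graph
./bin/metal llama.ggml
```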
