Skip to content

Commit fee7da1

Browse files
JohnJohannesGaessler
John
authored and committed
Work in progress.
Added falcon main and library based on llama.cpp CPU inference works (getting ~260ms/token on 7B 16 bit falcon) Tested with 7B 16 bit and the two Shakespeare models (both in 16 bit precision only) TODO/WIP: 1) quantization runs, creates a ggjt 3 file but something is wrong with the quantized model binary - even quantization from 16 -> 16 also fails, something is wrong in the tensors produced 2) mmap should work with quantized binaries once 1) is solved 3) CUDA support is mostly there, it's currently disabled (all CPU backend) 4) memory/context calculations are off, GPU memory calculations are wrong as well 5) the python conversion script is pre GGML 1 version (tokens without scores) 6) some stuff is still called "llama", some of it should be renamed to a generic name as it works for both 7) the GGML produced by the current python uses an old ftype method Makefiles: cmake on windows with build tools works the makefile for linux/msys was blindly adjusted but not tested yet - possibly missed something Changes to the codebase: * repeat2 has been added to ggml (jploski - ggml-org/ggml#231) including the backward variant (untested, probably fails) * minor changes to work with falcon (name length) * libfalcon is the previous "llama.cpp" and falcon_main is the previous main.cpp
1 parent b241649 commit fee7da1

17 files changed

+6570
-9
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,6 @@ qnt-*.txt
5656
perf-*.txt
5757

5858
examples/jeopardy/results.txt
59+
demo_falcon_orig.cpp
60+
.github/workflows/build.yml
61+
.github/workflows/build.yml

CMakeLists.txt

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
2-
project("llama.cpp" C CXX)
3-
2+
# If the CUDA toolkit is not found when using the MSVC compiler, switch to the Community Edition (same compiler, just a different kit)
3+
project("ggllm.cpp" C CXX)
4+
# add_definitions(-DGGML_PERF=1)
5+
include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/include")
6+
include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/lib/x64")
47
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
58

69
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
@@ -20,7 +23,7 @@ else()
2023
endif()
2124

2225
if (EMSCRIPTEN)
23-
set(BUILD_SHARED_LIBS_DEFAULT OFF)
26+
set(BUILD_SHARED_LIBS_DEFAULT OFF)
2427

2528
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
2629
else()
@@ -67,7 +70,7 @@ endif()
6770
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
6871
option(LLAMA_BLAS "llama: use BLAS" OFF)
6972
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
70-
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
73+
option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
7174
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
7275
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
7376
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
@@ -225,13 +228,14 @@ if (LLAMA_BLAS)
225228
endif()
226229

227230
if (LLAMA_CUBLAS)
228-
cmake_minimum_required(VERSION 3.17)
231+
cmake_minimum_required(VERSION 3.17)
229232

230233
find_package(CUDAToolkit)
231234
if (CUDAToolkit_FOUND)
232235
message(STATUS "cuBLAS found")
233236

234237
enable_language(CUDA)
238+
message(STATUS "CUDA found, version: ${CUDAToolkit_VERSION}")
235239

236240
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
237241

@@ -480,19 +484,44 @@ target_link_libraries(llama PRIVATE
480484
${LLAMA_EXTRA_LIBS}
481485
)
482486

487+
# falcon
488+
add_library(libfalcon
489+
libfalcon.cpp
490+
libfalcon.h
491+
llama-util.h
492+
)
493+
target_include_directories(libfalcon PUBLIC .)
494+
target_compile_features(libfalcon PUBLIC cxx_std_11) # don't bump
495+
target_link_libraries(libfalcon PRIVATE
496+
ggml
497+
${LLAMA_EXTRA_LIBS}
498+
)
499+
#
500+
483501
if (BUILD_SHARED_LIBS)
484502
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
485503
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
486504
if (LLAMA_METAL)
487505
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
488506
endif()
507+
508+
# falcon
509+
set_target_properties(libfalcon PROPERTIES POSITION_INDEPENDENT_CODE ON)
510+
target_compile_definitions(libfalcon PRIVATE LLAMA_SHARED LLAMA_BUILD)
511+
if (LLAMA_METAL)
512+
set_target_properties(libfalcon PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
513+
endif()
514+
#
489515
endif()
490516

491517
if (GGML_SOURCES_CUDA)
492518
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
493519
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
494520
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
495521
set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
522+
# falcon
523+
set_property(TARGET libfalcon PROPERTY CUDA_ARCHITECTURES OFF)
524+
496525
endif()
497526

498527

Makefile

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,9 +255,15 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
255255
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
256256
$(CXX) $(CXXFLAGS) -c $< -o $@
257257

258+
libfalcon.o: libfalcon.cpp ggml.h ggml-cuda.h libfalcon.h llama-util.h
259+
$(CXX) $(CXXFLAGS) -c $< -o $@
260+
258261
common.o: examples/common.cpp examples/common.h
259262
$(CXX) $(CXXFLAGS) -c $< -o $@
260263

264+
falcon_common.o: examples/falcon_common.cpp examples/falcon_common.h
265+
$(CXX) $(CXXFLAGS) -c $< -o $@
266+
261267
libllama.so: llama.o ggml.o $(OBJS)
262268
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
263269

@@ -280,6 +286,9 @@ simple: examples/simple/simple.cpp build-info.h ggml.
280286
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
281287
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
282288

289+
falcon_quantize: examples/falcon_quantize/quantize.cpp build-info.h ggml.o libfalcon.o $(OBJS)
290+
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
291+
283292
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
284293
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
285294

@@ -306,6 +315,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
306315
rm $@.tmp; \
307316
fi
308317

318+
falcon_main: examples/falcon/falcon_main.cpp build-info.h ggml.o libfalcon.o falcon_common.o $(OBJS)
319+
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
309320
#
310321
# Tests
311322
#

examples/CMakeLists.txt

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ find_package(Threads REQUIRED)
66

77
# ...
88

9-
# common
9+
# common
1010

1111
set(TARGET common)
1212

@@ -23,13 +23,33 @@ target_include_directories(${TARGET} PUBLIC .)
2323
target_compile_features(${TARGET} PUBLIC cxx_std_11)
2424
target_link_libraries(${TARGET} PRIVATE llama)
2525

26+
27+
# falcon_common
28+
29+
set(FALCON_TARGET falcon_common)
30+
31+
add_library(${FALCON_TARGET} OBJECT
32+
falcon_common.h
33+
falcon_common.cpp
34+
)
35+
36+
if (BUILD_SHARED_LIBS)
37+
set_target_properties(${FALCON_TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
38+
endif()
39+
40+
target_include_directories(${FALCON_TARGET} PUBLIC .)
41+
target_compile_features(${FALCON_TARGET} PUBLIC cxx_std_11)
42+
target_link_libraries(${FALCON_TARGET} PRIVATE libfalcon)
43+
2644
# examples
2745

2846
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
2947

3048
if (EMSCRIPTEN)
3149
else()
3250
add_subdirectory(main)
51+
add_subdirectory(falcon)
52+
add_subdirectory(falcon_quantize)
3353
add_subdirectory(quantize)
3454
add_subdirectory(quantize-stats)
3555
add_subdirectory(perplexity)

examples/falcon/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
set(TARGET falcon_main)
2+
add_executable(${TARGET} falcon_main.cpp)
3+
target_link_libraries(${TARGET} PRIVATE falcon_common libfalcon ${CMAKE_THREAD_LIBS_INIT})
4+
target_compile_features(${TARGET} PRIVATE cxx_std_11)
5+
if(TARGET BUILD_INFO)
6+
add_dependencies(${TARGET} BUILD_INFO)
7+
endif()
8+

0 commit comments

Comments
 (0)