Skip to content

Commit f69ab89

Browse files
committed
Merge remote-tracking branch 'origin/master' into sl/micro-batching
2 parents cad4652 + c9b316c commit f69ab89

37 files changed

+2456
-839
lines changed

.devops/main-intel.Dockerfile

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
2+
ARG UBUNTU_VERSION=22.04
3+
4+
FROM intel/hpckit:$ONEAPI_VERSION as build
5+
6+
RUN apt-get update && \
7+
apt-get install -y git
8+
9+
WORKDIR /app
10+
11+
COPY . .
12+
13+
# for some reason, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" gives worse performance
14+
RUN mkdir build && \
15+
cd build && \
16+
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
17+
cmake --build . --config Release --target main server
18+
19+
FROM ubuntu:$UBUNTU_VERSION as runtime
20+
21+
COPY --from=build /app/build/bin/main /main
22+
COPY --from=build /app/build/bin/server /server
23+
24+
ENV LC_ALL=C.utf8
25+
26+
ENTRYPOINT [ "/main" ]

.devops/nix/nixpkgs-instances.nix

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,18 @@
77
{ system, ... }:
88
{
99
_module.args = {
10+
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
11+
# again, the below creates several nixpkgs instances which the
12+
# flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
13+
#
14+
# This is currently "slow" and "expensive", on a certain scale.
15+
# This also isn't "right" in that this hinders dependency injection at
16+
# the level of flake inputs. This might get removed in the foreseeable
17+
# future.
18+
#
19+
# Note that you can use these expressions without Nix
20+
# (`pkgs.callPackage ./.devops/nix/scope.nix { }` is the entry point).
21+
1022
pkgsCuda = import inputs.nixpkgs {
1123
inherit system;
1224
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,

.devops/nix/package.nix

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ let
7373
ps: [
7474
ps.numpy
7575
ps.sentencepiece
76+
ps.tiktoken
7677
ps.torchWithoutCuda
7778
ps.transformers
7879
]
@@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation (
114115
pname = "llama-cpp${pnameSuffix}";
115116
version = llamaVersion;
116117

118+
# Note: none of the files discarded here are visible in the sandbox or
119+
# affect the output hash. This also means they can be modified without
120+
# triggering a rebuild.
117121
src = lib.cleanSourceWith {
118122
filter =
119123
name: type:
120-
!(builtins.any (_: _) [
124+
let
125+
noneOf = builtins.all (x: !x);
126+
baseName = baseNameOf name;
127+
in
128+
noneOf [
121129
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
122-
(name == "README.md") # Ignore *.md changes whe computing outPaths
123-
(lib.hasPrefix "." name) # Skip hidden files and directories
124-
]);
130+
(lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
131+
(lib.hasPrefix "." baseName) # Skip hidden files and directories
132+
(baseName == "flake.lock")
133+
];
125134
src = lib.cleanSource ../../.;
126135
};
127136

@@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation (
159168

160169
cmakeFlags =
161170
[
162-
(cmakeBool "LLAMA_NATIVE" true)
171+
(cmakeBool "LLAMA_NATIVE" false)
163172
(cmakeBool "LLAMA_BUILD_SERVER" true)
164173
(cmakeBool "BUILD_SHARED_LIBS" true)
165174
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
@@ -216,6 +225,9 @@ effectiveStdenv.mkDerivation (
216225
description = "contains numpy and sentencepiece";
217226
buildInputs = [ llama-python ];
218227
inputsFrom = [ finalAttrs.finalPackage ];
228+
shellHook = ''
229+
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
230+
'';
219231
};
220232

221233
shell-extra = mkShell {

.devops/nix/scope.nix

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
llamaVersion ? "0.0.0",
55
}:
66

7+
# We're using `makeScope` instead of just writing out an attrset
8+
# because it allows users to apply overlays later using `overrideScope'`.
9+
# Cf. https://noogle.dev/f/lib/makeScope
10+
711
lib.makeScope newScope (
812
self: {
913
inherit llamaVersion;

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,7 @@ jobs:
295295
OPENBLAS_VERSION: 0.3.23
296296
OPENCL_VERSION: 2023.04.17
297297
CLBLAST_VERSION: 1.6.0
298-
SDE_VERSION: 9.21.1-2023-04-24
298+
SDE_VERSION: 9.33.0-2024-01-07
299299

300300
strategy:
301301
matrix:
@@ -400,7 +400,7 @@ jobs:
400400
id: cmake_test_sde
401401
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
402402
run: |
403-
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
403+
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
404404
# for some weird reason windows tar doesn't like sde tar.xz
405405
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
406406
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar

.github/workflows/docker.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ jobs:
3535
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
3636
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
3737
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
38+
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
3839
steps:
3940
- name: Check out the repo
4041
uses: actions/checkout@v3

.github/workflows/nix-ci-aarch64.yml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,20 @@ name: Nix aarch64 builds
22

33
on:
44
workflow_dispatch: # allows manual triggering
5+
schedule:
6+
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
7+
# 1.5h instead of minutes with the cold cache).
8+
#
9+
# randint(0, 59), randint(0, 23)
10+
- cron: '26 12 * * *'
11+
# But also rebuild if we touched any of the Nix expressions:
512
push:
613
branches:
714
- master
8-
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
15+
paths: ['**/*.nix', 'flake.lock']
916
pull_request:
1017
types: [opened, synchronize, reopened]
11-
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
18+
paths: ['**/*.nix', 'flake.lock']
1219

1320
jobs:
1421
nix-build-aarch64:

.github/workflows/nix-ci.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@ on:
55
push:
66
branches:
77
- master
8-
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
98
pull_request:
109
types: [opened, synchronize, reopened]
11-
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
1210

1311
jobs:
1412
nix-eval:

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STA
108108
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
109109
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
110110

111+
112+
# add perf arguments
113+
option(LLAMA_PERF "llama: enable perf" OFF)
114+
if (LLAMA_PERF)
115+
add_definitions(-DGGML_PERF)
116+
endif()
117+
111118
# Required for relocatable CMake package
112119
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
113120

@@ -471,6 +478,11 @@ function(get_flags CCID CCVER)
471478
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
472479
set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
473480
endif()
481+
elseif (CCID MATCHES "Intel")
482+
# enable max optimization level when using Intel compiler
483+
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
484+
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
485+
add_link_options(-fuse-ld=lld -static-intel)
474486
endif()
475487

476488
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)

common/common.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
203203
params.prompt_cache_all = true;
204204
} else if (arg == "--prompt-cache-ro") {
205205
params.prompt_cache_ro = true;
206+
} else if (arg == "-bf" || arg == "--binary-file") {
207+
if (++i >= argc) {
208+
invalid_param = true;
209+
break;
210+
}
211+
std::ifstream file(argv[i], std::ios::binary);
212+
if (!file) {
213+
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
214+
invalid_param = true;
215+
break;
216+
}
217+
// store the external file name in params
218+
params.prompt_file = argv[i];
219+
std::ostringstream ss;
220+
ss << file.rdbuf();
221+
params.prompt = ss.str();
222+
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
206223
} else if (arg == "-f" || arg == "--file") {
207224
if (++i >= argc) {
208225
invalid_param = true;
@@ -659,6 +676,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
659676
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
660677
params.logdir += DIRECTORY_SEPARATOR;
661678
}
679+
} else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
680+
if (++i >= argc) {
681+
invalid_param = true;
682+
break;
683+
}
684+
params.logits_file = argv[i];
662685
} else if (arg == "--perplexity" || arg == "--all-logits") {
663686
params.logits_all = true;
664687
} else if (arg == "--ppl-stride") {
@@ -695,6 +718,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
695718
break;
696719
}
697720
params.winogrande_tasks = std::stoi(argv[i]);
721+
} else if (arg == "--multiple-choice") {
722+
params.multiple_choice = true;
723+
} else if (arg == "--multiple-choice-tasks") {
724+
if (++i >= argc) {
725+
invalid_param = true;
726+
break;
727+
}
728+
params.multiple_choice_tasks = std::stoi(argv[i]);
729+
} else if (arg == "--kl-divergence") {
730+
params.kl_divergence = true;
698731
} else if (arg == "--ignore-eos") {
699732
params.ignore_eos = true;
700733
} else if (arg == "--no-penalize-nl") {
@@ -894,6 +927,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
894927
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
895928
printf(" -f FNAME, --file FNAME\n");
896929
printf(" prompt file to start generation.\n");
930+
printf(" -bf FNAME, --binary-file FNAME\n");
931+
printf(" binary file containing multiple choice tasks.\n");
897932
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
898933
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
899934
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -944,6 +979,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
944979
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
945980
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
946981
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
982+
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
983+
printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.multiple_choice_tasks);
984+
printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base\n");
947985
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
948986
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
949987
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);

common/common.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ struct gpt_params {
9292
std::string input_suffix = ""; // string to suffix user inputs with
9393
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
9494
std::string logdir = ""; // directory in which to save YAML log files
95+
std::string logits_file = ""; // file for saving *all* logits
9596

9697
std::vector<llama_model_kv_override> kv_overrides;
9798

@@ -109,6 +110,11 @@ struct gpt_params {
109110
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
110111
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
111112

113+
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
114+
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
115+
116+
bool kl_divergence = false; // compute KL-divergence
117+
112118
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
113119
bool random_prompt = false; // do not randomize prompt if none provided
114120
bool use_color = false; // use color to distinguish generations and inputs

0 commit comments

Comments
 (0)