diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix new file mode 100644 index 0000000000000..d9b6a1e000628 --- /dev/null +++ b/.devops/nix/apps.nix @@ -0,0 +1,14 @@ +{ package, binaries }: + +let + default = builtins.elemAt binaries 0; + mkApp = name: { + ${name} = { + type = "app"; + program = "${package}/bin/${name}"; + }; + }; + result = builtins.foldl' (acc: name: (mkApp name) // acc) { } binaries; +in + +result // { default = result.${default}; } diff --git a/.devops/nix/devshells.nix b/.devops/nix/devshells.nix new file mode 100644 index 0000000000000..afaaa2644059b --- /dev/null +++ b/.devops/nix/devshells.nix @@ -0,0 +1,8 @@ +{ concatMapAttrs, packages }: + +concatMapAttrs + (name: package: { + ${name} = package.passthru.shell; + ${name + "-extra"} = package.passthru.shell-extra; + }) + packages diff --git a/.devops/nix/overlay.nix b/.devops/nix/overlay.nix new file mode 100644 index 0000000000000..c7baec8434fa4 --- /dev/null +++ b/.devops/nix/overlay.nix @@ -0,0 +1,5 @@ +final: prev: + +{ + llama-cpp = final.callPackage ./package.nix { }; +} diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix new file mode 100644 index 0000000000000..c6d03b4a480e7 --- /dev/null +++ b/.devops/nix/package.nix @@ -0,0 +1,224 @@ +{ + lib, + config, + stdenv, + mkShell, + cmake, + ninja, + pkg-config, + git, + python3, + mpi, + openblas, # TODO: Use the generic `blas` so users could switch betwen alternative implementations + cudaPackages, + darwin, + rocmPackages, + clblast, + useBlas ? builtins.all (x: !x) [ + useCuda + useMetalKit + useOpenCL + useRocm + ], + useCuda ? config.cudaSupport, + useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL, + useOpenCL ? false, + useRocm ? config.rocmSupport, +}@inputs: + +let + inherit (lib) + cmakeBool + cmakeFeature + optionals + versionOlder + ; + + # It's necessary to consistently use backendStdenv when building with CUDA support, + # otherwise we get libstdc++ errors downstream. + stdenv = throw "Use effectiveStdenv instead"; + effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv; + + # Give a little description difference between the flavors. + descriptionSuffix = + if useOpenCL then + " (OpenCL accelerated)" + else if useCuda then + " (CUDA accelerated)" + else if useRocm then + " (ROCm accelerated)" + else if useMetalKit then + " (MetalKit accelerated)" + else + ""; + + # TODO: package the Python in this repository in a Nix-like way. + # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo + # is PEP 517-compatible, and ensure the correct .dist-info is generated. 
+ # https://peps.python.org/pep-0517/ + llama-python = python3.withPackages ( + ps: [ + ps.numpy + ps.sentencepiece + ] + ); + + # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime + llama-python-extra = python3.withPackages ( + ps: [ + ps.numpy + ps.sentencepiece + ps.torchWithoutCuda + ps.transformers + ] + ); + + # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64 + # separately + darwinBuildInputs = + with darwin.apple_sdk.frameworks; + [ + Accelerate + CoreVideo + CoreGraphics + ] + ++ optionals useMetalKit [ MetalKit ]; + + cudaBuildInputs = with cudaPackages; [ + cuda_cccl.dev # + cuda_cudart + libcublas + ]; + + rocmBuildInputs = with rocmPackages; [ + clr + hipblas + rocblas + ]; +in + +effectiveStdenv.mkDerivation ( + finalAttrs: { + name = "llama.cpp"; + src = ../../.; + + postPatch = '' + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" + + # TODO: Package up each Python script or service appropriately. + # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`, + # we could make those *.py into setuptools' entrypoints + substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python" + ''; + + nativeBuildInputs = [ + cmake + ninja + pkg-config + git + ] ++ optionals useCuda [ cudaPackages.cuda_nvcc ]; + + buildInputs = + [ mpi ] + ++ optionals useOpenCL [ clblast ] + ++ optionals useCuda cudaBuildInputs + ++ optionals useRocm rocmBuildInputs + ++ optionals effectiveStdenv.isDarwin darwinBuildInputs; + + cmakeFlags = + [ + (cmakeBool "LLAMA_NATIVE" true) + (cmakeBool "LLAMA_BUILD_SERVER" true) + (cmakeBool "BUILD_SHARED_LIBS" true) + (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) + (cmakeBool "LLAMA_METAL" useMetalKit) + (cmakeBool "LLAMA_BLAS" useBlas) + ] + ++ optionals useOpenCL [ (cmakeBool "LLAMA_CLBLAST" true) ] + ++ optionals useCuda [ (cmakeBool "LLAMA_CUBLAS" true) ] + ++ optionals useRocm [ + (cmakeBool "LLAMA_HIPBLAS" true) + (cmakeFeature "CMAKE_C_COMPILER" "hipcc") + (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc") + + # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM + # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt + # and select the line that matches the current nixpkgs version of rocBLAS. + # Should likely use `rocmPackages.clr.gpuTargets`. + "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" + ] + ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ] + ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ]; + + # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, + # if they haven't been added yet. + postInstall = '' + mv $out/bin/main $out/bin/llama + mv $out/bin/server $out/bin/llama-server + mkdir -p $out/include + cp $src/llama.h $out/include/ + ''; + + # Define the shells here, but don't add in the inputsFrom to avoid recursion. 
+ passthru = { + inherit + useBlas + useCuda + useMetalKit + useOpenCL + useRocm + ; + + shell = mkShell { + name = "default${descriptionSuffix}"; + description = "contains numpy and sentencepiece"; + buildInputs = [ llama-python ]; + inputsFrom = [ finalAttrs.finalPackage ]; + }; + + shell-extra = mkShell { + name = "extra${descriptionSuffix}"; + description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers"; + buildInputs = [ llama-python-extra ]; + inputsFrom = [ finalAttrs.finalPackage ]; + }; + }; + + meta = { + # Configurations we don't want even the CI to evaluate. Results in the + # "unsupported platform" messages. This is mostly a no-op, because + # cudaPackages would've refused to evaluate anyway. + badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin; + + # Configurations that are known to result in build failures. Can be + # overridden by importing Nixpkgs with `allowBroken = true`. + broken = (useMetalKit && !effectiveStdenv.isDarwin); + + description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; + homepage = "https://github.com/ggerganov/llama.cpp/"; + license = lib.licenses.mit; + + # Accommodates `nix run` and `lib.getExe` + mainProgram = "llama"; + + # These people might respond, on the best effort basis, if you ping them + # in case of Nix-specific regressions or for reviewing Nix-specific PRs. + # Consider adding yourself to this list if you want to ensure this flake + # stays maintained and you're willing to invest your time. Do not add + # other people without their consent. Consider removing people after + # they've been unreachable for long periods of time. + + # Note that lib.maintainers is defined in Nixpkgs, but you may just add + # an attrset following the same format as in + # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix + maintainers = with lib.maintainers; [ + philiptaron + SomeoneSerge + ]; + + # Extend `badPlatforms` instead + platforms = lib.platforms.all; + }; + } +) diff --git a/flake.lock b/flake.lock index 0455f65617a2d..fdcd6d411b324 100644 --- a/flake.lock +++ b/flake.lock @@ -1,30 +1,12 @@ { "nodes": { - "flake-utils": { - "inputs": { - "systems": "systems" - }, - "locked": { - "lastModified": 1694529238, - "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", - "owner": "numtide", - "repo": "flake-utils", - "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", - "type": "github" - }, - "original": { - "owner": "numtide", - "repo": "flake-utils", - "type": "github" - } - }, "nixpkgs": { "locked": { - "lastModified": 1698318101, - "narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=", + "lastModified": 1703013332, + "narHash": "sha256-+tFNwMvlXLbJZXiMHqYq77z/RfmpfpiI3yjL6o/Zo9M=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "63678e9f3d3afecfeafa0acead6239cdb447574c", + "rev": "54aac082a4d9bb5bbc5c4e899603abfb76a3f6d6", "type": "github" }, "original": { @@ -36,24 +18,8 @@ }, "root": { "inputs": { - "flake-utils": "flake-utils", "nixpkgs": "nixpkgs" } - }, - "systems": { - "locked": { - "lastModified": 1681028828, - "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", - "owner": "nix-systems", - "repo": "default", - "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", - "type": "github" - }, - "original": { - "owner": "nix-systems", - "repo": "default", - "type": "github" - } } }, "root": "root", diff --git a/flake.nix b/flake.nix index 4cf28d5c11c0f..dcf8e1d9defa0 
100644 --- a/flake.nix +++ b/flake.nix @@ -1,139 +1,93 @@ { inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - flake-utils.url = "github:numtide/flake-utils"; }; - outputs = { self, nixpkgs, flake-utils }: - flake-utils.lib.eachDefaultSystem (system: - let - name = "llama.cpp"; - src = ./.; - meta.mainProgram = "llama"; - inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin; - buildInputs = with pkgs; [ openmpi ]; - osSpecific = with pkgs; buildInputs ++ ( - if isAarch64 && isDarwin then - with pkgs.darwin.apple_sdk_11_0.frameworks; [ - Accelerate - MetalKit - ] - else if isAarch32 && isDarwin then - with pkgs.darwin.apple_sdk.frameworks; [ - Accelerate - CoreGraphics - CoreVideo - ] - else if isDarwin then - with pkgs.darwin.apple_sdk.frameworks; [ - Accelerate - CoreGraphics - CoreVideo - ] - else - with pkgs; [ openblas ] - ); - pkgs = import nixpkgs { inherit system; }; - nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ]; - cudatoolkit_joined = with pkgs; symlinkJoin { - # HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit - # see https://github.com/NixOS/nixpkgs/issues/224291 - # copied from jaxlib - name = "${cudaPackages.cudatoolkit.name}-merged"; - paths = [ - cudaPackages.cudatoolkit.lib - cudaPackages.cudatoolkit.out - ] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [ - # for some reason some of the required libs are in the targets/x86_64-linux - # directory; not sure why but this works around it - "${cudaPackages.cudatoolkit}/targets/${system}" + + outputs = + { self, nixpkgs }: + + let + systems = [ + "aarch64-darwin" + "aarch64-linux" + "x86_64-darwin" # x86_64-darwin isn't tested (and likely isn't relevant) + "x86_64-linux" + ]; + eachSystem = f: nixpkgs.lib.genAttrs systems (system: f system); + in + + { + # These define the various ways to build the llama.cpp project. + # Integrate them into your flake.nix configuration by adding this overlay to nixpkgs.overlays. + overlays.default = import ./.devops/nix/overlay.nix; + + # These use the package definition from `./.devops/nix/package.nix`. + # There's one per backend that llama-cpp uses. Add more as needed! + packages = eachSystem ( + system: + let + defaultConfig = { + inherit system; + overlays = [ self.overlays.default ]; + }; + pkgs = import nixpkgs defaultConfig; + + # Let's not make a big deal about getting the CUDA bits. + cudaConfig = defaultConfig // { + config.cudaSupport = true; + config.allowUnfreePredicate = + p: + builtins.all + ( + license: + license.free + || builtins.elem license.shortName [ + "CUDA EULA" + "cuDNN EULA" + ] + ) + (p.meta.licenses or [ p.meta.license ]); + }; + pkgsCuda = import nixpkgs cudaConfig; + + # Let's make sure to turn on ROCm support across the whole package ecosystem. + rocmConfig = defaultConfig // { + config.rocmSupport = true; + }; + pkgsRocm = import nixpkgs rocmConfig; + in + { + default = pkgs.llama-cpp; + opencl = pkgs.llama-cpp.override { useOpenCL = true; }; + cuda = pkgsCuda.llama-cpp; + rocm = pkgsRocm.llama-cpp; + } + ); + + # These use the definition of llama-cpp from `./.devops/nix/package.nix` + # and expose various binaries as apps with `nix run .#app-name`. + # Note that none of these apps use anything other than the default backend. 
+ apps = eachSystem ( + system: + import ./.devops/nix/apps.nix { + package = self.packages.${system}.default; + binaries = [ + "llama" + "llama-embedding" + "llama-server" + "quantize" + "train-text-from-scratch" ]; - }; - llama-python = - pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); - # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime - llama-python-extra = - pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]); - postPatch = '' - substituteInPlace ./ggml-metal.m \ - --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python' - ''; - postInstall = '' - mv $out/bin/main $out/bin/llama - mv $out/bin/server $out/bin/llama-server - mkdir -p $out/include - cp ${src}/llama.h $out/include/ - ''; - cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]; - in - { - packages.default = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = osSpecific; - cmakeFlags = cmakeFlags - ++ (if isAarch64 && isDarwin then [ - "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" - "-DLLAMA_METAL=ON" - ] else [ - "-DLLAMA_BLAS=ON" - "-DLLAMA_BLAS_VENDOR=OpenBLAS" - ]); - }; - packages.opencl = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs; buildInputs ++ [ clblast ]; - cmakeFlags = cmakeFlags ++ [ - "-DLLAMA_CLBLAST=ON" - ]; - }; - packages.cuda = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ]; - cmakeFlags = cmakeFlags ++ [ - "-DLLAMA_CUBLAS=ON" - ]; - }; - packages.rocm = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ]; - cmakeFlags = cmakeFlags ++ [ - "-DLLAMA_HIPBLAS=1" - "-DCMAKE_C_COMPILER=hipcc" - "-DCMAKE_CXX_COMPILER=hipcc" - # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM - # in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt - # and select the line that matches the current nixpkgs version of rocBLAS. 
- "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" - ]; - }; - apps.llama-server = { - type = "app"; - program = "${self.packages.${system}.default}/bin/llama-server"; - }; - apps.llama-embedding = { - type = "app"; - program = "${self.packages.${system}.default}/bin/embedding"; - }; - apps.llama = { - type = "app"; - program = "${self.packages.${system}.default}/bin/llama"; - }; - apps.quantize = { - type = "app"; - program = "${self.packages.${system}.default}/bin/quantize"; - }; - apps.train-text-from-scratch = { - type = "app"; - program = "${self.packages.${system}.default}/bin/train-text-from-scratch"; - }; - apps.default = self.apps.${system}.llama; - devShells.default = pkgs.mkShell { - buildInputs = [ llama-python ]; - packages = nativeBuildInputs ++ osSpecific; - }; - devShells.extra = pkgs.mkShell { - buildInputs = [ llama-python-extra ]; - packages = nativeBuildInputs ++ osSpecific; - }; - }); + } + ); + + # These expose a build environment for either a "default" or an "extra" set of dependencies. + devShells = eachSystem ( + system: + import ./.devops/nix/devshells.nix { + concatMapAttrs = nixpkgs.lib.concatMapAttrs; + packages = self.packages.${system}; + } + ); + }; }