{ lib
, config
, stdenv
, cmake
, ninja
, pkg-config
, git
, python3
, openmpi
, openblas
, cudaPackages
, rocmPackages
, clblast
, Accelerate ? null
, MetalKit ? null
, CoreVideo ? null
, CoreGraphics ? null
, useOpenCL ? false
, useCuda ? config.cudaSupport
, useRocm ? config.rocmSupport
}@inputs:

let
  inherit (lib) cmakeBool cmakeFeature optional optionals;
  isDefault = !useOpenCL && !useCuda && !useRocm;

  # It's necessary to consistently use backendStdenv when building with CUDA
  # support; otherwise we get libstdc++ errors downstream.
  stdenv = throw "Use effectiveStdenv instead";
  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
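  # The `@inputs` binding above is what makes this shadowing trick work:
  # the name `stdenv` is rebound to the throw, so any accidental use of it
  # fails loudly at evaluation time, while `inputs.stdenv` still names the
  # original argument.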

  # Distinguish the flavors a little in the package description.
  descriptionSuffix = if useOpenCL then
    " (OpenCL accelerated)"
  else if useCuda then
    " (CUDA accelerated)"
  else if useRocm then
    " (ROCm accelerated)"
  else if (MetalKit != null) then
    " (MetalKit accelerated)"
  else "";

  # TODO: package the Python scripts in this repository in a Nix-like way.
  llama-python = python3.withPackages (ps: [ ps.numpy ps.sentencepiece ]);
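  # (Referenced below in postPatch, where the scripts' shebangs are rewritten
  # to point at this environment.)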

  # See ./overlay.nix for where these dependencies are passed in.
  defaultBuildInputs = builtins.filter (p: p != null) [
    Accelerate
    MetalKit
    CoreVideo
    CoreGraphics
  ];
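  # A minimal sketch of the shape such an overlay might take (the attribute
  # names and paths below are assumptions for illustration, not copied from
  # this repository; the frameworks do live under darwin.apple_sdk.frameworks
  # in nixpkgs):
  #
  #   final: prev: {
  #     llama-cpp = prev.callPackage ./package.nix {
  #       inherit (prev.darwin.apple_sdk.frameworks)
  #         Accelerate MetalKit CoreVideo CoreGraphics;
  #     };
  #   }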

  cudaBuildInputs = with cudaPackages; [
    cuda_cccl.dev # <nv/target>
    cuda_cudart
    libcublas
  ];

  rocmBuildInputs = with rocmPackages; [ clr hipblas rocblas ];
in

effectiveStdenv.mkDerivation {
  name = "llama.cpp";
  src = ../.;
  meta = {
    description = "Inference of the LLaMA model in pure C/C++${descriptionSuffix}";
    mainProgram = "llama";
  };

  postPatch = ''
    # Load the Metal shader from the Nix store rather than from a path
    # relative to the app bundle.
    substituteInPlace ./ggml-metal.m \
      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
    # Pin the scripts' shebangs to the Python environment defined above.
    substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
  '';

  nativeBuildInputs = [ cmake ninja pkg-config git ]
    ++ optionals useCuda [ cudaPackages.cuda_nvcc ];

  buildInputs = [ openmpi ]
    ++ optional useOpenCL clblast
    ++ optionals useCuda cudaBuildInputs
    ++ optionals useRocm rocmBuildInputs
    ++ optionals isDefault defaultBuildInputs
    # The non-Metal default flavor is configured with LLAMA_BLAS_VENDOR=OpenBLAS
    # below, so it needs the library available at build time.
    ++ optional (isDefault && MetalKit == null) openblas;

  cmakeFlags = [
    (cmakeBool "LLAMA_NATIVE" true)
    (cmakeBool "LLAMA_BUILD_SERVER" true)
    (cmakeBool "BUILD_SHARED_LIBS" true)
    (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
  ]
  ++ optional useOpenCL (cmakeBool "LLAMA_CLBLAST" true)
  ++ optional useCuda (cmakeBool "LLAMA_CUBLAS" true)
  ++ optionals useRocm [
    (cmakeBool "LLAMA_HIPBLAS" true)
    (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
    (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")

    # Build all targets supported by rocBLAS. When updating, search for
    # TARGET_LIST_ROCM in
    # https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
    # and pick the line matching the rocBLAS version currently in nixpkgs.
    # This should likely use `rocmPackages.clr.gpuTargets` instead; see the
    # sketch after this list.
    (cmakeFeature "AMDGPU_TARGETS" "gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102")
  ]
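  # A sketch of that suggested alternative (assumes this nixpkgs revision
  # exposes `rocmPackages.clr.gpuTargets` as a list of gfx target strings):
  #
  #   (cmakeFeature "AMDGPU_TARGETS" (lib.concatStringsSep ";" rocmPackages.clr.gpuTargets))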
  ++ optionals isDefault (if (MetalKit != null) then [
    (cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
    (cmakeBool "LLAMA_METAL" true)
  ] else [
    (cmakeBool "LLAMA_BLAS" true)
    (cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS")
  ]);
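  # For reference (assuming the usual definitions in nixpkgs lib):
  # `cmakeBool "LLAMA_NATIVE" true` renders as "-DLLAMA_NATIVE:BOOL=TRUE",
  # and `cmakeFeature "CMAKE_C_COMPILER" "hipcc"` renders as
  # "-DCMAKE_C_COMPILER:STRING=hipcc".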

  postInstall = ''
    # Rename the generically-named binaries so they don't collide on $PATH.
    mv $out/bin/main $out/bin/llama
    mv $out/bin/server $out/bin/llama-server
    mkdir -p $out/include
    cp $src/llama.h $out/include/
  '';
}
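# A minimal usage sketch; the exact callPackage wiring is an assumption about
# the surrounding flake/overlay, not something this file defines:
#
#   llama-cpp      = pkgs.callPackage ./package.nix { };
#   llama-cpp-cuda = pkgs.callPackage ./package.nix { useCuda = true; };
#   llama-cpp-rocm = pkgs.callPackage ./package.nix { useRocm = true; };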