diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix new file mode 100644 index 0000000000000..d9b6a1e000628 --- /dev/null +++ b/.devops/nix/apps.nix @@ -0,0 +1,14 @@ +{ package, binaries }: + +let + default = builtins.elemAt binaries 0; + mkApp = name: { + ${name} = { + type = "app"; + program = "${package}/bin/${name}"; + }; + }; + result = builtins.foldl' (acc: name: (mkApp name) // acc) { } binaries; +in + +result // { default = result.${default}; } diff --git a/.devops/nix/devshells.nix b/.devops/nix/devshells.nix new file mode 100644 index 0000000000000..afaaa2644059b --- /dev/null +++ b/.devops/nix/devshells.nix @@ -0,0 +1,8 @@ +{ concatMapAttrs, packages }: + +concatMapAttrs + (name: package: { + ${name} = package.passthru.shell; + ${name + "-extra"} = package.passthru.shell-extra; + }) + packages diff --git a/.devops/nix/overlay.nix b/.devops/nix/overlay.nix new file mode 100644 index 0000000000000..c7baec8434fa4 --- /dev/null +++ b/.devops/nix/overlay.nix @@ -0,0 +1,5 @@ +final: prev: + +{ + llama-cpp = final.callPackage ./package.nix { }; +} diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix new file mode 100644 index 0000000000000..c6d03b4a480e7 --- /dev/null +++ b/.devops/nix/package.nix @@ -0,0 +1,224 @@ +{ + lib, + config, + stdenv, + mkShell, + cmake, + ninja, + pkg-config, + git, + python3, + mpi, + openblas, # TODO: Use the generic `blas` so users could switch betwen alternative implementations + cudaPackages, + darwin, + rocmPackages, + clblast, + useBlas ? builtins.all (x: !x) [ + useCuda + useMetalKit + useOpenCL + useRocm + ], + useCuda ? config.cudaSupport, + useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL, + useOpenCL ? false, + useRocm ? config.rocmSupport, +}@inputs: + +let + inherit (lib) + cmakeBool + cmakeFeature + optionals + versionOlder + ; + + # It's necessary to consistently use backendStdenv when building with CUDA support, + # otherwise we get libstdc++ errors downstream. + stdenv = throw "Use effectiveStdenv instead"; + effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv; + + # Give a little description difference between the flavors. + descriptionSuffix = + if useOpenCL then + " (OpenCL accelerated)" + else if useCuda then + " (CUDA accelerated)" + else if useRocm then + " (ROCm accelerated)" + else if useMetalKit then + " (MetalKit accelerated)" + else + ""; + + # TODO: package the Python in this repository in a Nix-like way. + # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo + # is PEP 517-compatible, and ensure the correct .dist-info is generated. 
+ # https://peps.python.org/pep-0517/ + llama-python = python3.withPackages ( + ps: [ + ps.numpy + ps.sentencepiece + ] + ); + + # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime + llama-python-extra = python3.withPackages ( + ps: [ + ps.numpy + ps.sentencepiece + ps.torchWithoutCuda + ps.transformers + ] + ); + + # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64 + # separately + darwinBuildInputs = + with darwin.apple_sdk.frameworks; + [ + Accelerate + CoreVideo + CoreGraphics + ] + ++ optionals useMetalKit [ MetalKit ]; + + cudaBuildInputs = with cudaPackages; [ + cuda_cccl.dev # + cuda_cudart + libcublas + ]; + + rocmBuildInputs = with rocmPackages; [ + clr + hipblas + rocblas + ]; +in + +effectiveStdenv.mkDerivation ( + finalAttrs: { + name = "llama.cpp"; + src = ../../.; + + postPatch = '' + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" + + # TODO: Package up each Python script or service appropriately. + # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`, + # we could make those *.py into setuptools' entrypoints + substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python" + ''; + + nativeBuildInputs = [ + cmake + ninja + pkg-config + git + ] ++ optionals useCuda [ cudaPackages.cuda_nvcc ]; + + buildInputs = + [ mpi ] + ++ optionals useOpenCL [ clblast ] + ++ optionals useCuda cudaBuildInputs + ++ optionals useRocm rocmBuildInputs + ++ optionals effectiveStdenv.isDarwin darwinBuildInputs; + + cmakeFlags = + [ + (cmakeBool "LLAMA_NATIVE" true) + (cmakeBool "LLAMA_BUILD_SERVER" true) + (cmakeBool "BUILD_SHARED_LIBS" true) + (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) + (cmakeBool "LLAMA_METAL" useMetalKit) + (cmakeBool "LLAMA_BLAS" useBlas) + ] + ++ optionals useOpenCL [ (cmakeBool "LLAMA_CLBLAST" true) ] + ++ optionals useCuda [ (cmakeBool "LLAMA_CUBLAS" true) ] + ++ optionals useRocm [ + (cmakeBool "LLAMA_HIPBLAS" true) + (cmakeFeature "CMAKE_C_COMPILER" "hipcc") + (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc") + + # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM + # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt + # and select the line that matches the current nixpkgs version of rocBLAS. + # Should likely use `rocmPackages.clr.gpuTargets`. + "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" + ] + ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ] + ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ]; + + # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, + # if they haven't been added yet. + postInstall = '' + mv $out/bin/main $out/bin/llama + mv $out/bin/server $out/bin/llama-server + mkdir -p $out/include + cp $src/llama.h $out/include/ + ''; + + # Define the shells here, but don't add in the inputsFrom to avoid recursion. 
+ passthru = { + inherit + useBlas + useCuda + useMetalKit + useOpenCL + useRocm + ; + + shell = mkShell { + name = "default${descriptionSuffix}"; + description = "contains numpy and sentencepiece"; + buildInputs = [ llama-python ]; + inputsFrom = [ finalAttrs.finalPackage ]; + }; + + shell-extra = mkShell { + name = "extra${descriptionSuffix}"; + description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers"; + buildInputs = [ llama-python-extra ]; + inputsFrom = [ finalAttrs.finalPackage ]; + }; + }; + + meta = { + # Configurations we don't want even the CI to evaluate. Results in the + # "unsupported platform" messages. This is mostly a no-op, because + # cudaPackages would've refused to evaluate anyway. + badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin; + + # Configurations that are known to result in build failures. Can be + # overridden by importing Nixpkgs with `allowBroken = true`. + broken = (useMetalKit && !effectiveStdenv.isDarwin); + + description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; + homepage = "https://github.com/ggerganov/llama.cpp/"; + license = lib.licenses.mit; + + # Accommodates `nix run` and `lib.getExe` + mainProgram = "llama"; + + # These people might respond, on the best effort basis, if you ping them + # in case of Nix-specific regressions or for reviewing Nix-specific PRs. + # Consider adding yourself to this list if you want to ensure this flake + # stays maintained and you're willing to invest your time. Do not add + # other people without their consent. Consider removing people after + # they've been unreachable for long periods of time. + + # Note that lib.maintainers is defined in Nixpkgs, but you may just add + # an attrset following the same format as in + # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix + maintainers = with lib.maintainers; [ + philiptaron + SomeoneSerge + ]; + + # Extend `badPlatforms` instead + platforms = lib.platforms.all; + }; + } +) diff --git a/flake.lock b/flake.lock index 0455f65617a2d..fdcd6d411b324 100644 --- a/flake.lock +++ b/flake.lock @@ -1,30 +1,12 @@ { "nodes": { - "flake-utils": { - "inputs": { - "systems": "systems" - }, - "locked": { - "lastModified": 1694529238, - "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", - "owner": "numtide", - "repo": "flake-utils", - "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", - "type": "github" - }, - "original": { - "owner": "numtide", - "repo": "flake-utils", - "type": "github" - } - }, "nixpkgs": { "locked": { - "lastModified": 1698318101, - "narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=", + "lastModified": 1703013332, + "narHash": "sha256-+tFNwMvlXLbJZXiMHqYq77z/RfmpfpiI3yjL6o/Zo9M=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "63678e9f3d3afecfeafa0acead6239cdb447574c", + "rev": "54aac082a4d9bb5bbc5c4e899603abfb76a3f6d6", "type": "github" }, "original": { @@ -36,24 +18,8 @@ }, "root": { "inputs": { - "flake-utils": "flake-utils", "nixpkgs": "nixpkgs" } - }, - "systems": { - "locked": { - "lastModified": 1681028828, - "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", - "owner": "nix-systems", - "repo": "default", - "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", - "type": "github" - }, - "original": { - "owner": "nix-systems", - "repo": "default", - "type": "github" - } } }, "root": "root", diff --git a/flake.nix b/flake.nix index 4cf28d5c11c0f..dcf8e1d9defa0 
100644 --- a/flake.nix +++ b/flake.nix @@ -1,139 +1,93 @@ { inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - flake-utils.url = "github:numtide/flake-utils"; }; - outputs = { self, nixpkgs, flake-utils }: - flake-utils.lib.eachDefaultSystem (system: - let - name = "llama.cpp"; - src = ./.; - meta.mainProgram = "llama"; - inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin; - buildInputs = with pkgs; [ openmpi ]; - osSpecific = with pkgs; buildInputs ++ ( - if isAarch64 && isDarwin then - with pkgs.darwin.apple_sdk_11_0.frameworks; [ - Accelerate - MetalKit - ] - else if isAarch32 && isDarwin then - with pkgs.darwin.apple_sdk.frameworks; [ - Accelerate - CoreGraphics - CoreVideo - ] - else if isDarwin then - with pkgs.darwin.apple_sdk.frameworks; [ - Accelerate - CoreGraphics - CoreVideo - ] - else - with pkgs; [ openblas ] - ); - pkgs = import nixpkgs { inherit system; }; - nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ]; - cudatoolkit_joined = with pkgs; symlinkJoin { - # HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit - # see https://github.com/NixOS/nixpkgs/issues/224291 - # copied from jaxlib - name = "${cudaPackages.cudatoolkit.name}-merged"; - paths = [ - cudaPackages.cudatoolkit.lib - cudaPackages.cudatoolkit.out - ] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [ - # for some reason some of the required libs are in the targets/x86_64-linux - # directory; not sure why but this works around it - "${cudaPackages.cudatoolkit}/targets/${system}" + + outputs = + { self, nixpkgs }: + + let + systems = [ + "aarch64-darwin" + "aarch64-linux" + "x86_64-darwin" # x86_64-darwin isn't tested (and likely isn't relevant) + "x86_64-linux" + ]; + eachSystem = f: nixpkgs.lib.genAttrs systems (system: f system); + in + + { + # These define the various ways to build the llama.cpp project. + # Integrate them into your flake.nix configuration by adding this overlay to nixpkgs.overlays. + overlays.default = import ./.devops/nix/overlay.nix; + + # These use the package definition from `./.devops/nix/package.nix`. + # There's one per backend that llama-cpp uses. Add more as needed! + packages = eachSystem ( + system: + let + defaultConfig = { + inherit system; + overlays = [ self.overlays.default ]; + }; + pkgs = import nixpkgs defaultConfig; + + # Let's not make a big deal about getting the CUDA bits. + cudaConfig = defaultConfig // { + config.cudaSupport = true; + config.allowUnfreePredicate = + p: + builtins.all + ( + license: + license.free + || builtins.elem license.shortName [ + "CUDA EULA" + "cuDNN EULA" + ] + ) + (p.meta.licenses or [ p.meta.license ]); + }; + pkgsCuda = import nixpkgs cudaConfig; + + # Let's make sure to turn on ROCm support across the whole package ecosystem. + rocmConfig = defaultConfig // { + config.rocmSupport = true; + }; + pkgsRocm = import nixpkgs rocmConfig; + in + { + default = pkgs.llama-cpp; + opencl = pkgs.llama-cpp.override { useOpenCL = true; }; + cuda = pkgsCuda.llama-cpp; + rocm = pkgsRocm.llama-cpp; + } + ); + + # These use the definition of llama-cpp from `./.devops/nix/package.nix` + # and expose various binaries as apps with `nix run .#app-name`. + # Note that none of these apps use anything other than the default backend. 
+ apps = eachSystem ( + system: + import ./.devops/nix/apps.nix { + package = self.packages.${system}.default; + binaries = [ + "llama" + "llama-embedding" + "llama-server" + "quantize" + "train-text-from-scratch" ]; - }; - llama-python = - pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); - # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime - llama-python-extra = - pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]); - postPatch = '' - substituteInPlace ./ggml-metal.m \ - --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python' - ''; - postInstall = '' - mv $out/bin/main $out/bin/llama - mv $out/bin/server $out/bin/llama-server - mkdir -p $out/include - cp ${src}/llama.h $out/include/ - ''; - cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]; - in - { - packages.default = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = osSpecific; - cmakeFlags = cmakeFlags - ++ (if isAarch64 && isDarwin then [ - "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" - "-DLLAMA_METAL=ON" - ] else [ - "-DLLAMA_BLAS=ON" - "-DLLAMA_BLAS_VENDOR=OpenBLAS" - ]); - }; - packages.opencl = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs; buildInputs ++ [ clblast ]; - cmakeFlags = cmakeFlags ++ [ - "-DLLAMA_CLBLAST=ON" - ]; - }; - packages.cuda = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ]; - cmakeFlags = cmakeFlags ++ [ - "-DLLAMA_CUBLAS=ON" - ]; - }; - packages.rocm = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ]; - cmakeFlags = cmakeFlags ++ [ - "-DLLAMA_HIPBLAS=1" - "-DCMAKE_C_COMPILER=hipcc" - "-DCMAKE_CXX_COMPILER=hipcc" - # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM - # in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt - # and select the line that matches the current nixpkgs version of rocBLAS. 
- "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" - ]; - }; - apps.llama-server = { - type = "app"; - program = "${self.packages.${system}.default}/bin/llama-server"; - }; - apps.llama-embedding = { - type = "app"; - program = "${self.packages.${system}.default}/bin/embedding"; - }; - apps.llama = { - type = "app"; - program = "${self.packages.${system}.default}/bin/llama"; - }; - apps.quantize = { - type = "app"; - program = "${self.packages.${system}.default}/bin/quantize"; - }; - apps.train-text-from-scratch = { - type = "app"; - program = "${self.packages.${system}.default}/bin/train-text-from-scratch"; - }; - apps.default = self.apps.${system}.llama; - devShells.default = pkgs.mkShell { - buildInputs = [ llama-python ]; - packages = nativeBuildInputs ++ osSpecific; - }; - devShells.extra = pkgs.mkShell { - buildInputs = [ llama-python-extra ]; - packages = nativeBuildInputs ++ osSpecific; - }; - }); + } + ); + + # These expose a build environment for either a "default" or an "extra" set of dependencies. + devShells = eachSystem ( + system: + import ./.devops/nix/devshells.nix { + concatMapAttrs = nixpkgs.lib.concatMapAttrs; + packages = self.packages.${system}; + } + ); + }; }