Skip to content

Commit 9ab66fc

Browse files
kouroshHakha authored and landscapepainter committed
[LLM] Simplify the NIXL dependency on ray-llm images (ray-project#57706)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
1 parent 588feb4 commit 9ab66fc

File tree

12 files changed: 40 additions and 15053 deletions.

ci/raydepsets/configs/rayllm.depsets.yaml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ build_arg_sets:
1818
- --extra-index-url https://download.pytorch.org/whl/${CUDA_CODE}
1919
build_arg_sets:
2020
- cpu
21-
- cu121
2221
- cu128
2322

2423
depsets:

docker/ray-llm/Dockerfile

Lines changed: 2 additions & 98 deletions
Original file line number | Diff line number | Diff line change
@@ -6,10 +6,6 @@ FROM "$BASE_IMAGE"
66
COPY python/deplocks/llm/rayllm_*.lock ./
77

88
ARG KVER="5.15.0-139-generic"
9-
ARG ROOT_DIR="/usr/local"
10-
ARG GDR_HOME="${ROOT_DIR}/gdrcopy"
11-
ARG UCX_HOME="${ROOT_DIR}/ucx"
12-
ARG NIXL_HOME="${ROOT_DIR}/nixl"
139

1410
RUN <<EOF
1511
#!/bin/bash
@@ -33,97 +29,9 @@ uv pip install --system --no-cache-dir --no-deps \
3329
# Export installed packages
3430
$HOME/anaconda3/bin/pip freeze > /home/ray/pip-freeze.txt
3531

36-
# Begin NIXL installation
37-
38-
mkdir -p "${ROOT_DIR}"
39-
40-
CUDACXX=$(which nvcc)
41-
CUDA_HOME=$(dirname $(dirname ${CUDACXX}))
42-
43-
TEMP_DIR="nixl_installer"
44-
mkdir -p "${TEMP_DIR}"
45-
46-
sudo apt-get update
47-
# kmod needed by nvidia-installer, pkg-config needed by GDRCopy, librdmacm-dev needed by UCX
48-
sudo apt-get install -y kmod pkg-config librdmacm-dev cmake
49-
50-
(
51-
echo "Installing GDRCopy"
52-
cd "${TEMP_DIR}"
53-
[[ -d "/lib/modules/${KVER}" ]] || sudo apt-get install linux-headers-${KVER} -y
54-
NV_DRIVER_VERSION="570.153.02"
55-
wget "https://us.download.nvidia.com/XFree86/Linux-x86_64/${NV_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NV_DRIVER_VERSION}.run" -q
56-
sh NVIDIA-Linux-x86_64-${NV_DRIVER_VERSION}.run -x
57-
sudo NVIDIA-Linux-x86_64-${NV_DRIVER_VERSION}/nvidia-installer \
58-
--silent \
59-
--no-questions \
60-
--no-install-compat32-libs \
61-
--kernel-source-path="/lib/modules/${KVER}/build" \
62-
--utility-prefix="/usr"
63-
64-
(
65-
wget "https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.5.tar.gz" -q
66-
tar xzf v2.5.tar.gz; rm v2.5.tar.gz
67-
cd gdrcopy-2.5
68-
sudo make prefix=$GDR_HOME CUDA=$CUDA_HOME KVER=${KVER} all install
69-
)
70-
71-
# Uninstall the driver, this driver might have conflict with the library
72-
# version on host. Remove it from container.
73-
sudo NVIDIA-Linux-x86_64-${NV_DRIVER_VERSION}/nvidia-installer \
74-
--uninstall \
75-
--silent \
76-
--no-questions
77-
)
78-
79-
UCX_VERSION="1.19.0"
80-
(
81-
echo "Installing UCX ${UCX_VERSION}"
82-
cd "${TEMP_DIR}"
83-
wget "https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz" -q
84-
tar xzf "ucx-${UCX_VERSION}.tar.gz"; rm "ucx-${UCX_VERSION}.tar.gz"
85-
cd "ucx-${UCX_VERSION}"
86-
87-
# Additional options for Mellanox NICs, install by default
88-
MLX_OPTS="--with-rdmacm \
89-
--with-mlx5-dv \
90-
--with-ib-hw-tm"
91-
92-
./configure --prefix=${UCX_HOME} \
93-
--enable-shared \
94-
--disable-static \
95-
--disable-doxygen-doc \
96-
--enable-optimizations \
97-
--enable-cma \
98-
--enable-devel-headers \
99-
--with-cuda=${CUDA_HOME} \
100-
--with-dm \
101-
--with-gdrcopy=${GDR_HOME} \
102-
--with-verbs \
103-
--enable-mt \
104-
${MLX_OPTS}
105-
make -j
106-
sudo make -j install-strip
107-
108-
sudo ldconfig
109-
)
110-
111-
# Keep in sync with llm-requirements.txt
112-
NIXL_VERSION="0.6.0"
113-
(
114-
echo "Installing NIXL ${NIXL_VERSION}"
115-
# NIXL needs meson pybind11 ninja, but should have been included in requirements_*.txt
116-
cd "${TEMP_DIR}"
117-
wget "https://github.com/ai-dynamo/nixl/archive/refs/tags/${NIXL_VERSION}.tar.gz" -q
118-
tar xzf "${NIXL_VERSION}.tar.gz"; rm "${NIXL_VERSION}.tar.gz"
119-
cd "nixl-${NIXL_VERSION}"
120-
meson setup build --prefix=${NIXL_HOME} -Ducx_path=${UCX_HOME}
121-
cd build
122-
ninja
123-
sudo env "PATH=$PATH" ninja install
124-
)
125-
sudo rm -rf "${TEMP_DIR}"
32+
sudo apt-get update -y && sudo apt-get install -y kmod pkg-config librdmacm-dev cmake
12633

34+
# Install DeepEP kernels
12735
EP_TEMP_DIR=$(pwd)/"ep_temp_dir"
12836
mkdir -p "${EP_TEMP_DIR}"
12937

@@ -189,7 +97,3 @@ sudo rm -rf /var/lib/apt/lists/*
18997
sudo apt-get clean
19098

19199
EOF
192-
193-
ENV PATH="${PATH}:${UCX_HOME}/bin:${NIXL_HOME}/bin"
194-
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${UCX_HOME}/lib:${NIXL_HOME}/lib/x86_64-linux-gnu"
195-
ENV NIXL_PLUGIN_DIR="${NIXL_HOME}/lib/x86_64-linux-gnu/plugins/"

python/deplocks/llm/ray_py311_cu121.lock

Lines changed: 0 additions & 2151 deletions
This file was deleted.

python/deplocks/llm/ray_test_py311_cu121.lock

Lines changed: 0 additions & 3539 deletions
This file was deleted.

python/deplocks/llm/rayllm_py311_cpu.lock

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1710,15 +1710,15 @@ ninja==1.11.1.3 \
17101710
# -r python/requirements/llm/llm-requirements.txt
17111711
# vllm
17121712
# xgrammar
1713-
nixl==0.6.0 \
1714-
--hash=sha256:1e703abfa56fb020831df8940539a4447205aa593632ace676cb2db93c8862ee \
1715-
--hash=sha256:4468619d1730aed076ad7de5aa206d6def4c61ae8a19067dbe31bc2846699e4a \
1716-
--hash=sha256:5368f36bc50ba9fb1aa1b6b7a9b083dfd19af0a45d1649142785ec9bee6b7919 \
1717-
--hash=sha256:6cff5aa014b80b3d17f364c21367dd055838ba6dcfaa4c50dc09b056276e8e14 \
1718-
--hash=sha256:ac733f706c80edf4fa9d3c2325cfb50671952b5aec5cf6a479c71e1ad175a2a0 \
1719-
--hash=sha256:b4a21d381cb78f92be695fb614b33fc8979a2d37f1ca6951121bd2fcccaf9daa \
1720-
--hash=sha256:b4dafa2b59d96c57ab7c036b6dcf41def4c28619415d6ff751190d94f321eef7 \
1721-
--hash=sha256:e5bc4b0a901a97e681b6f2b54a798cb8743c79bb5f4e4e4e68f06b913da2cfc9
1713+
nixl==0.6.1 \
1714+
--hash=sha256:24e9e98a72839d762bedb8faca010c5878aa0b2d5624a1590d6a588aab1d223e \
1715+
--hash=sha256:2a9f29718e5dde20ee9e6e5fb25411d1950ab84733e0d4fceb8bb6ccf555a1e5 \
1716+
--hash=sha256:77eab96bef382bfb91b9d6222e5581e49b193fcf573b38dcaa7a296822a2894e \
1717+
--hash=sha256:7abbaccc88f0330d38e5344efa4a0768fe523e9a0083b785ea60da858d73b265 \
1718+
--hash=sha256:831affb62a6ff6199e41ffdccaab3430cb61bf3ca71e597ca214d2db26620955 \
1719+
--hash=sha256:8507c73d9bc044dd921edbef81ebae3e0750584a70a63ea90e5ade79233535d2 \
1720+
--hash=sha256:d28c348371045962b109d5ebf1ab054017fd9c89a6d9167902c62dc793465e2d \
1721+
--hash=sha256:f562139f23609336e5254b96e07b20b3298cca81ddc7549fa2da6dd788a80564
17221722
# via
17231723
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
17241724
# -r python/requirements/llm/llm-requirements.txt

0 commit comments

Comments
 (0)