@@ -6,10 +6,6 @@ FROM "$BASE_IMAGE"
66COPY python/deplocks/llm/rayllm_*.lock ./
77
88ARG KVER="5.15.0-139-generic"
9- ARG ROOT_DIR="/usr/local"
10- ARG GDR_HOME="${ROOT_DIR}/gdrcopy"
11- ARG UCX_HOME="${ROOT_DIR}/ucx"
12- ARG NIXL_HOME="${ROOT_DIR}/nixl"
139
1410RUN <<EOF
1511# !/bin/bash
@@ -33,97 +29,9 @@ uv pip install --system --no-cache-dir --no-deps \
3329# Export installed packages
3430$HOME/anaconda3/bin/pip freeze > /home/ray/pip-freeze.txt
3531
36- # Begin NIXL installation
37-
38- mkdir -p "${ROOT_DIR}"
39-
40- CUDACXX=$(which nvcc)
41- CUDA_HOME=$(dirname $(dirname ${CUDACXX}))
42-
43- TEMP_DIR="nixl_installer"
44- mkdir -p "${TEMP_DIR}"
45-
46- sudo apt-get update
47- # kmod needed by nvidia-installer, pkg-config needed by GDRCopy, librdmacm-dev needed by UCX
48- sudo apt-get install -y kmod pkg-config librdmacm-dev cmake
49-
50- (
51- echo "Installing GDRCopy"
52- cd "${TEMP_DIR}"
53- [[ -d "/lib/modules/${KVER}" ]] || sudo apt-get install linux-headers-${KVER} -y
54- NV_DRIVER_VERSION="570.153.02"
55- wget "https://us.download.nvidia.com/XFree86/Linux-x86_64/${NV_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NV_DRIVER_VERSION}.run" -q
56- sh NVIDIA-Linux-x86_64-${NV_DRIVER_VERSION}.run -x
57- sudo NVIDIA-Linux-x86_64-${NV_DRIVER_VERSION}/nvidia-installer \
58- --silent \
59- --no-questions \
60- --no-install-compat32-libs \
61- --kernel-source-path="/lib/modules/${KVER}/build" \
62- --utility-prefix="/usr"
63-
64- (
65- wget "https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.5.tar.gz" -q
66- tar xzf v2.5.tar.gz; rm v2.5.tar.gz
67- cd gdrcopy-2.5
68- sudo make prefix=$GDR_HOME CUDA=$CUDA_HOME KVER=${KVER} all install
69- )
70-
71- # Uninstall the driver: it may conflict with the driver library version on
72- # the host, so remove it from the container.
73- sudo NVIDIA-Linux-x86_64-${NV_DRIVER_VERSION}/nvidia-installer \
74- --uninstall \
75- --silent \
76- --no-questions
77- )
78-
79- UCX_VERSION="1.19.0"
80- (
81- echo "Installing UCX ${UCX_VERSION}"
82- cd "${TEMP_DIR}"
83- wget "https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz" -q
84- tar xzf "ucx-${UCX_VERSION}.tar.gz" ; rm "ucx-${UCX_VERSION}.tar.gz"
85- cd "ucx-${UCX_VERSION}"
86-
87- # Additional options for Mellanox NICs, enabled by default
88- MLX_OPTS="--with-rdmacm \
89- --with-mlx5-dv \
90- --with-ib-hw-tm"
91-
92- ./configure --prefix=${UCX_HOME} \
93- --enable-shared \
94- --disable-static \
95- --disable-doxygen-doc \
96- --enable-optimizations \
97- --enable-cma \
98- --enable-devel-headers \
99- --with-cuda=${CUDA_HOME} \
100- --with-dm \
101- --with-gdrcopy=${GDR_HOME} \
102- --with-verbs \
103- --enable-mt \
104- ${MLX_OPTS}
105- make -j
106- sudo make -j install-strip
107-
108- sudo ldconfig
109- )
110-
111- # Keep in sync with llm-requirements.txt
112- NIXL_VERSION="0.6.0"
113- (
114- echo "Installing NIXL ${NIXL_VERSION}"
115- # NIXL needs meson, pybind11, and ninja to build; these should already be included in requirements_*.txt
116- cd "${TEMP_DIR}"
117- wget "https://github.com/ai-dynamo/nixl/archive/refs/tags/${NIXL_VERSION}.tar.gz" -q
118- tar xzf "${NIXL_VERSION}.tar.gz" ; rm "${NIXL_VERSION}.tar.gz"
119- cd "nixl-${NIXL_VERSION}"
120- meson setup build --prefix=${NIXL_HOME} -Ducx_path=${UCX_HOME}
121- cd build
122- ninja
123- sudo env "PATH=$PATH" ninja install
124- )
125- sudo rm -rf "${TEMP_DIR}"
32+ sudo apt-get update -y && sudo apt-get install -y kmod pkg-config librdmacm-dev cmake
12633
34+ # Install DeepEP kernels
12735EP_TEMP_DIR=$(pwd)/"ep_temp_dir"
12836mkdir -p "${EP_TEMP_DIR}"
12937
@@ -189,7 +97,3 @@ sudo rm -rf /var/lib/apt/lists/*
18997sudo apt-get clean
19098
19199EOF
192-
193- ENV PATH="${PATH}:${UCX_HOME}/bin:${NIXL_HOME}/bin"
194- ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${UCX_HOME}/lib:${NIXL_HOME}/lib/x86_64-linux-gnu"
195- ENV NIXL_PLUGIN_DIR="${NIXL_HOME}/lib/x86_64-linux-gnu/plugins/"
0 commit comments