Commit a16bebc

Add Text Embeddings Inference v1.8.3 (tentative)
1 parent c89565d commit a16bebc

2 files changed: +157 −0 lines changed

2 files changed

+157
-0
lines changed
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
FROM alpine AS text-embeddings-inference

# Fetch and unpack the Text Embeddings Inference v1.8.3 sources.
RUN mkdir -p /text-embeddings-inference
ADD https://github.com/huggingface/text-embeddings-inference/archive/refs/tags/v1.8.3.tar.gz /text-embeddings-inference/sources.tar.gz
RUN tar -C /text-embeddings-inference -xf /text-embeddings-inference/sources.tar.gz --strip-components=1

FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder

ENV SCCACHE=0.10.0
ENV RUSTC_WRAPPER=/usr/local/bin/sccache
ENV PATH="/root/.cargo/bin:${PATH}"
# NOTE: `CARGO_CHEF` version is aligned with the `cargo-chef` version installed in
# `lukemathwalker/cargo-chef:latest-rust-1.75-bookworm`
ENV CARGO_CHEF=0.1.71

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    curl \
    libssl-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Install sccache to cache compilation artifacts across builds.
RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
    chmod +x /usr/local/bin/sccache

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
RUN cargo install cargo-chef --version $CARGO_CHEF --locked

# Compute the cargo-chef dependency recipe from the TEI workspace manifests.
FROM base-builder AS planner

WORKDIR /usr/src

COPY --from=text-embeddings-inference /text-embeddings-inference/backends backends
COPY --from=text-embeddings-inference /text-embeddings-inference/core core
COPY --from=text-embeddings-inference /text-embeddings-inference/router router
COPY --from=text-embeddings-inference /text-embeddings-inference/Cargo.toml Cargo.toml
COPY --from=text-embeddings-inference /text-embeddings-inference/Cargo.lock Cargo.lock

RUN cargo chef prepare --recipe-path recipe.json

FROM base-builder AS builder

WORKDIR /usr/src

COPY --from=planner /usr/src/recipe.json recipe.json

RUN cargo chef cook --release --features google --recipe-path recipe.json && sccache -s

# Build `text-embeddings-router` for CUDA compute capability 7.5 (Turing).
FROM builder AS builder-75

RUN CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing,google --recipe-path recipe.json && sccache -s

COPY --from=text-embeddings-inference /text-embeddings-inference/backends backends
COPY --from=text-embeddings-inference /text-embeddings-inference/core core
COPY --from=text-embeddings-inference /text-embeddings-inference/router router
COPY --from=text-embeddings-inference /text-embeddings-inference/Cargo.toml Cargo.toml
COPY --from=text-embeddings-inference /text-embeddings-inference/Cargo.lock Cargo.lock

RUN CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router --features candle-cuda-turing,google && sccache -s

# Build `text-embeddings-router` for CUDA compute capability 8.0 (Ampere).
FROM builder AS builder-80

RUN CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda,google --recipe-path recipe.json && sccache -s

COPY --from=text-embeddings-inference /text-embeddings-inference/backends backends
COPY --from=text-embeddings-inference /text-embeddings-inference/core core
COPY --from=text-embeddings-inference /text-embeddings-inference/router router
COPY --from=text-embeddings-inference /text-embeddings-inference/Cargo.toml Cargo.toml
COPY --from=text-embeddings-inference /text-embeddings-inference/Cargo.lock Cargo.lock

RUN CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router --features candle-cuda,google && sccache -s

# Build `text-embeddings-router` for CUDA compute capability 9.0 (Hopper).
FROM builder AS builder-90

RUN CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda,google --recipe-path recipe.json && sccache -s

COPY --from=text-embeddings-inference /text-embeddings-inference/backends backends
COPY --from=text-embeddings-inference /text-embeddings-inference/core core
COPY --from=text-embeddings-inference /text-embeddings-inference/router router
COPY --from=text-embeddings-inference /text-embeddings-inference/Cargo.toml Cargo.toml
COPY --from=text-embeddings-inference /text-embeddings-inference/Cargo.lock Cargo.lock

RUN CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router --features candle-cuda,google && sccache -s

# Runtime image: ships one router binary per supported compute capability and
# lets the entrypoint pick the matching one at container start.
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS base

ENV HUGGINGFACE_HUB_CACHE=/tmp \
    PORT=8080 \
    USE_FLASH_ATTENTION=True

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    apt-transport-https \
    ca-certificates \
    curl \
    gnupg \
    libssl-dev \
    && rm -rf /var/lib/apt/lists/*

COPY --from=builder-75 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-75
COPY --from=builder-80 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-80
COPY --from=builder-90 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-90

# Install the Google Cloud CLI so the entrypoint can download model artifacts
# from Cloud Storage when AIP_STORAGE_URI is set.
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
    | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
    | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && \
    apt-get update -y && \
    apt-get install google-cloud-cli -y && \
    apt-get clean autoremove --yes && \
    rm -rf /var/lib/{apt,dpkg,cache,log}

COPY --chmod=775 containers/tei/gpu/1.8.3/entrypoint.sh entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]
CMD ["--json-output"]
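
For local testing, the image can be built from the repository root. The command below is a minimal sketch, not taken from the commit: the Dockerfile path and the image tag are assumptions inferred from the containers/tei/gpu/1.8.3/entrypoint.sh path in the COPY instruction above.

    # Hypothetical local build; adjust the -f path and the tag to the actual repository layout.
    docker build \
        -f containers/tei/gpu/1.8.3/Dockerfile \
        -t tei-gpu-dlc:1.8.3 \
        .

Because the three builder-* stages do not depend on each other, BuildKit can compile them in parallel; the trade-off is a runtime image that ships three copies of text-embeddings-router, one per compute capability.
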
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
#!/bin/bash

ldconfig 2>/dev/null || echo "WARN: Unable to refresh ld cache, not a big deal in most cases"

# NOTE: We first check whether the `nvidia-smi` command is available, as otherwise
# it makes no sense to download the artifacts from Google Cloud Storage. In that
# case, the user should either use an NVIDIA GPU instance on Google Cloud, or
# use the Text Embeddings Inference (TEI) CPU DLC instead.
if ! command -v nvidia-smi &>/dev/null; then
    echo "ERROR: nvidia-smi command not found."
    exit 1
fi

if [[ $AIP_STORAGE_URI == gs://* ]]; then
    echo "INFO: Provided AIP_STORAGE_URI=$AIP_STORAGE_URI, which is a Google Cloud Storage path given that it starts with gs://"

    TARGET_DIR="/tmp/model"
    mkdir -p "$TARGET_DIR"

    echo "INFO: gcloud storage cp $AIP_STORAGE_URI/* $TARGET_DIR --recursive"
    gcloud storage cp "$AIP_STORAGE_URI/*" "$TARGET_DIR" --recursive

    if [ $? -eq 0 ]; then
        echo "INFO: Model downloaded successfully to ${TARGET_DIR}."
        # NOTE: Update MODEL_ID to point to the local directory once downloaded
        echo "INFO: Updating MODEL_ID to point to the local directory."
        export MODEL_ID="$TARGET_DIR"
    else
        echo "ERROR: Failed to download model from GCS."
        exit 1
    fi
fi

# Pick the router binary that matches the GPU's CUDA compute capability.
compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv | sed -n '2p' | sed 's/\.//g')
if [ ${compute_cap} -eq 75 ]; then
    exec text-embeddings-router-75 "$@"
elif [ ${compute_cap} -ge 80 ] && [ ${compute_cap} -lt 90 ]; then
    exec text-embeddings-router-80 "$@"
elif [ ${compute_cap} -eq 90 ]; then
    exec text-embeddings-router-90 "$@"
else
    echo "ERROR: CUDA compute capability ${compute_cap} is not supported"
    exit 1
fi
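
To trace the dispatch above: nvidia-smi --query-gpu=compute_cap --format=csv prints a CSV header followed by one row per GPU (for example compute_cap, then 8.0 on an A100); sed -n '2p' keeps the first GPU's row and sed 's/\.//g' strips the dot, so compute_cap becomes 80 and text-embeddings-router-80 is exec'd with the container's arguments. Below is a minimal sketch of running the container outside of Vertex AI; the image tag and bucket are placeholders, and AIP_STORAGE_URI can be left out if MODEL_ID is set directly (the script itself relies on the router reading MODEL_ID).

    # Hypothetical invocation; the tag and bucket are placeholders, and --gpus all
    # requires the NVIDIA Container Toolkit on the host. PORT defaults to 8080 in the image.
    docker run --rm --gpus all -p 8080:8080 \
        -e AIP_STORAGE_URI=gs://my-bucket/my-embedding-model \
        tei-gpu-dlc:1.8.3 --json-output
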
