
Commit d51a136

Add native LLM microservice using IPEX (opea-project#1337)
Signed-off-by: lvliang-intel <[email protected]>
1 parent c96b124 commit d51a136

7 files changed: +1078, -0 lines changed

.github/workflows/docker/compose/third_parties-compose.yaml

Lines changed: 5 additions & 0 deletions
```diff
@@ -98,3 +98,8 @@ services:
     build:
       dockerfile: comps/third_parties/llama-vision/src/Dockerfile.guard
     image: ${REGISTRY:-opea}/lvm-llama-vision-guard:${TAG:-latest}
+  ipex-llm:
+    build:
+      context: ipex-llm
+      dockerfile: comps/third_parties/ipex/src/Dockerfile
+    image: ${REGISTRY:-opea}/ipex-llm:${TAG:-latest}
```
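This entry registers the new image alongside the other third-party builds. As a rough sketch (assuming the file is invoked directly, the way the project's image-build workflow consumes it for other services; the actual CI invocation and build context may differ), the new image could be built on its own with:

```bash
# Sketch only: build just the new ipex-llm image from the third-party build compose file.
# Run from the repository root; REGISTRY and TAG default to "opea" and "latest".
docker compose -f .github/workflows/docker/compose/third_parties-compose.yaml build ipex-llm
```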
comps/third_parties/ipex/deployment/docker_compose/compose.yaml

Lines changed: 22 additions & 0 deletions

```yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  ipex:
    image: ${REGISTRY:-opea}/ipex-llm:${TAG:-latest}
    container_name: ipex-llm-server
    ports:
      - ${IPEX_LLM_PORT:-8688}:8688
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      MODEL_ID: ${MODEL_ID}
      HF_TOKEN: ${HF_TOKEN}
    restart: unless-stopped

networks:
  default:
    driver: bridge
```
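The compose file reads MODEL_ID and HF_TOKEN from the environment and publishes the server on IPEX_LLM_PORT (8688 by default). A minimal launch sketch, assuming the token is only needed for gated models:

```bash
export MODEL_ID="microsoft/phi-4"   # model to serve
export HF_TOKEN="<your_hf_token>"   # assumption: only required for gated Hugging Face models
export IPEX_LLM_PORT=8688           # optional; host port, defaults to 8688

cd comps/third_parties/ipex/deployment/docker_compose
docker compose -f compose.yaml up -d

# Follow the server logs; the container name is fixed in the compose file.
docker logs -f ipex-llm-server
```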
comps/third_parties/ipex/src/Dockerfile

Lines changed: 77 additions & 0 deletions

```dockerfile
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Most of this Dockerfile originates from https://github.com/intel/intel-extension-for-pytorch/blob/main/examples/cpu/llm/Dockerfile

ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE} AS base
RUN if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi && \
    if [ ! -z ${HTTP_PROXY} ]; then echo "Acquire::http::Proxy \"${HTTP_PROXY}\";" >> /etc/apt/apt.conf.d/proxy.conf; fi && \
    if [ ! -z ${HTTPS_PROXY} ]; then echo "Acquire::https::Proxy \"${HTTPS_PROXY}\";" >> /etc/apt/apt.conf.d/proxy.conf; fi
RUN apt update && \
    apt full-upgrade -y && \
    DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends -y \
    ca-certificates \
    git \
    curl \
    wget \
    vim \
    numactl \
    gcc-12 \
    g++-12 \
    make
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \
    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 && \
    update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \
    update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100

WORKDIR /root

RUN curl -fsSL -v -o miniforge.sh -O https://github.com/conda-forge/miniforge/releases/download/24.7.1-2/Miniforge3-24.7.1-2-Linux-x86_64.sh && \
    bash miniforge.sh -b -p ./miniforge3 && \
    rm miniforge.sh

# --build-arg COMPILE=ON to compile from source
FROM base AS dev
ARG COMPILE
RUN git clone https://github.com/intel/intel-extension-for-pytorch.git
RUN . ~/miniforge3/bin/activate && conda create -y -n compile_py310 python=3.10 && conda activate compile_py310 && \
    cd intel-extension-for-pytorch/examples/cpu/llm && \
    export CC=gcc && export CXX=g++ && \
    if [ -z ${COMPILE} ]; then bash tools/env_setup.sh 14; else bash tools/env_setup.sh 10; fi && \
    unset CC && unset CXX

FROM base AS deploy
RUN apt update && \
    DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends -y \
    google-perftools \
    openssh-server \
    net-tools && \
    apt clean && \
    rm -rf /var/lib/apt/lists/* && \
    if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi
COPY --from=dev /root/intel-extension-for-pytorch/examples/cpu/llm ./llm
COPY --from=dev /root/intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh ./llm/tools
RUN . ~/miniforge3/bin/activate && conda create -y -n py310 python=3.10 && conda activate py310 && \
    cd /usr/lib/x86_64-linux-gnu/ && ln -s libtcmalloc.so.4 libtcmalloc.so && cd && \
    cd ./llm && \
    bash tools/env_setup.sh 9 && \
    python -m pip cache purge && \
    mv ./oneCCL_release /opt/oneCCL && \
    chown -R root:root /opt/oneCCL && \
    sed -i "s|ONECCL_PATH=.*|ONECCL_PATH=/opt/oneCCL|" ./tools/env_activate.sh && \
    pip install backoff fastapi uvicorn
ARG PORT_SSH=22
RUN mkdir /var/run/sshd && \
    sed -i "s/#Port.*/Port ${PORT_SSH}/" /etc/ssh/sshd_config && \
    echo "service ssh start" >> /root/.bashrc && \
    ssh-keygen -b 4096 -f /root/.ssh/id_rsa -N "" && \
    mv /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \
    echo "Host *\n Port ${PORT_SSH}\n IdentityFile /root/.ssh/id_rsa\n StrictHostKeyChecking no" > /root/.ssh/config
EXPOSE ${PORT_SSH}
COPY ./comps/third_parties/ipex/src/ipex_inference.py /root
COPY ./comps/third_parties/ipex/src/openai_protocol.py /root
COPY ./comps/third_parties/ipex/src/entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/entrypoint.sh
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
```
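The dev stage clones intel-extension-for-pytorch and runs its env_setup.sh; the COMPILE build argument switches that step between the default path and compiling IPEX from source (see the `if [ -z ${COMPILE} ]` branch and the comment above it). A hedged sketch of both invocations, mirroring the README's build command below:

```bash
# Run from the repository root.

# Default: COMPILE unset, so env_setup.sh runs without compiling IPEX from source.
docker build -f comps/third_parties/ipex/src/Dockerfile \
  --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy \
  -t opea/ipex-llm:latest .

# Compile IPEX from source instead, and move sshd to port 2345 via PORT_SSH.
docker build -f comps/third_parties/ipex/src/Dockerfile \
  --build-arg COMPILE=ON --build-arg PORT_SSH=2345 \
  --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy \
  -t opea/ipex-llm:latest .
```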
Lines changed: 31 additions & 0 deletions

# IPEX Serving microservice

[Intel® Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) delivers advanced optimizations to accelerate Large Language Model (LLM) inference on Intel hardware. It improves performance through techniques such as paged attention and RoPE fusion, and supports a range of precision formats, including FP32, BF16, SmoothQuant INT8, and prototype weight-only quantization in INT8/INT4.

For more details, refer to the IPEX LLM [README](https://github.com/intel/intel-extension-for-pytorch/blob/main/examples/cpu/llm/README.md).

## 🚀1. Build the Docker Image

The Dockerfile used here is primarily sourced from the IPEX project, with additions that layer serving capabilities for LLM inference on top. It also enables passwordless SSH login, which is intended for distributed inference; distributed inference is not wired in yet but will be added soon.

```bash
cd ../../../../
docker build -f comps/third_parties/ipex/src/Dockerfile --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg COMPILE=ON --build-arg PORT_SSH=2345 -t opea/ipex-llm:latest .
```

## 🚀2. Start the Microservice

```bash
export MODEL_ID="microsoft/phi-4"

cd comps/third_parties/ipex/deployment/docker_compose
docker compose -f compose.yaml up -d
```

## 🚀3. Access the Service

Test the service with the following command:

```bash
http_proxy="" curl -X POST -H "Content-Type: application/json" -d '{"model": "microsoft/phi-4", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 128}' http://localhost:8688/v1/chat/completions
```
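The service exposes an OpenAI-style /v1/chat/completions endpoint (the Dockerfile copies an openai_protocol.py alongside the inference script). Assuming the response follows the standard OpenAI chat-completion shape, the generated text can be extracted with jq:

```bash
# Sketch: send the same request and print only the assistant's reply.
http_proxy="" curl -s -X POST -H "Content-Type: application/json" \
  -d '{"model": "microsoft/phi-4", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 128}' \
  http://localhost:8688/v1/chat/completions \
  | jq -r '.choices[0].message.content'
```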
comps/third_parties/ipex/src/entrypoint.sh

Lines changed: 25 additions & 0 deletions

```bash
#!/bin/bash

# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Collect all command-line arguments into an array
CMDS=()
while [ $# -gt 0 ]; do
  CMDS+=("$1")
  shift
done

# Activate the Miniforge environment
. ~/miniforge3/bin/activate
conda activate py310

# Set environment variables for oneCCL bindings for PyTorch
TMP=$(python -c "import torch; import os; print(os.path.abspath(os.path.dirname(torch.__file__)))")
. ${TMP}/../oneccl_bindings_for_pytorch/env/setvars.sh

# Print a performance note
echo "**Note:** For better performance, please consider to launch workloads with command 'ipexrun'."

# Run the inference script
python /root/ipex_inference.py "${CMDS[@]}"
```
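The while/shift loop copies every argument handed to the container into CMDS and forwards it verbatim to ipex_inference.py. A more compact, equivalent sketch (not part of the commit) would be:

```bash
# Equivalent argument forwarding: copy "$@" into the array in one step.
CMDS=("$@")
python /root/ipex_inference.py "${CMDS[@]}"
```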
