Commit 4526dfb

Merge pull request #20 from XinyaoWa/vllm_gaudi
Support model evaluation on Intel Gaudi
2 parents 96c692d + 403386c commit 4526dfb

11 files changed: +194 -6 lines changed

README.md

Lines changed: 17 additions & 0 deletions

@@ -54,6 +54,8 @@ source env/bin/activate
 pip install -r requirements.txt
 ```

+For evaluating on NVIDIA GPUs, please install `flash-attn` by referring to the [flash attention repo](https://github.com/Dao-AILab/flash-attention).
+
 Additionally, if you wish to use the API models, you will need to install the package corresponding to the API you wish to use
 ```bash
 pip install openai # OpenAI API (GPT)

@@ -105,6 +107,21 @@ sbatch scripts/run_short_slurm.sh # 8k-64k
 # for the API models, note that API results may vary due to the randomness in the API calls
 bash scripts/run_api.sh
 ```
+### Run on Intel Gaudi
+To run the evaluation with vLLM on Intel Gaudi, use the following commands:
+```bash
+## Build the vLLM docker image
+cd scripts/vllm-gaudi
+bash build_image.sh
+
+## Launch the vLLM container; change `LLM_MODEL_ID` and `NUM_CARDS` as needed
+bash launch_container.sh
+
+## Evaluate
+cd ../../
+bash scripts/run_eval_vllm_gaudi.sh
+```
+
 Check out the script file for more details!
 See [Others](#others) for the slurm scripts, easily collecting all the results, and using VLLM.
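As a usage sketch of the new README section (assuming `launch_container.sh` forwards these variables to `scripts/vllm-gaudi/compose.yaml`, which reads `LLM_MODEL_ID`, `NUM_CARDS`, and `LLM_ENDPOINT_PORT`), the served model and card count would be selected like this:

```bash
# Hypothetical example: pick the served model and number of Gaudi cards before launching.
# compose.yaml falls back to NUM_CARDS=1 and port 8008 if these are left unset.
export LLM_MODEL_ID=meta-llama/Llama-3.3-70B-Instruct
export NUM_CARDS=4
export LLM_ENDPOINT_PORT=8010
cd scripts/vllm-gaudi
bash launch_container.sh
```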

arguments.py

Lines changed: 2 additions & 0 deletions

@@ -19,6 +19,8 @@ def parse_arguments():
     parser.add_argument("--model_name_or_path", type=str, default=None)
     parser.add_argument("--use_vllm", action="store_true", help="whether to use vllm engine")
     parser.add_argument("--use_sglang", action="store_true", help="whether to use sglang engine")
+    parser.add_argument("--use_tgi_or_vllm_serving", action="store_true", help="whether to use tgi or vllm serving engine")
+    parser.add_argument("--endpoint_url", type=str, default="http://localhost:8080/v1/", help="endpoint url for tgi or vllm serving engine")

     # data settings
     parser.add_argument("--datasets", type=str, default=None, help="comma separated list of dataset names")
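For reference, a minimal sketch of a direct command-line run using the two new flags; the data arguments below are illustrative placeholders and assume the config keys (`input_max_length`, `test_files`, ...) map one-to-one to CLI arguments, as they do in the configs added by this PR:

```bash
# Hypothetical direct run against an already-running OpenAI-compatible server.
# In practice the repo's --config YAMLs set the dataset/file arguments.
python eval.py \
    --model_name_or_path meta-llama/Llama-3.3-70B-Instruct \
    --use_tgi_or_vllm_serving \
    --endpoint_url http://localhost:8010/v1/ \
    --datasets kilt_nq \
    --test_files data/kilt/nq-dev-multikilt_1000_k1000_dep6.jsonl \
    --input_max_length 131072 \
    --generation_max_length 20 \
    --no_cuda
```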

configs/rag_vllm.yaml

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+input_max_length: 131072,131072,131072,131072
+datasets: kilt_nq,kilt_triviaqa,kilt_hotpotqa,kilt_popqa_3
+generation_max_length: 20,20,20,20
+test_files: data/kilt/nq-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k1000_dep3.jsonl,data/kilt/popqa_test_1000_k1000_dep6.jsonl
+demo_files: data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl
+use_chat_template: false
+max_test_samples: 100
+shots: 2
+stop_new_line: true
+model_name_or_path: meta-llama/Llama-3.3-70B-Instruct
+output_dir: output/vllm-gaudi/Llama-3.3-70B-Instruct
+use_tgi_or_vllm_serving: true

configs/recall_vllm.yaml

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+input_max_length: 131072,131072,131072,131072
+datasets: ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mv,json_kv
+generation_max_length: 50,100,50,100
+test_files: data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/json_kv/test_k1800_dep6.jsonl
+demo_files: ',,,'
+use_chat_template: false
+max_test_samples: 100
+shots: 2
+stop_new_line: false
+model_name_or_path: meta-llama/Llama-3.3-70B-Instruct
+output_dir: output/vllm-gaudi/Llama-3.3-70B-Instruct
+use_tgi_or_vllm_serving: true
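Either config can also be run on its own against a running endpoint, mirroring the loop in `scripts/run_eval_vllm_gaudi.sh`:

```bash
# Run only the RAG config against a vLLM server listening on port 8010 of this host
host_ip=$(hostname -I | awk '{print $1}')
python eval.py --config configs/rag_vllm.yaml \
    --endpoint_url "http://${host_ip}:8010/v1" \
    --overwrite --no_cuda
```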

eval.py

Lines changed: 7 additions & 5 deletions

@@ -12,7 +12,7 @@
 from torch.utils.data import DataLoader

 from arguments import parse_arguments
-from model_utils import load_LLM, OpenAIModel, AnthropicModel
+from model_utils import load_LLM, OpenAIModel, AnthropicModel, TgiVllmModel

 from data import (
     load_data,

@@ -77,7 +77,7 @@ def run_test(args, model, dataset, test_file, demo_file):
     logger.info("Running generation...")
     start_time = time.time()
     # generate all outputs
-    if isinstance(model, OpenAIModel) or isinstance(model, AnthropicModel):
+    if (isinstance(model, OpenAIModel) or isinstance(model, AnthropicModel)) and (not isinstance(model, TgiVllmModel)):
         # using the batch API makes it cheaper and faster
         logger.info(f"Using the OpenAI/Anthropic batch API by default, if you want to use the iterative API, please change the code")
         all_outputs = model.generate_batch(all_inputs, batch_file=output_path+".batch")

@@ -138,8 +138,9 @@ def run_test(args, model, dataset, test_file, demo_file):
     if args.debug:
         import pdb; pdb.set_trace()

-    mem_usage = sum([torch.cuda.max_memory_allocated(i) for i in range(torch.cuda.device_count())])
-    logger.info(f"Memory usage: {mem_usage/1000**3:.02f} GB")
+    if not args.no_cuda:
+        mem_usage = sum([torch.cuda.max_memory_allocated(i) for i in range(torch.cuda.device_count())])
+        logger.info(f"Memory usage: {mem_usage/1000**3:.02f} GB")
     logger.info(f"Total time: {end_time - start_time:.02f} s")
     logger.info(f"Throughput: {len(results) / (end_time - start_time):.02f} samples/s")

@@ -162,9 +163,10 @@ def run_test(args, model, dataset, test_file, demo_file):
         "data": results,
         "metrics": metrics,
         "averaged_metrics": averaged_metrics,
-        "memory_usage": mem_usage,
         "throughput": len(results) / (end_time - start_time),
     }
+    if not args.no_cuda:
+        output["memory_usage"] = mem_usage

     if args.output_dir is not None:
         with open(output_path, "w") as f:

model_utils.py

Lines changed: 62 additions & 0 deletions

@@ -326,6 +326,64 @@ def generate_batch(self, inputs=None, prompt=None, **kwargs):

         return outputs

+class TgiVllmModel(OpenAIModel):
+    def __init__(
+        self,
+        model_name,
+        temperature=0.9,
+        top_p=0.9,
+        max_length=32768,
+        generation_max_length=2048,
+        generation_min_length=0,
+        do_sample=True,
+        stop_newline=False,
+        use_chat_template=True,
+        system_message=None,
+        seed=42,
+        **kwargs
+    ):
+        self.model_name = model_name
+        self.temperature = temperature
+        self.top_p = top_p
+        self.max_length = max_length
+        self.generation_max_length = generation_max_length
+        self.generation_min_length = generation_min_length
+        self.do_sample = do_sample
+        self.use_chat_template = use_chat_template
+        self.system_message = system_message
+        self.stops = None
+        if stop_newline:
+            self.stops = ["\n", "\n\n"]
+
+        from openai import OpenAI
+        from transformers import AutoTokenizer
+
+        endpoint_url = kwargs["endpoint_url"]
+        print(f"** Endpoint URL: {endpoint_url}")
+
+        self.model = OpenAI(
+            base_url=endpoint_url,
+            api_key="EMPTY_KEY"
+        )
+        self.model_name = model_name
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.seed = seed
+        self.API_MAX_LENGTH = float('inf')
+
+    def generate_batch(self, inputs=None, prompt=None, **kwargs):
+        if inputs is None:
+            inputs = [None for _ in prompt]
+        else:
+            prompt = [None for _ in inputs]
+
+        # we don't support kwargs here for now
+        if len(kwargs) > 0:
+            logger.warning("kwargs are not supported for batch generation")
+        # use thread_map instead of process_map since the bottleneck is the api call
+        outputs = thread_map(self.generate, inputs, prompt, max_workers=32)
+
+        return outputs
+

 class AnthropicModel(LLM):
     def __init__(

@@ -1203,6 +1261,10 @@ def load_LLM(args):
     elif args.use_vllm:
         model_cls = VLLMModel
         kwargs['seed'] = args.seed
+    elif args.use_tgi_or_vllm_serving:
+        model_cls = TgiVllmModel
+        kwargs['seed'] = args.seed
+        kwargs["endpoint_url"] = args.endpoint_url
     elif args.use_sglang:
         model_cls = SGLangModel
         kwargs['seed'] = args.seed
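Since `TgiVllmModel` is just an OpenAI-compatible client pointed at the serving endpoint, the endpoint itself can be sanity-checked outside the harness. A minimal sketch with curl, assuming the server is published on port 8010 as in `scripts/run_eval_vllm_gaudi.sh` and serves the model named in the configs:

```bash
# Quick smoke test of the OpenAI-compatible completions route exposed by vLLM
curl -s http://localhost:8010/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "meta-llama/Llama-3.3-70B-Instruct",
        "prompt": "The capital of France is",
        "max_tokens": 16,
        "temperature": 0
      }'
```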

requirements.txt

Lines changed: 0 additions & 1 deletion

@@ -6,6 +6,5 @@ datasets
 transformers
 accelerate
 sentencepiece
-flash-attn
 pytrec_eval
 rouge_score
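With `flash-attn` dropped from `requirements.txt` (it targets CUDA GPUs and is not applicable to Gaudi), NVIDIA GPU users install it separately, as the README change above notes. A hedged sketch of the install command documented in the flash-attention repo:

```bash
# Only needed when evaluating on NVIDIA GPUs; skip on Gaudi.
# --no-build-isolation follows the flash-attention installation instructions.
pip install flash-attn --no-build-isolation
```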

scripts/run_eval_vllm_gaudi.sh

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+export host_ip=$(hostname -I | awk '{print $1}')
+export LLM_ENDPOINT_PORT=8010
+export DATA_PATH="$HOME/.cache/huggingface"
+export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}/v1"
+export HF_HOME=$DATA_PATH
+
+for task in "recall" "rag"; do
+    python eval.py --config configs/${task}_vllm.yaml --endpoint_url $LLM_ENDPOINT --overwrite --no_cuda
+done
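Before the loop starts, it can help to confirm the server is up; a small sketch polling the same `/health` route used by the compose healthcheck:

```bash
# Wait until vllm-gaudi-server reports healthy before launching the evaluation
until curl -sf "http://${host_ip}:${LLM_ENDPOINT_PORT}/health" > /dev/null; do
    echo "waiting for vllm-gaudi-server..."
    sleep 10
done
```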

scripts/vllm-gaudi/build_image.sh

Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+export TAG="helmet"
+echo "Building the vllm-gaudi docker image"
+git clone https://github.com/HabanaAI/vllm-fork.git
+cd ./vllm-fork
+git checkout v0.6.6.post1+Gaudi-1.20.0  # habana_main
+
+docker build --no-cache -f Dockerfile.hpu -t ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+if [ $? -ne 0 ]; then
+    echo "vllm-gaudi build failed"
+    exit 1
+else
+    echo "vllm-gaudi build successful"
+fi
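After a successful build, the resulting tag can be verified before moving on to the container launch:

```bash
# The image should be listed as opea/vllm-gaudi:helmet (or under your $REGISTRY/$TAG overrides)
docker images "${REGISTRY:-opea}/vllm-gaudi"
```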

scripts/vllm-gaudi/compose.yaml

Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  vllm-gaudi-server:
+    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
+    container_name: vllm-gaudi-server
+    ports:
+      - ${LLM_ENDPOINT_PORT:-8008}:80
+    volumes:
+      - "${DATA_PATH:-./data}:/data"
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HF_TOKEN: ${HF_TOKEN}
+      HF_HOME: "/data"
+      HABANA_VISIBLE_DEVICES: all
+      OMPI_MCA_btl_vader_single_copy_mechanism: none
+      PT_HPU_ENABLE_LAZY_COLLECTIVES: true
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      VLLM_TORCH_PROFILER_DIR: "/mnt"
+      host_ip: ${host_ip}
+      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
+      VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-true}
+      VLLM_ALLOW_LONG_MAX_MODEL_LEN: 1
+      MAX_MODEL_LEN: ${MAX_MODEL_LEN:-131072}
+      MAX_SEQ_LEN_TO_CAPTURE: ${MAX_MODEL_LEN:-131072}
+      NUM_CARDS: ${NUM_CARDS:-1}
+    runtime: habana
+    cap_add:
+      - SYS_NICE
+    ipc: host
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
+      interval: 10s
+      timeout: 10s
+      retries: 150
+    command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture ${MAX_MODEL_LEN} --max-model-len ${MAX_MODEL_LEN}
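`launch_container.sh` is not shown in this diff, but the compose file above can also be brought up directly; a hedged sketch assuming the same environment variables used elsewhere in the PR:

```bash
# Hypothetical direct launch; launch_container.sh presumably wraps something similar.
export LLM_MODEL_ID=meta-llama/Llama-3.3-70B-Instruct
export NUM_CARDS=1
export LLM_ENDPOINT_PORT=8010
export DATA_PATH=$HOME/.cache/huggingface
export HF_TOKEN=<your_hf_token>   # replace with a real Hugging Face token
export host_ip=$(hostname -I | awk '{print $1}')
docker compose -f scripts/vllm-gaudi/compose.yaml up -d
```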
