
Commit 021193f

Support Longbench (#179)
* add longbench
  Signed-off-by: Xinyao Wang <[email protected]>
* refine readme
  Signed-off-by: Xinyao Wang <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  For more information, see https://pre-commit.ci

---------
Signed-off-by: Xinyao Wang <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent b466abd commit 021193f

File tree

2 files changed, +229 -0 lines changed


evals/evaluation/longbench/README.md

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
[LongBench](https://github.com/THUDM/LongBench) is a benchmark for bilingual, multitask, and comprehensive assessment of the long-context understanding capabilities of large language models. LongBench covers two languages (Chinese and English) to provide a more comprehensive evaluation of large models' multilingual capabilities on long contexts. It is composed of six major categories and twenty-one tasks, covering key long-text application scenarios such as single-document QA, multi-document QA, summarization, few-shot learning, synthetic tasks, and code completion.

In this guide, we evaluate the LongBench dataset with OPEA services on Intel hardware.

# 🚀 QuickStart

## Installation

```
pip install -r ../../../requirements.txt
```

## Launch an LLM Service

To set up an LLM service, you can use [tgi-gaudi](https://github.com/huggingface/tgi-gaudi) or [OPEA microservices](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation).

### Example 1: TGI

For example, the following command launches the [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) model on Gaudi:

```
model=meta-llama/Llama-2-7b-hf
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all \
  -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN=$hf_token \
  -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true \
  -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host \
  ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model --max-input-tokens 1024 \
  --max-total-tokens 2048
```
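Once the container is running, you can optionally check that the endpoint responds before starting the benchmark. Below is a minimal sketch, assuming the service is reachable at http://localhost:8080; it sends the same request body that pred.py uses for the `tgi` backend.

```
# Minimal sanity check for the TGI endpoint (assumes http://localhost:8080).
import requests

endpoint = "http://localhost:8080/generate"  # adjust host/port to your deployment
payload = {
    "inputs": "What is the capital of France?",
    "parameters": {"max_new_tokens": 32, "do_sample": False},
}
res = requests.post(endpoint, headers={"Content-Type": "application/json"}, json=payload)
res.raise_for_status()
print(res.json()["generated_text"])
```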
### Example 2: OPEA LLM

You can also set up a service with OPEA microservices.

For example, you can refer to [native LLM](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/native/langchain) for deployment on native Gaudi without any serving framework.
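Whichever service you use, the only difference on the client side is the request body that pred.py sends (see get_query() in pred.py below). As a rough reference, with placeholder prompt and token values:

```
# Request bodies used by pred.py for each backend (prompt and token values are placeholders).
tgi_query = {"inputs": "<prompt>", "parameters": {"max_new_tokens": 64, "do_sample": False}}
llm_query = {"query": "<prompt>", "max_tokens": 64}
```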
## Predict

Please set up the environment variables first.

```
export ENDPOINT="http://{host_ip}:8080/generate" # your LLM serving endpoint
export LLM_MODEL="meta-llama/Llama-2-7b-hf"
export BACKEND="tgi" # "tgi" or "llm"
export DATASET="narrativeqa" # see https://github.com/THUDM/LongBench/blob/main/task.md for the full list
export MAX_INPUT_LENGTH=2048 # set the max input length according to your LLM service
```
Then get the predictions on the dataset.

```
python pred.py \
    --endpoint ${ENDPOINT} \
    --model_name ${LLM_MODEL} \
    --backend ${BACKEND} \
    --dataset ${DATASET} \
    --max_input_length ${MAX_INPUT_LENGTH}
```

The predictions will be saved to "pred/{LLM_MODEL}/{DATASET}.jsonl".

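Each line of that file is a JSON record written by pred.py, holding the model output ("pred") together with the reference answers and metadata. Here is a small sketch for inspecting the predictions, using the example model and dataset from above:

```
# Inspect a prediction file produced by pred.py (path uses the example values above).
import json

path = "pred/meta-llama/Llama-2-7b-hf/narrativeqa.jsonl"
with open(path, encoding="utf-8") as f:
    records = [json.loads(line) for line in f]

print(f"{len(records)} predictions")
print(records[0]["pred"][:200])  # model output
print(records[0]["answers"])     # reference answers
```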
## Evaluate

Evaluate the predictions with the LongBench metrics.

```
git clone https://github.com/THUDM/LongBench
cd LongBench
pip install -r requirements.txt
python eval.py --model ${LLM_MODEL}
```

The evaluation results will then be saved to "pred/{LLM_MODEL}/result.jsonl".

evals/evaluation/longbench/pred.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import json
import os
import random
import time

import numpy as np
import requests
from datasets import load_dataset
from requests.exceptions import RequestException
from tqdm import tqdm
from transformers import AutoTokenizer


def parse_args(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--endpoint", type=str, required=True)
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--backend", type=str, default="tgi", choices=["tgi", "llm"])
    parser.add_argument(
        "--dataset", type=str, help="dataset name; if not given, will evaluate on all datasets", default=None
    )
    parser.add_argument("--e", action="store_true", help="Evaluate on LongBench-E")
    parser.add_argument("--max_input_length", type=int, default=2048, help="max input length")
    return parser.parse_args(args)


def get_query(backend, prompt, max_new_length):
    # Build the request header and body expected by the chosen serving backend.
    header = {"Content-Type": "application/json"}
    query = {
        "tgi": {"inputs": prompt, "parameters": {"max_new_tokens": max_new_length, "do_sample": False}},
        "llm": {"query": prompt, "max_tokens": max_new_length},
    }
    return header, query[backend]


def get_pred(
    data, dataset_name, backend, endpoint, model_name, max_input_length, max_new_length, prompt_format, out_path
):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    for json_obj in tqdm(data):
        prompt = prompt_format.format(**json_obj)

        # truncate to fit max_input_length (we suggest truncating in the middle, since the left and right sides may contain crucial instructions)
        tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
        if len(tokenized_prompt) > max_input_length:
            half = int(max_input_length / 2)
            prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode(
                tokenized_prompt[-half:], skip_special_tokens=True
            )

        header, query = get_query(backend, prompt, max_new_length)
        print("query: ", query)
        try:
            start_time = time.perf_counter()
            res = requests.post(endpoint, headers=header, json=query)
            res.raise_for_status()
            res = res.json()
            cost = time.perf_counter() - start_time
        except RequestException as e:
            raise Exception(f"An unexpected error occurred: {str(e)}")

        if backend == "tgi":
            result = res["generated_text"]
        else:
            result = res["text"]
        print("result: ", result)

        # Append one JSON record per sample: the model prediction plus reference answers and metadata.
        with open(out_path, "a", encoding="utf-8") as f:
            json.dump(
                {
                    "pred": result,
                    "answers": json_obj["answers"],
                    "all_classes": json_obj["all_classes"],
                    "length": json_obj["length"],
                },
                f,
                ensure_ascii=False,
            )
            f.write("\n")


if __name__ == "__main__":
    args = parse_args()
    endpoint = args.endpoint
    model_name = args.model_name
    backend = args.backend
    dataset = args.dataset
    max_input_length = args.max_input_length

    dataset_list = [
        "narrativeqa",
        "qasper",
        "multifieldqa_en",
        "multifieldqa_zh",
        "hotpotqa",
        "2wikimqa",
        "musique",
        "dureader",
        "gov_report",
        "qmsum",
        "multi_news",
        "vcsum",
        "trec",
        "triviaqa",
        "samsum",
        "lsht",
        "passage_count",
        "passage_retrieval_en",
        "passage_retrieval_zh",
        "lcc",
        "repobench-p",
    ]
    datasets_e_list = [
        "qasper",
        "multifieldqa_en",
        "hotpotqa",
        "2wikimqa",
        "gov_report",
        "multi_news",
        "trec",
        "triviaqa",
        "samsum",
        "passage_count",
        "passage_retrieval_en",
        "lcc",
        "repobench-p",
    ]
    if args.e:
        if dataset is not None:
            if dataset in datasets_e_list:
                datasets = [dataset]
            else:
                raise NotImplementedError(f"{dataset} is not supported in the LongBench-E dataset list: {datasets_e_list}")
        else:
            datasets = datasets_e_list
        if not os.path.exists(f"pred_e/{model_name}"):
            os.makedirs(f"pred_e/{model_name}")
    else:
        datasets = [dataset] if dataset is not None else dataset_list
        if not os.path.exists(f"pred/{model_name}"):
            os.makedirs(f"pred/{model_name}")

    for dataset in datasets:
        if args.e:
            out_path = f"pred_e/{model_name}/{dataset}.jsonl"
            data = load_dataset("THUDM/LongBench", f"{dataset}_e", split="test")
        else:
            out_path = f"pred/{model_name}/{dataset}.jsonl"
            data = load_dataset("THUDM/LongBench", dataset, split="test")

        # we design a specific prompt format and max generation length for each task; feel free to modify them to optimize model output
        dataset2prompt = json.load(open("config/dataset2prompt.json", "r"))
        dataset2maxlen = json.load(open("config/dataset2maxlen.json", "r"))
        prompt_format = dataset2prompt[dataset]
        max_new_length = dataset2maxlen[dataset]

        data_all = [data_sample for data_sample in data]
        get_pred(
            data_all, dataset, backend, endpoint, model_name, max_input_length, max_new_length, prompt_format, out_path
        )
