
Nanotron model updates #652


Open · wants to merge 14 commits into main
47 changes: 0 additions & 47 deletions .github/workflows/slow_tests.yaml

This file was deleted.

79 changes: 33 additions & 46 deletions .github/workflows/tests.yaml
@@ -11,49 +11,36 @@ on:

jobs:
run_tests:
name: Run tests
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: true

- name: Cache Hugging Face models
uses: actions/cache@v4
with:
path: cache/models
key: hf-models-${{ runner.os }}-${{ github.ref }}
restore-keys: hf-models-${{ runner.os }}-

- name: Cache Hugging Face datasets
uses: actions/cache@v4
with:
path: cache/datasets
key: hf-datasets-${{ runner.os }}-${{ github.ref }}
restore-keys: hf-datasets-${{ runner.os }}-

- name: Cache uv virtual environment
uses: actions/cache@v4
with:
path: .venv
key: uv-env-${{ runner.os }}-${{ hashFiles('pyproject.toml') }}
restore-keys: uv-env-${{ runner.os }}-

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
enable-cache: true

- name: Install the project
run: uv sync --extra dev

- name: Ensure cache directories exist
run: mkdir -p cache/models cache/datasets

- name: Run tests
env:
HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }}
HF_HOME: "cache/models"
HF_DATASETS_CACHE: "cache/datasets"
run: uv run pytest -x --disable-pytest-warnings
name: Run tests
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
with:
lfs: 'true'
- name: Setup Python environment
uses: actions/setup-python@v4
with:
python-version: '3.10'
cache: 'pip'
- name: Install lighteval in editable mode
run: |
pip install -e .[dev,extended_tasks,multilingual,litellm]
- name: Get cached files
uses: actions/cache@v4
id: get-cache
with:
path: "cache"
key: test-cache-HF
- name: Test
env:
HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }}
HF_HOME: "cache/models"
HF_DATASETS_CACHE: "cache/datasets"
run: | # PYTHONPATH="${PYTHONPATH}:src" HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models"
python -m pytest -x --disable-pytest-warnings
- name: Write cache
uses: actions/cache@v4
with:
path: "cache"
key: test-cache-HF
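For reference, the pip-based test job above can be approximated locally with something like the sketch below; the token value is a placeholder and the cache paths simply mirror the workflow.

```bash
# Rough local equivalent of the CI test job above (a sketch, not part of the workflow).
# Assumes Python 3.10 and git-lfs are already installed; <your token> is a placeholder.
git lfs pull
pip install -e .[dev,extended_tasks,multilingual,litellm]

# Mirror the cache locations and environment variables used in the workflow.
mkdir -p cache/models cache/datasets
export HF_HOME="cache/models"
export HF_DATASETS_CACHE="cache/datasets"
export HF_TEST_TOKEN="<your token>"

python -m pytest -x --disable-pytest-warnings
```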
2 changes: 1 addition & 1 deletion README.md
@@ -87,7 +87,7 @@ Here’s a quick command to evaluate using the Accelerate backend:

```shell
lighteval accelerate \
"model_name=gpt2" \
"pretrained=gpt2" \
"leaderboard|truthfulqa:mc|0|0"
```

4 changes: 2 additions & 2 deletions docs/source/_toctree.yml
@@ -23,8 +23,8 @@
title: Use vllm as backend
- local: use-sglang-as-backend
title: Use SGLang as backend
- local: use-huggingface-inference-endpoints-or-tgi-as-backend
title: Use Hugging Face inference endpoints or TGI as backend
- local: evaluate-the-model-on-a-server-or-container
title: Evaluate on Server
- local: contributing-to-multilingual-evaluations
title: Contributing to multilingual evaluations
title: Guides
2 changes: 1 addition & 1 deletion docs/source/adding-a-custom-task.mdx
@@ -171,7 +171,7 @@ Once your file is created you can then run the evaluation with the following command:

```bash
lighteval accelerate \
"model_name=HuggingFaceH4/zephyr-7b-beta" \
"pretrained=HuggingFaceH4/zephyr-7b-beta" \
"community|{custom_task}|{fewshots}|{truncate_few_shot}" \
--custom-tasks {path_to_your_custom_task_file}
```
@@ -25,12 +25,15 @@ be deleted afterwards).
__configuration file example:__

```yaml
model_parameters:
reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation
# endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
model:
base_params:
# Pass either model_name, or endpoint_name and true reuse_existing
# endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
# reuse_existing: true # defaults to false; if true, ignore all params in instance, and don't delete the endpoint after evaluation
model_name: "meta-llama/Llama-2-7b-hf"
revision: "main" # defaults to "main"
# revision: "main" # defaults to "main"
dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit' or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
instance:
accelerator: "gpu"
region: "eu-west-1"
vendor: "aws"
@@ -41,7 +44,7 @@ model_parameters:
namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
env_vars:
null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
```
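For context, a config file like the one above is typically passed to the endpoint backend on the command line. The sketch below assumes the `inference-endpoint` subcommand (following the `lighteval endpoint <backend>` pattern used elsewhere on this page) and a hypothetical `endpoint_model.yaml` filename:

```bash
# Sketch only: launch an evaluation against an Inference Endpoint with the config above.
# `endpoint_model.yaml` is a placeholder filename; the subcommand name is an assumption.
lighteval endpoint inference-endpoint \
    endpoint_model.yaml \
    "leaderboard|truthfulqa:mc|0|0"
```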

### Text Generation Inference (TGI)
@@ -52,8 +55,25 @@ serverless inference.
__configuration file example:__

```yaml
model_parameters:
model:
instance:
inference_server_address: ""
inference_server_auth: null
model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory
```
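Similarly, a TGI config such as the one above would be passed to the TGI backend; the subcommand name and the `tgi_model.yaml` filename below are illustrative assumptions:

```bash
# Sketch only: evaluate against an already-running TGI server using the config above.
# `tgi_model.yaml` is a placeholder filename; the subcommand name is an assumption.
lighteval endpoint tgi \
    tgi_model.yaml \
    "leaderboard|truthfulqa:mc|0|0"
```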

### OpenAI API

Lighteval also supports evaluating models on the OpenAI API. To do so you need to set your OpenAI API key in the environment variable.

```bash
export OPENAI_API_KEY={your_key}
```

And then run the following command:

```bash
lighteval endpoint openai \
{model-name} \
<task parameters>
```
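For instance, a filled-in version of the command above (model name and task chosen purely for illustration) could look like this:

```bash
# Illustrative example only; substitute your own model name and tasks.
export OPENAI_API_KEY=<your_key>
lighteval endpoint openai \
    "gpt-3.5-turbo" \
    "lighteval|gsm8k|0|0"
```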
4 changes: 4 additions & 0 deletions docs/source/package_reference/models.mdx
@@ -31,6 +31,10 @@
### Open AI Models
[[autodoc]] models.endpoints.openai_model.OpenAIClient

## Nanotron Model
### NanotronLightevalModel
[[autodoc]] models.nanotron.nanotron_model.NanotronLightevalModel
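Since this PR updates the Nanotron backend, a usage sketch may be helpful here; the `lighteval nanotron` subcommand and its flags are assumptions based on the existing Nanotron docs, and both paths are placeholders:

```bash
# Sketch only: evaluating a Nanotron checkpoint.
# The flags and both paths are assumptions/placeholders, not part of this diff.
lighteval nanotron \
    --checkpoint-config-path path/to/nanotron/checkpoint/config.yaml \
    --lighteval-config-path examples/nanotron/lighteval_config_override_template.yaml
```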

## VLLM Model
### VLLMModel
[[autodoc]] models.vllm.vllm_model.VLLMModelConfig
10 changes: 5 additions & 5 deletions docs/source/quicktour.mdx
@@ -27,7 +27,7 @@ To evaluate `GPT-2` on the Truthful QA benchmark with [🤗

```bash
lighteval accelerate \
"model_name=openai-community/gpt2" \
"pretrained=gpt2" \
"leaderboard|truthfulqa:mc|0|0"
```

@@ -59,7 +59,7 @@ When specifying a path to file, it should start with `./`.

```bash
lighteval accelerate \
"model_name=openai-community/gpt2" \
"pretrained=gpt2" \
./path/to/lighteval/examples/tasks/recommended_set.txt
# or, e.g., "leaderboard|truthfulqa:mc|0|0|,leaderboard|gsm8k|3|1"
```
@@ -79,7 +79,7 @@ You can then evaluate a model using data parallelism on 8 GPUs like follows:
```bash
accelerate launch --multi_gpu --num_processes=8 -m \
lighteval accelerate \
"model_name=openai-community/gpt2" \
"pretrained=gpt2" \
"leaderboard|truthfulqa:mc|0|0"
```

@@ -92,7 +92,7 @@ To evaluate a model using pipeline parallelism on 2 or more GPUs, run:

```bash
lighteval accelerate \
"model_name=openai-community/gpt2,model_parallel=True" \
"pretrained=gpt2,model_parallel=True" \
"leaderboard|truthfulqa:mc|0|0"
```

@@ -129,7 +129,7 @@ accelerate).
- **add_special_tokens** (bool, optional, defaults to True): Whether to add special tokens to the input sequences.
If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and
`False` for causal models.
- **model_parallel** (bool, optional, defaults to None):
- **model_parallel** (bool, optional, defaults to False):
True/False: force to use or not the `accelerate` library to load a large
model across multiple devices.
Default: None which corresponds to comparing the number of processes with
14 changes: 0 additions & 14 deletions docs/source/saving-and-reading-results.mdx
@@ -31,20 +31,6 @@ This will create a Tensorboard dashboard in a HF org set with the `--results-org`
option.


## Pushing results to WandB

You can push the results to WandB by setting `--wandb`. This will init a WandB
run and log the results.

Wandb args need to be set in your env variables.

```
export WANDB_PROJECT="lighteval"
```

You can find a list of variable in the [wandb documentation](https://docs.wandb.ai/guides/track/environment-variables/).


## How to load and investigate details

### Load from local detail files
6 changes: 3 additions & 3 deletions docs/source/use-inference-providers-as-backend.mdx
@@ -11,7 +11,7 @@ Lighteval allows to use Hugging Face's Inference Providers to evaluate llms on s

```bash
lighteval endpoint inference-providers \
"model_name=deepseek-ai/DeepSeek-R1,provider=hf-inference" \
"model=deepseek-ai/DeepSeek-R1,provider=hf-inference" \
"lighteval|gsm8k|0|0"
```

@@ -28,13 +28,13 @@
with the following config file:

```yaml
model_parameters:
model:
model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
provider: "novita"
timeout: null
proxies: null
parallel_calls_count: 10
generation_parameters:
generation:
temperature: 0.8
top_k: 10
max_new_tokens: 10000
27 changes: 12 additions & 15 deletions docs/source/use-litellm-as-backend.mdx
@@ -10,14 +10,10 @@ Documentation for available APIs and compatible endpoints can be found [here](ht

```bash
lighteval endpoint litellm \
"provider=openai,model_name=gpt-3.5-turbo" \
"lighteval|gsm8k|0|0" \
--use-chat-template
"gpt-3.5-turbo" \
"lighteval|gsm8k|0|0"
```

> [!WARNING]
> `--use-chat-template` is required for litellm to work properly.

## Using a config file

Litellm allows generation with any OpenAI compatible endpoint, for example you
@@ -26,16 +22,17 @@ can evaluate a model running on a local vllm server.
To do so you will need to use a config file like so:

```yaml
model_parameters:
model:
base_params:
model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
base_url: "URL OF THE ENDPOINT YOU WANT TO USE"
api_key: "" # remove or keep empty as needed
generation_parameters:
temperature: 0.5
max_new_tokens: 256
stop_tokens: [""]
top_p: 0.9
seed: 0
repetition_penalty: 1.0
frequency_penalty: 0.0
generation:
temperature: 0.5
max_new_tokens: 256
stop_tokens: [""]
top_p: 0.9
seed: 0
repetition_penalty: 1.0
frequency_penalty: 0.0
```
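As with the other backends, such a config file would then be passed on the command line in place of the model string; the filename below is a placeholder, and the assumption that the litellm backend accepts a YAML path is exactly that, an assumption:

```bash
# Sketch only: run the litellm backend with the config file above.
# `litellm_config.yaml` is a placeholder filename.
lighteval endpoint litellm \
    litellm_config.yaml \
    "lighteval|gsm8k|0|0"
```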