
Adds multimodal support and MMMU pro #675


Merged: 43 commits, merged May 19, 2025.

The diffs below show changes from 3 of the 43 commits.

Commits
409b0c0
init
NathanHB Apr 15, 2025
ee334c5
init
NathanHB Apr 15, 2025
e988f6f
init
NathanHB Apr 15, 2025
5fddc82
Naive implementation
qubvel Apr 21, 2025
7ce9c97
Fix choices + change metric
qubvel Apr 22, 2025
e08731a
refactor prompt function
qubvel Apr 22, 2025
8d4543b
style
qubvel Apr 22, 2025
05df4b6
FIx typing
qubvel May 6, 2025
16a9e97
Merge branch 'main' into nathan-adds-multimodal
qubvel May 6, 2025
de60add
Update max length
qubvel May 6, 2025
5fd52f5
Remove docs
qubvel May 6, 2025
10b4e0b
Update auto processor
qubvel May 6, 2025
bc7610d
add quantization config, transformers config
qubvel May 6, 2025
49e4986
Update generation size
qubvel May 7, 2025
75c900c
Add batching
qubvel May 7, 2025
4e5fdd3
Style
qubvel May 7, 2025
d1ae8b7
Add images to requests
qubvel May 7, 2025
f855158
nit
qubvel May 7, 2025
641819e
nit
qubvel May 7, 2025
aa0acb7
Clean up a bit
qubvel May 7, 2025
56f962b
nit
qubvel May 7, 2025
8e99388
Fix batch size
qubvel May 7, 2025
418840d
Add images for Doc class
qubvel May 7, 2025
e35db98
clean-up prompt manager
qubvel May 7, 2025
57c18f7
Style
qubvel May 7, 2025
7cd35c2
Style
qubvel May 7, 2025
e13cac9
Clean up prompt manager
qubvel May 7, 2025
fa18ec2
Add dtype
qubvel May 7, 2025
c59e5af
Update prompt function
qubvel May 7, 2025
8f31f1b
Refactor to pass ruff check
qubvel May 7, 2025
3675066
fix the CI
NathanHB May 12, 2025
30e22ab
fix the CI
NathanHB May 12, 2025
924bf13
Fit typing
qubvel May 12, 2025
b909259
Fix system content
qubvel May 12, 2025
665474a
Split to vision and standard tasks
qubvel May 13, 2025
1a73dd0
Data parallel
qubvel May 13, 2025
b618af7
Clean up config docs, tokenizer -> processor
qubvel May 13, 2025
79e222d
Add fast image processor option
qubvel May 13, 2025
bd2c595
Fix style
qubvel May 13, 2025
831f95e
commit
NathanHB May 19, 2025
80568e7
commit
NathanHB May 19, 2025
9fb75a6
commit
NathanHB May 19, 2025
62165a8
commit
NathanHB May 19, 2025
3 changes: 2 additions & 1 deletion src/lighteval/models/model_loader.py
@@ -40,6 +40,7 @@
from lighteval.models.transformers.adapter_model import AdapterModel, AdapterModelConfig
from lighteval.models.transformers.delta_model import DeltaModel, DeltaModelConfig
from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig
from lighteval.models.transformers.vlm_transformers import VLMTransformersModel
from lighteval.models.vllm.vllm_model import VLLMModel, VLLMModelConfig
from lighteval.utils.imports import (
NO_LITELLM_ERROR_MSG,
@@ -163,7 +164,7 @@ def load_model_with_accelerate_or_default(
model = VLLMModel(config=config)
return model
else:
model = TransformersModel(config=config)
model = VLMTransformersModel(config=config)

return model
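
Note that at this commit the default (non-vLLM) branch above always constructs VLMTransformersModel, so text-only models would take the VLM path too; a later commit in this PR ("Split to vision and standard tasks") separates the two. A sketch of config-based dispatch, using only names that appear in this PR (not the code as merged):

def load_model_with_accelerate_or_default(config):
    # Sketch: route each config type to its backend instead of always
    # building the VLM model.
    if isinstance(config, VLLMModelConfig):
        return VLLMModel(config=config)
    if isinstance(config, VLMTransformersModelConfig):
        return VLMTransformersModel(config=config)
    return TransformersModel(config=config)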

285 changes: 285 additions & 0 deletions src/lighteval/models/transformers/vlm_transformers.py
Author comment on this file: work will mainly happen here; the first step is to get the greedy_until function working.
@@ -0,0 +1,285 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
from typing import Union

import torch
from pydantic import PositiveInt
from transformers import (
AutoModelForVision2Seq,
AutoProcessor,
ProcessorMixin,
)

from lighteval.models.abstract_model import LightevalModel, ModelInfo
from lighteval.models.model_output import (
GenerativeResponse,
LoglikelihoodResponse,
LoglikelihoodSingleTokenResponse,
)
from lighteval.models.utils import ModelConfig, _get_model_sha, _simplify_name
from lighteval.tasks.requests import (
GreedyUntilRequest,
LoglikelihoodRequest,
LoglikelihoodSingleTokenRequest,
)
from lighteval.utils.imports import (
is_accelerate_available,
)


logger = logging.getLogger(__name__)


if is_accelerate_available():
from datetime import timedelta

from accelerate import Accelerator, InitProcessGroupKwargs


class VLMTransformersModelConfig(ModelConfig):
"""
    Configuration class for vision-language (VLM) transformers models.

Attributes:
model_name (str):
HuggingFace Hub model ID name or the path to a pre-trained
model to load. This is effectively the `pretrained_model_name_or_path`
argument of `from_pretrained` in the HuggingFace `transformers` API.
        accelerator (Accelerator): accelerator to use for model evaluation.
tokenizer (Optional[str]): HuggingFace Hub tokenizer ID that will be
used for tokenization.
multichoice_continuations_start_space (Optional[bool]): Whether to add a
space at the start of each continuation in multichoice generation.
For example, context: "What is the capital of France?" and choices: "Paris", "London".
Will be tokenized as: "What is the capital of France? Paris" and "What is the capital of France? London".
True adds a space, False strips a space, None does nothing
        pairwise_tokenization (bool): Whether to tokenize the context and continuation separately or together.
subfolder (Optional[str]): The subfolder within the model repository.
revision (str): The revision of the model.
        batch_size (int): The batch size for model evaluation.
max_gen_toks (Optional[int]): The maximum number of tokens to generate.
max_length (Optional[int]): The maximum length of the generated output.
add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences.
If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and
`False` for causal models.
model_parallel (bool, optional, defaults to None):
True/False: force to use or not the `accelerate` library to load a large
model across multiple devices.
Default: None which corresponds to comparing the number of processes with
the number of GPUs. If it's smaller => model-parallelism, else not.
        dtype (Union[str, torch.dtype], optional, defaults to None):
Converts the model weights to `dtype`, if specified. Strings get
converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`).
Use `dtype="auto"` to derive the type from the model's weights.
        device (Union[int, str]): device to use for model evaluation.
quantization_config (Optional[BitsAndBytesConfig]): quantization
configuration for the model, manually provided to load a normally floating point
model at a quantized precision. Needed for 4-bit and 8-bit precision.
trust_remote_code (bool): Whether to trust remote code during model
loading.
generation_parameters (GenerationParameters): Range of parameters which will affect the generation.
generation_config (GenerationConfig): GenerationConfig object (only passed during manual creation)

Methods:
get_model_sha(): Retrieves the SHA of the model.

"""

model_name: str
tokenizer: str | None = None
subfolder: str | None = None
revision: str = "main"
batch_size: PositiveInt | None = None
generation_size: PositiveInt = 256
max_length: PositiveInt | None = None
add_special_tokens: bool = True
model_parallel: bool | None = None
dtype: str | None = None
device: Union[int, str] = "cuda"
trust_remote_code: bool = False
use_chat_template: bool = False
compile: bool = False
pairwise_tokenization: bool = False
device_map: str | None = None

def get_model_sha(self):
return _get_model_sha(repo_id=self.model_name, revision=self.revision)
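
# A minimal usage sketch of this config; the values below, including the
# model id, are illustrative assumptions, and only the field names come
# from the class above:
#
#     config = VLMTransformersModelConfig(
#         model_name="HuggingFaceM4/idefics2-8b",
#         dtype="bfloat16",
#         batch_size=8,
#         generation_size=256,
#         use_chat_template=True,
#     )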


class VLMTransformersModel(LightevalModel):
def __init__(
self,
config: VLMTransformersModelConfig,
):
"""Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation."""
self.config = config
self.accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
self._device = self.accelerator.device
self.use_chat_template = config.use_chat_template
self.multichoice_continuations_start_space = config.multichoice_continuations_start_space
self._add_special_tokens = config.add_special_tokens or False
self.pairwise_tokenization = config.pairwise_tokenization
self.batch_size = config.batch_size
self.transformers_config = config.get_transformers_config()

self.model_sha = config.get_model_sha()
self._max_length = self._init_max_length()
self._tokenizer = self._create_auto_tokenizer()
self.model = self._create_auto_model()

# We are in DP (and launch the script with `accelerate launch`)
if config.model_parallel is False and self.config.dtype not in ["4bit", "8bit"]:
logger.info(f"Using Data Parallelism, putting model on device {self._device}")
self.model = self.model.to(self._device)
if config.compile:
try:
logger.info("Compiling the model")
self.model.model.compile()
except AttributeError as e:
logger.warning("Could not compile the model because: ", e)

self.model_name = _simplify_name(config.model_name)

self.generation_config_dict = config.generation_parameters.to_transformers_dict()

self.model_info = ModelInfo(
model_name=self.config.model_name,
model_sha=self.model_sha,
model_dtype=config.dtype,
)

@property
def tokenizer(self):
return self._tokenizer

@property
def add_special_tokens(self):
return self._add_special_tokens

@property
def max_length(self) -> int:
return self._max_length

@property
def device(self) -> Union[int, str, torch.device]:
return self._device

@property
def disable_tqdm(self) -> bool:
disable_tqdm = False
if self.accelerator:
disable_tqdm = bool(not self.accelerator.is_main_process)
return disable_tqdm

def _create_auto_model(self) -> AutoModelForVision2Seq:
subfolder = self.config.subfolder
revision = self.config.revision + (f"/{subfolder}" if subfolder is not None else "")

model = AutoModelForVision2Seq.from_pretrained(
self.config.model_name,
revision=revision,
device_map=self.config.device_map,
torch_dtype=self.config.dtype,
trust_remote_code=self.config.trust_remote_code,
)
model.eval()
torch.set_grad_enabled(False)

if self.config.compile:
try:
logger.info("Compiling the model")
model.compile()
except AttributeError as e:
logger.warning("Could not compile the model because: ", e)

return model

def _create_auto_tokenizer(
self,
) -> ProcessorMixin:
"""
        Create a Hugging Face AutoProcessor for the vision-language model.

        Returns:
            transformers.ProcessorMixin: The created processor.
"""
tokenizer_name = self.config.tokenizer or self.config.model_name
subfolder = self.config.subfolder
revision = self.config.revision + (f"/{subfolder}" if subfolder is not None else "")

tokenizer = AutoProcessor.from_pretrained(
tokenizer_name,
revision=revision,
trust_remote_code=self.config.trust_remote_code,
padding_side="left",
truncation_side="left",
)

return tokenizer

def _init_max_length(self) -> int:
"""
Returns:
int: Max length to use depending on the available args and config
"""
raise NotImplementedError()

def greedy_until(
self,
requests: list[GreedyUntilRequest],
) -> list[GenerativeResponse]:
"""
Generates responses using a greedy decoding strategy until certain ending conditions are met.

Args:
            requests (list[GreedyUntilRequest]): list of requests containing the context and ending conditions.

Returns:
list[GenerativeResponse]: list of generated responses.
"""
raise NotImplementedError()

def loglikelihood(
self,
requests: list[LoglikelihoodRequest],
) -> list[LoglikelihoodResponse]:
raise NotImplementedError()

def loglikelihood_single_token(
self, requests: list[LoglikelihoodSingleTokenRequest]
) -> list[LoglikelihoodSingleTokenResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.

        Args:
            requests (list[LoglikelihoodSingleTokenRequest]): list of requests containing the
                context and single-token continuations to score.

        Returns:
            list[LoglikelihoodSingleTokenResponse]: log-likelihood results for each request.
"""
raise NotImplementedError()
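
The generation entry points above (greedy_until, loglikelihood, loglikelihood_single_token) and _init_max_length are still stubs at this commit; per the author's comment, greedy_until is the first target. For orientation, a minimal sketch of greedy generation with an AutoProcessor and AutoModelForVision2Seq; the model id, helper name, and input format are assumptions for illustration, not the lighteval API:

import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

model_id = "HuggingFaceM4/idefics2-8b"  # illustrative VLM checkpoint
processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
model = AutoModelForVision2Seq.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
model.eval()


def greedy_generate(
    prompts: list[str],
    images: list[list[Image.Image]],
    max_new_tokens: int = 256,
) -> list[str]:
    # The processor tokenizes the prompts and prepares pixel values in one call.
    inputs = processor(
        text=prompts, images=images, return_tensors="pt", padding=True
    ).to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs, max_new_tokens=max_new_tokens, do_sample=False
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output_ids[:, inputs["input_ids"].shape[1] :]
    return processor.batch_decode(new_tokens, skip_special_tokens=True)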
57 changes: 57 additions & 0 deletions src/lighteval/tasks/default_prompts.py
@@ -43,6 +43,63 @@
# fmt: on


def mmmu(line, task_name: str = None):
import base64
from io import BytesIO

standard = "Answer with the option letter from the given choices directly."

def replace_images_tokens(input_string):
image_order = [int(num) for num in re.findall(r"<image\s+(\d+)>", input_string)]
input_string = re.sub(r"<image\s+\d+>", "<image>", input_string)
return input_string, image_order

def parse_options(options):
option_letters = [chr(ord("A") + i) for i in range(len(options))]
choices_str = "\n".join(
[f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)]
)
return choices_str

def construct_prompt(doc):
question = doc["question"]
parsed_options = parse_options(ast.literal_eval(str(doc["options"])))
question = f"{question}\n{parsed_options}\n{standard}"
return question

def origin_mmmu_doc_to_visual(doc, image_order):
visual = []
for idx in image_order:
visual.append(doc[f"image_{idx}"])
return visual

def mmmu_doc_to_text(doc):
question = construct_prompt(doc)
return replace_images_tokens(question)

def encode_pil_image(pil_image):
# Create a byte stream object
buffered = BytesIO()
# Save the PIL image object as a byte stream in PNG format
pil_image.save(buffered, format="PNG")
# Get the byte stream data and perform Base64 encoding
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
return img_str

prompt, image_order = mmmu_doc_to_text(line)
images = origin_mmmu_doc_to_visual(line, image_order)

images = [encode_pil_image(image) for image in images]

return Doc(
task_name=task_name,
query=prompt,
choices=line["options"],
gold_index=string.ascii_uppercase.index(line["answer"]),
specific={"images": images, "id": line["id"]},
)
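
# A worked example of what `mmmu` produces for a toy MMMU-style row; the row
# below is made up, while the field names follow the code above:
#
#     line = {
#         "id": "validation_Art_1",
#         "question": "What style is shown in <image 1>?",
#         "options": ["Baroque", "Cubism", "Impressionism", "Surrealism"],
#         "answer": "C",
#         "image_1": pil_image,  # a PIL.Image.Image
#     }
#     doc = mmmu(line, task_name="mmmu_pro")
#
#     # doc.query:
#     #     What style is shown in <image>?
#     #     A. Baroque
#     #     B. Cubism
#     #     C. Impressionism
#     #     D. Surrealism
#     #     Answer with the option letter from the given choices directly.
#     # doc.gold_index == 2 (option "C"); doc.specific["images"] holds the
#     # images base64-encoded as PNG strings.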


def aime_prompt_fn(line, task_name: str = None):
# Prompt template adapted from
# - simple-evals: https://github.com/openai/simple-evals/blob/6e84f4e2aed6b60f6a0c7b8f06bbbf4bfde72e58/math_eval.py#L17
16 changes: 16 additions & 0 deletions src/lighteval/tasks/default_tasks.py
@@ -24,6 +24,22 @@
from lighteval.tasks.lighteval_task import LightevalTaskConfig


mmmu_pro = LightevalTaskConfig(
name="mmmu_pro",
suite=["lighteval"],
prompt_function=prompt.mmmu,
hf_repo="MMMU/MMMU_pro",
hf_subset="standard (4 options)",
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=None,
few_shots_select=None,
generation_size=1,
metric=[Metrics.exact_match],
stop_sequence=None,
trust_dataset=True,
version=0,
)
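
# Registered under the "lighteval" suite, the task is addressed by the spec
# string "lighteval|mmmu_pro|0|0" (suite|task|few-shot count|truncate flag).
# A hedged CLI sketch; the exact flags vary across lighteval versions and
# this command is an assumption, not taken from the PR:
#
#     lighteval accelerate \
#         "model_name=HuggingFaceM4/idefics2-8b" \
#         "lighteval|mmmu_pro|0|0"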
abstract_narrative_understanding_bigbench = LightevalTaskConfig(
name="abstract_narrative_understanding",
suite=["bigbench", "bigbench_json"],