huggingface · plaguss · Feb 21, 2025 · Feb 25, 2025
diff --git a/src/lighteval/utils/code/__init__.py b/src/lighteval/utils/code/__init__.py
@@ -0,0 +1,21 @@
+# MIT License
+
+# Copyright (c) 2025 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
diff --git a/src/lighteval/utils/code/e2b.py b/src/lighteval/utils/code/e2b.py
@@ -0,0 +1,156 @@
+# MIT License
+
+# Copyright (c) 2025 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import json
+
+from lighteval.utils.code.types import CodeRunnerOutput
+from lighteval.utils.code.utils import prepare_fn
+from lighteval.utils.imports import is_e2b_available
+
+
+# STDIO is related to a code snippet that reads from stdin and writes to stdout
+EVALUATION_SCRIPT_STDIO = """\
+import subprocess
+import json
+
+def evaluate_code(code, test_cases):
+    outputs = []
+    exec_timeout = 5
+    for test_case in test_cases:
+        process = subprocess.run(
+            ["python3", "-c", code],
+            input=test_case,
+            text=True,
+            capture_output=True,
+            timeout=exec_timeout
+        )
+        if process.returncode != 0:  # Error in execution
+            outputs.append('')
+        outputs.append(process.stdout.strip())
+    return outputs
+
+evaluate_code({code}, json.loads({test_cases}))
+"""
+
+# CALL_BASED is related to a function that is executed, and the output printed (so it can be captured)
+# No input needs to be passed, the arguments will be passed to a function call internally (has to be
+# present in the code senippet supplied).
+EVALUATION_SCRIPT_CALL_BASED = """\
+import subprocess
+import json
+import tempfile
+import os
+
+def evaluate_code(code, test_cases):
+    outputs = []
+    exec_timeout = 5
+
+    for test_case in test_cases:
+        script = code.format(args=test_case)
+        # Create a temporary Python file with just the code
+        try:
+            process = subprocess.run(
+                ["python", "-c", script],
+                text=True,
+                capture_output=True,
+                timeout=exec_timeout
+            )
+
+            if process.returncode != 0:
+                outputs.append("Execution error: " + process.stderr.strip())
+            else:
+                outputs.append(process.stdout.strip())
+
+        except subprocess.TimeoutExpired:
+            outputs.append("Timeout error")
+
+    return outputs
+
+evaluate_code({code}, json.loads({test_cases}))
+"""
+
+
+# TODO: Update the inputs type, it should be a list with dicts, and a list with
+# the inputs inside, so we can gather the data there, i.e.
+# [{input: [input1, input2, ...], "fn_name": ..}, {input: [input1, input2, ...], "fn_name": ..}, ...]
+def code_runner_e2b(inputs: list[list[str]], code: list[str], timeout: int = 300, **kwargs) -> CodeRunnerOutput:
+    """Runs the code in an e2b sandbox and returns the output or error: https://e2b.dev/
+
+    Args:
+        inputs (list[list[str]]): List of lists with test cases, each list corresponding
+            one element in the code list.
+        code (list[str]): The list of code snippets to run. For example they could correspond to N
+            generations of a model.
+        timeout (int): The timeout in seconds for the code to run. Defaults to 300.
+        **kwargs: Additional keyword arguments. This arguments will be passed to the e2b sandbox.
+            For example, `language` can be passed to specify the language of the code (defaults
+            to "python"), or the `request_timeout` (defaults to 3 seconds).
+
+    Returns:
+        CodeRunnerOutput: The output or error of the code.
+    """
+    if is_e2b_available():
+        from dotenv import load_dotenv
+        from e2b_code_interpreter import Sandbox
+
+        load_dotenv()
+    else:
+        raise ValueError("e2b is not available, install it with `pip install e2b_code_interpreter`")
+
+    template = kwargs.get("template", "stdio")
+    if template == "stdio":
+
+        def populate_template(code: str, info: list[str]) -> str:
+            return EVALUATION_SCRIPT_STDIO.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info)))
+
+    elif template == "call_based":
+        # This template doesn't take inputs, they must be written in the code snippet directly.
+        # The code snippets have to be prepared beforehand, as we won't pass them in the template
+        # script.
+        # In this case we prepare the function by adding a call to the function generated
+        # (it shouldn't be present). After that, the function is added to the template
+        def populate_template(code: str, info: list[str]) -> str:
+            formatted_code = prepare_fn(code, fn_name=kwargs.get("fn_name"))
+
+            return EVALUATION_SCRIPT_CALL_BASED.format(
+                code=json.dumps(formatted_code), test_cases=json.dumps(json.dumps(info))
+            )
+
+    else:
+        raise ValueError(f"Template {template} is not valid")
+
+    # Prepare the "scripts" to run
+    scripts = []
+    for code_snippet, info in zip(code, inputs):
+        scripts.append(populate_template(code_snippet, info))
+
+    language = kwargs.get("language", "python")
+    outputs = []
+    with Sandbox(timeout=timeout, request_timeout=kwargs.get("request_timeout", 3)) as sbx:
+        for script in scripts:
+            execution = sbx.run_code(script, language=language)
+            # The execution returns an object like the following:
+            # Execution(Results: [Result(...)], Logs: Logs(stdout: [], stderr: []), Error: None)
+            # Get the text representation of the result
+            # If everything went well, the result will be the output of the code in the .text attribute
+            outputs.append(execution.text)
+
+    return CodeRunnerOutput(output=outputs, error=None)
diff --git a/src/lighteval/utils/code/local.py b/src/lighteval/utils/code/local.py
@@ -0,0 +1,26 @@
+# MIT License
+
+# Copyright (c) 2025 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+from lighteval.utils.code.types import CodeRunnerOutput
+
+
+def code_runner_local(inputs: list[list[str]], code: list[str], timeout: int = 300, **kwargs) -> CodeRunnerOutput:
+    raise NotImplementedError("Not implemented")
diff --git a/src/lighteval/utils/code/runner.py b/src/lighteval/utils/code/runner.py
@@ -0,0 +1,47 @@
+# MIT License
+
+# Copyright (c) 2025 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+from lighteval.utils.code.e2b import code_runner_e2b
+from lighteval.utils.code.local import code_runner_local
+from lighteval.utils.code.types import CodeRunnerOutput
+
+
+def code_runner(inputs: list[list[str]], code: list[str], timeout: int = 300, **kwargs) -> CodeRunnerOutput:
+    """Runs the code with the given input and returns the output or error.
+
+    Args:
+        inputs (list[list[str]]): List of lists with test cases, each list corresponding
+            one element in the code list.
+        code (list[str]): The list of code snippets to run. For example they could correspond to N
+            generations of a model.
+        timeout (int): The timeout in seconds for the code to run. Defaults to 300.
+        **kwargs: Additional keyword arguments. This arguments will be passed to the e2b sandbox.
+            For example, `language` can be passed to specify the language of the code (defaults
+            to "python"), or the `request_timeout` (defaults to 3 seconds).
+
+    Returns:
+        CodeRunnerOutput: The output or error of the code.
+    """
+    sandbox = kwargs.get("sandbox", None)
+    if sandbox == "e2b":
+        return code_runner_e2b(inputs, code, timeout, **kwargs)
+
+    return code_runner_local(inputs, code, timeout, **kwargs)
diff --git a/src/lighteval/utils/code/types.py b/src/lighteval/utils/code/types.py
@@ -0,0 +1,30 @@
+# MIT License
+
+# Copyright (c) 2025 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+from typing import Literal, TypedDict
+
+
+AvailableSandboxes = Literal["e2b", "local"]
+
+
+class CodeRunnerOutput(TypedDict):
+    output: list[str] | None
+    error: str | None
diff --git a/src/lighteval/utils/code/utils.py b/src/lighteval/utils/code/utils.py
@@ -0,0 +1,88 @@
+# MIT License
+
+# Copyright (c) 2025 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import ast
+import sys
+import textwrap
+from io import StringIO
+
+
+# Import string used across code snippets generated by LiveCodeBench.
+IMPORT_STRING = "from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(50000)\n"
+
+
+def clean_if_name(code: str) -> str:
+    try:
+        astree = ast.parse(code)
+        last_block = astree.body[-1]
+        if isinstance(last_block, ast.If):
+            condition = last_block.test
+            if ast.unparse(condition).strip() == "__name__ == '__main__'":
+                code = (
+                    ast.unparse(astree.body[:-1]) + "\n" + ast.unparse(last_block.body)  # type: ignore
+                )
+    except Exception:
+        pass
+
+    return code
+
+
+class Capturing(list):
+    def __enter__(self):
+        self._stdout = sys.stdout
+        sys.stdout = self._stringio = StringIO()
+        # Make closing the StringIO a no-op
+        self._stringio.close = lambda x: 1
+        return self
+
+    def __exit__(self, *args):
+        self.append(self._stringio.getvalue())
+        del self._stringio  # free up some memory
+        sys.stdout = self._stdout
+
+
+def prepare_fn(raw_code: str, fn_name: str = "maxLength") -> str:
+    """Prepares the code for execution by adding the function call at the end of the code.
+    Necessary to capture the results thrown to stdout by e2b for code examples that generated
+    a function but it's not run directly in the example given. Additionally checks for a class
+    named Solution to store the method/function defined, which is typical from LeetCode
+    platform. If present, will take into account for the function call.
+
+    Args:
+        raw_code (str): The raw code snippet.
+        fn_name (str): The name of the function to call.
+            This information should appear directly in the benchmark dataset.
+            Defaults to "maxLength" just as an example.
+    """
+    if "class Solution:" in raw_code:
+        fn_call_template = """
+        result = Solution.{fn_name}({args})
+        print(result)
+        """
+    else:
+        fn_call_template = """
+        result = {fn_name}({args})
+        print(result)
+        """
+
+    fn_call = textwrap.dedent(fn_call_template.format(args="{args}", fn_name=fn_name))
+    code = f"{raw_code}\n{fn_call}"
+    return code
diff --git a/src/lighteval/utils/imports.py b/src/lighteval/utils/imports.py
@@ -147,3 +147,7 @@ def wrapper(*args, **kwargs):
 
 
 NO_LATEX2SYMPY2_EXTENDED_ERROR_MSG = "You are trying to parse latex expressions, for which you need `latex2sympy2_extended`, which is not available in your environment. Please install it using `pip install lighteval[math]`."
+
+
+def is_e2b_available():
+    return importlib.util.find_spec("e2b_code_interpreter") is not None
Original file line number	Diff line number	Diff line change
Expand Up		@@ -147,3 +147,7 @@ def wrapper(args, *kwargs):


		NO_LATEX2SYMPY2_EXTENDED_ERROR_MSG = "You are trying to parse latex expressions, for which you need `latex2sympy2_extended`, which is not available in your environment. Please install it using `pip install lighteval[math]`."


		def is_e2b_available():
		return importlib.util.find_spec("e2b_code_interpreter") is not None