Skip to content

Add draft functionality for a generic sandboxed code running #580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions src/lighteval/utils/code/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# MIT License

# Copyright (c) 2025 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
156 changes: 156 additions & 0 deletions src/lighteval/utils/code/e2b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# MIT License

# Copyright (c) 2025 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json

from lighteval.utils.code.types import CodeRunnerOutput
from lighteval.utils.code.utils import prepare_fn
from lighteval.utils.imports import is_e2b_available


# STDIO is related to a code snippet that reads from stdin and writes to stdout
# NOTE: this template is rendered with ``str.format``; ``{code}`` and ``{test_cases}``
# are its only placeholders, so any other literal brace added below must be doubled.
# The rendered script feeds each test case to the snippet through stdin and returns a
# list with exactly one entry per test case (the stripped stdout, or '' on failure).
EVALUATION_SCRIPT_STDIO = """\
import subprocess
import json

def evaluate_code(code, test_cases):
    # Exactly one entry per test case, in order, so outputs line up with the inputs.
    outputs = []
    exec_timeout = 5
    for test_case in test_cases:
        try:
            process = subprocess.run(
                ["python3", "-c", code],
                input=test_case,
                text=True,
                capture_output=True,
                timeout=exec_timeout
            )
        except subprocess.TimeoutExpired:
            # A snippet that hangs only fails its own test case.
            outputs.append('')
            continue
        if process.returncode != 0:  # Error in execution
            outputs.append('')
        else:
            outputs.append(process.stdout.strip())
    return outputs

evaluate_code({code}, json.loads({test_cases}))
"""

# CALL_BASED is related to a function that is executed, and the output printed (so it can be captured)
# No input needs to be passed, the arguments will be passed to a function call internally (has to be
# present in the code snippet supplied).
# NOTE: this template is rendered with ``str.format``; ``{code}`` and ``{test_cases}``
# are its placeholders, and the doubled braces in ``{{args}}`` render to the literal
# ``{args}`` marker that the rendered script replaces with each test case.
EVALUATION_SCRIPT_CALL_BASED = """\
import subprocess
import json

def evaluate_code(code, test_cases):
    # Exactly one entry per test case, in order.
    outputs = []
    exec_timeout = 5

    for test_case in test_cases:
        # Substitute only the call-arguments marker. str.format would also try to
        # interpret every other brace in the snippet (dict or set literals, f-strings).
        script = code.replace("{{args}}", str(test_case))
        try:
            process = subprocess.run(
                ["python", "-c", script],
                text=True,
                capture_output=True,
                timeout=exec_timeout
            )

            if process.returncode != 0:
                outputs.append("Execution error: " + process.stderr.strip())
            else:
                outputs.append(process.stdout.strip())

        except subprocess.TimeoutExpired:
            outputs.append("Timeout error")

    return outputs

evaluate_code({code}, json.loads({test_cases}))
"""


# TODO: Update the inputs type, it should be a list with dicts, and a list with
# the inputs inside, so we can gather the data there, i.e.
# [{input: [input1, input2, ...], "fn_name": ..}, {input: [input1, input2, ...], "fn_name": ..}, ...]
def code_runner_e2b(inputs: list[list[str]], code: list[str], timeout: int = 300, **kwargs) -> CodeRunnerOutput:
    """Runs the code in an e2b sandbox and returns the output or error: https://e2b.dev/

    Every snippet in `code` is paired positionally with its test cases in `inputs`,
    rendered into one of the evaluation-script templates, and the resulting scripts are
    executed one after another inside a single sandbox.

    Args:
        inputs (list[list[str]]): List of lists with test cases, each list corresponding
            to one element in the code list.
        code (list[str]): The list of code snippets to run. For example they could correspond to N
            generations of a model.
        timeout (int): The timeout in seconds, passed to the `Sandbox` constructor. Defaults to 300.
        **kwargs: Additional keyword arguments. The ones read here are:
            - `template`: "stdio" (default) or "call_based", selects the evaluation script.
            - `fn_name`: function to call, only used by the "call_based" template. When it is
              not supplied, the default of `prepare_fn` applies.
            - `language`: passed to `Sandbox.run_code` (defaults to "python").
            - `request_timeout`: passed to the `Sandbox` constructor (defaults to 3).

    Returns:
        CodeRunnerOutput: `output` holds the `text` of each execution, in the order the
            scripts were run; `error` is always None.

    Raises:
        ValueError: If `e2b_code_interpreter` is not installed, or `template` is not valid.
    """
    if not is_e2b_available():
        raise ValueError("e2b is not available, install it with `pip install e2b_code_interpreter`")

    # Imported lazily so this module can be imported without the optional dependency.
    from dotenv import load_dotenv
    from e2b_code_interpreter import Sandbox

    load_dotenv()

    template = kwargs.get("template", "stdio")
    if template == "stdio":

        def populate_template(snippet: str, info: list[str]) -> str:
            # The test cases are JSON-encoded twice: the outer layer turns the JSON text
            # into a string literal that the rendered script hands to `json.loads`.
            return EVALUATION_SCRIPT_STDIO.format(code=json.dumps(snippet), test_cases=json.dumps(json.dumps(info)))

    elif template == "call_based":
        # Only forward `fn_name` when the caller supplied one. Passing None through would
        # override the default of `prepare_fn` and generate a literal `None(...)` call.
        fn_name = kwargs.get("fn_name")
        prepare_kwargs = {} if fn_name is None else {"fn_name": fn_name}

        def populate_template(snippet: str, info: list[str]) -> str:
            # `prepare_fn` appends the function call (with an args marker) to the snippet;
            # the rendered script fills that marker in with each test case.
            formatted_code = prepare_fn(snippet, **prepare_kwargs)

            return EVALUATION_SCRIPT_CALL_BASED.format(
                code=json.dumps(formatted_code), test_cases=json.dumps(json.dumps(info))
            )

    else:
        raise ValueError(f"Template {template} is not valid")

    # Prepare the "scripts" to run, one per (snippet, test cases) pair.
    scripts = [populate_template(code_snippet, info) for code_snippet, info in zip(code, inputs)]

    language = kwargs.get("language", "python")
    outputs = []
    with Sandbox(timeout=timeout, request_timeout=kwargs.get("request_timeout", 3)) as sbx:
        for script in scripts:
            execution = sbx.run_code(script, language=language)
            # The execution returns an object like the following:
            # Execution(Results: [Result(...)], Logs: Logs(stdout: [], stderr: []), Error: None)
            # If everything went well, the result will be the output of the code in the .text attribute
            # NOTE(review): presumably `.text` is None when the script itself fails inside the
            # sandbox - confirm against the e2b SDK before relying on `output` being list[str].
            outputs.append(execution.text)

    return CodeRunnerOutput(output=outputs, error=None)
26 changes: 26 additions & 0 deletions src/lighteval/utils/code/local.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# MIT License

# Copyright (c) 2025 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from lighteval.utils.code.types import CodeRunnerOutput


def code_runner_local(inputs: list[list[str]], code: list[str], timeout: int = 300, **kwargs) -> CodeRunnerOutput:
    """Placeholder for a runner that executes the code snippets locally.

    The signature mirrors `code_runner_e2b`; none of the arguments are used yet.

    Raises:
        NotImplementedError: Always, local execution is not implemented.
    """
    reason = "Not implemented"
    raise NotImplementedError(reason)
47 changes: 47 additions & 0 deletions src/lighteval/utils/code/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# MIT License

# Copyright (c) 2025 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from lighteval.utils.code.e2b import code_runner_e2b
from lighteval.utils.code.local import code_runner_local
from lighteval.utils.code.types import CodeRunnerOutput


def code_runner(inputs: list[list[str]], code: list[str], timeout: int = 300, **kwargs) -> CodeRunnerOutput:
    """Dispatches the code snippets to the selected runner and returns its result.

    Args:
        inputs (list[list[str]]): List of lists with test cases, each list corresponding
            to one element in the code list.
        code (list[str]): The list of code snippets to run. For example they could correspond to N
            generations of a model.
        timeout (int): The timeout in seconds for the code to run. Defaults to 300.
        **kwargs: Additional keyword arguments, forwarded untouched to the selected runner.
            The `sandbox` key picks the runner: "e2b" selects `code_runner_e2b`, any other
            value (or no value) selects `code_runner_local`.

    Returns:
        CodeRunnerOutput: The output or error of the code.
    """
    use_e2b = kwargs.get("sandbox", None) == "e2b"
    selected_runner = code_runner_e2b if use_e2b else code_runner_local
    return selected_runner(inputs, code, timeout, **kwargs)
30 changes: 30 additions & 0 deletions src/lighteval/utils/code/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# MIT License

# Copyright (c) 2025 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from typing import Literal, TypedDict


# Names of the sandbox backends that can be selected to run code.
AvailableSandboxes = Literal["e2b", "local"]


class CodeRunnerOutput(TypedDict):
    """Result mapping returned by the code runner functions."""

    # Text outputs produced by the executed code, or None.
    output: list[str] | None
    # Error description, or None.
    error: str | None
88 changes: 88 additions & 0 deletions src/lighteval/utils/code/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# MIT License

# Copyright (c) 2025 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import ast
import sys
import textwrap
from io import StringIO


# Import string used across code snippets generated by LiveCodeBench.
IMPORT_STRING = (
    # Star imports, so snippets can use the names unqualified.
    "from string import *\n"
    "from re import *\n"
    "from datetime import *\n"
    "from collections import *\n"
    "from heapq import *\n"
    "from bisect import *\n"
    "from copy import *\n"
    "from math import *\n"
    "from random import *\n"
    "from statistics import *\n"
    "from itertools import *\n"
    "from functools import *\n"
    "from operator import *\n"
    "from io import *\n"
    "from sys import *\n"
    "from json import *\n"
    "from builtins import *\n"
    "from typing import *\n"
    # Plain imports, so the qualified module names resolve as well.
    "import string\n"
    "import re\n"
    "import datetime\n"
    "import collections\n"
    "import heapq\n"
    "import bisect\n"
    "import copy\n"
    "import math\n"
    "import random\n"
    "import statistics\n"
    "import itertools\n"
    "import functools\n"
    "import operator\n"
    "import io\n"
    "import sys\n"
    "import json\n"
    # Raise the recursion limit for the snippet.
    "sys.setrecursionlimit(50000)\n"
)


def clean_if_name(code: str) -> str:
    """Inline a trailing ``if __name__ == '__main__':`` block into the module body.

    When the last top-level statement of `code` is such a guard, the result is the
    unparsed statements before it, a newline, and the unparsed body of the guard.
    Anything else, including code that cannot be parsed, is returned unchanged.

    Args:
        code (str): Python source code.

    Returns:
        str: The rewritten source, or `code` itself when no rewrite applies.
    """
    try:
        tree = ast.parse(code)
        tail = tree.body[-1]
        is_main_guard = (
            isinstance(tail, ast.If)
            and ast.unparse(tail.test).strip() == "__name__ == '__main__'"
        )
        if is_main_guard:
            leading_src = ast.unparse(tree.body[:-1])  # type: ignore
            guarded_src = ast.unparse(tail.body)  # type: ignore
            return leading_src + "\n" + guarded_src
    except Exception:
        # Best effort: any failure (syntax error, empty module, ...) keeps the input as is.
        pass

    return code


class Capturing(list):
    """Context manager that captures what is written to ``sys.stdout``.

    On enter, ``sys.stdout`` is replaced by an in-memory buffer. On exit, the captured
    text is appended to this list as a single string and the previous ``sys.stdout``
    is restored.
    """

    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        # Make closing the StringIO a no-op, so code under capture that calls
        # `sys.stdout.close()` cannot discard the buffer. The replacement is stored on
        # the instance (not bound), so it has to be callable without arguments.
        self._stringio.close = lambda *_args: None
        return self

    def __exit__(self, *args):
        try:
            self.append(self._stringio.getvalue())
        finally:
            # Restore stdout even if reading the buffer failed.
            del self._stringio  # free up some memory
            sys.stdout = self._stdout


def prepare_fn(raw_code: str, fn_name: str = "maxLength") -> str:
    """Prepares the code for execution by adding the function call at the end of the code.

    Necessary to capture the results thrown to stdout by e2b for code examples that generated
    a function but it's not run directly in the example given. Additionally checks for a class
    named Solution holding the method/function, which is typical from the LeetCode platform.
    If present, the method is called on a fresh `Solution()` instance.

    The appended call contains a literal `{args}` marker in place of the arguments; it is
    meant to be filled in later with the arguments of each test case.

    Args:
        raw_code (str): The raw code snippet.
        fn_name (str): The name of the function to call.
            This information should appear directly in the benchmark dataset.
            Defaults to "maxLength" just as an example.

    Returns:
        str: `raw_code` followed by a blank line, `result = <call>({args})` and `print(result)`.
    """
    has_solution_class = any(marker in raw_code for marker in ("class Solution:", "class Solution("))
    # LeetCode-style methods take `self`, so they must be called through an instance:
    # calling them on the class would bind the first argument to `self`.
    call_target = f"Solution().{fn_name}" if has_solution_class else fn_name

    # The doubled braces keep a literal `{args}` marker in the generated code.
    fn_call = f"\nresult = {call_target}({{args}})\nprint(result)\n"
    return f"{raw_code}\n{fn_call}"
4 changes: 4 additions & 0 deletions src/lighteval/utils/imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,7 @@ def wrapper(*args, **kwargs):


NO_LATEX2SYMPY2_EXTENDED_ERROR_MSG = "You are trying to parse latex expressions, for which you need `latex2sympy2_extended`, which is not available in your environment. Please install it using `pip install lighteval[math]`."


def is_e2b_available():
    """Tell whether the `e2b_code_interpreter` package can be found in the current environment.

    Returns True when `importlib.util.find_spec` finds a spec for the package, False otherwise.
    """
    spec = importlib.util.find_spec("e2b_code_interpreter")
    return spec is not None