From fac137eae51ac9ba0ba8ccb944a9a91fa3f9154c Mon Sep 17 00:00:00 2001 From: "T. Duy Nguyen-Hien" Date: Tue, 6 May 2025 19:40:04 +0800 Subject: [PATCH 01/13] - removed htrack_block in src/lighteval/main_nanotron.py - fixed import path for NanotronLightevalModel src/lighteval/pipeline.py --- src/lighteval/main_nanotron.py | 31 ++++++++++++++----------------- src/lighteval/pipeline.py | 2 +- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 94004c065..345be1fe9 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -52,7 +52,6 @@ def nanotron( from lighteval.config.lighteval_config import FullNanotronConfig, LightEvalConfig from lighteval.logging.evaluation_tracker import EvaluationTracker - from lighteval.logging.hierarchical_logger import htrack_block from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters from lighteval.utils.imports import NO_NANOTRON_ERROR_MSG, is_nanotron_available from lighteval.utils.utils import EnvConfig @@ -62,22 +61,20 @@ def nanotron( if not is_nanotron_available(): raise ImportError(NO_NANOTRON_ERROR_MSG) - with htrack_block("Load nanotron config"): - # Create nanotron config - if not checkpoint_config_path.endswith(".yaml"): - raise ValueError("The checkpoint path should point to a YAML file") - - model_config = get_config_from_file( - checkpoint_config_path, - config_class=Config, - model_config_class=None, - skip_unused_config_keys=True, - skip_null_keys=True, - ) - - # We are getting an type error, because the get_config_from_file is not correctly typed, - lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore - nanotron_config = FullNanotronConfig(lighteval_config, model_config) + if not checkpoint_config_path.endswith(".yaml"): + raise ValueError("The checkpoint path should point to a YAML file") + + model_config = get_config_from_file( + checkpoint_config_path, + config_class=Config, + model_config_class=None, + skip_unused_config_keys=True, + skip_null_keys=True, + ) + + # We are getting an type error, because the get_config_from_file is not correctly typed, + lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore + nanotron_config = FullNanotronConfig(lighteval_config, model_config) evaluation_tracker = EvaluationTracker( output_dir=lighteval_config.logging.output_dir, diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 39e007b33..439286f10 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -72,7 +72,7 @@ from nanotron.parallel.context import ParallelContext from nanotron.utils import local_ranks_zero_first - from lighteval.models.nanotron_model import NanotronLightevalModel + from lighteval.models.nanotron.nanotron_model import NanotronLightevalModel import logging From 044c86534cfc55f626840d7b34f5ed0b1ee55a73 Mon Sep 17 00:00:00 2001 From: nouamanetazi Date: Sat, 22 Mar 2025 10:59:37 +0000 Subject: [PATCH 02/13] . 
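This patch moves NanotronLightevalModel to
src/lighteval/models/nanotron/nanotron_model.py (keeping a temporary
re-export shim at the old path), passes explicit position_ids to the
nanotron forward call, reshapes the TP-gathered logits with
out.view(*batch_model.input_ids.shape, -1) instead of a transpose, makes
--lighteval-config-path optional with built-in defaults, and gives
FullNanotronConfig a generation_parameters property that falls back to
default GenerationArgs when the lighteval config defines no generation
section. A minimal sketch of that fallback, assuming GenerationArgs is
importable from nanotron.config:

    from nanotron.config import GenerationArgs
    from lighteval.config.lighteval_config import FullNanotronConfig

    # lighteval_config and model_config as loaded in main_nanotron.nanotron()
    config = FullNanotronConfig(lighteval_config, model_config)
    # Returns lighteval_config.generation if set, else a default GenerationArgs()
    params = config.generation_parameters
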
rebased pr #656 --- src/lighteval/config/lighteval_config.py | 8 +++ src/lighteval/main_nanotron.py | 51 ++++++++++++------- src/lighteval/models/__init__.py | 21 ++++++++ src/lighteval/models/nanotron/__init__.py | 21 ++++++++ .../models/nanotron/nanotron_model.py | 22 +++++--- src/lighteval/models/nanotron_model.py | 26 ++++++++++ src/lighteval/pipeline.py | 7 ++- 7 files changed, 129 insertions(+), 27 deletions(-) create mode 100644 src/lighteval/models/__init__.py create mode 100644 src/lighteval/models/nanotron/__init__.py create mode 100644 src/lighteval/models/nanotron_model.py diff --git a/src/lighteval/config/lighteval_config.py b/src/lighteval/config/lighteval_config.py index f24a15184..0e8217afe 100644 --- a/src/lighteval/config/lighteval_config.py +++ b/src/lighteval/config/lighteval_config.py @@ -101,3 +101,11 @@ class LightEvalConfig: class FullNanotronConfig: lighteval_config: LightEvalConfig nanotron_config: "Config" + + @property + def generation_parameters(self): + # Return the generation parameters from the lighteval config + # or create default generation parameters if none are set + if self.lighteval_config.generation: + return self.lighteval_config.generation + return GenerationArgs() diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 94004c065..1b973a112 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -42,17 +42,17 @@ def nanotron( checkpoint_config_path: Annotated[ str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.") ], - lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")], + lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")] = None, cache_dir: Annotated[str, Option(help="Cache directory for datasets and models.")] = CACHE_DIR, ): """ Evaluate models using nanotron as backend. 
""" from nanotron.config import Config, get_config_from_file + from nanotron.config.parallelism_config import ParallelismArgs - from lighteval.config.lighteval_config import FullNanotronConfig, LightEvalConfig + from lighteval.config.lighteval_config import FullNanotronConfig, LightEvalConfig, LightEvalLoggingArgs, LightEvalTasksArgs from lighteval.logging.evaluation_tracker import EvaluationTracker - from lighteval.logging.hierarchical_logger import htrack_block from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters from lighteval.utils.imports import NO_NANOTRON_ERROR_MSG, is_nanotron_available from lighteval.utils.utils import EnvConfig @@ -61,23 +61,38 @@ def nanotron( if not is_nanotron_available(): raise ImportError(NO_NANOTRON_ERROR_MSG) + + # Create nanotron config + if not checkpoint_config_path.endswith(".yaml"): + raise ValueError("The checkpoint path should point to a YAML file") + + model_config = get_config_from_file( + checkpoint_config_path, + config_class=Config, + model_config_class=None, + skip_unused_config_keys=True, + skip_null_keys=True, + ) - with htrack_block("Load nanotron config"): - # Create nanotron config - if not checkpoint_config_path.endswith(".yaml"): - raise ValueError("The checkpoint path should point to a YAML file") - - model_config = get_config_from_file( - checkpoint_config_path, - config_class=Config, - model_config_class=None, - skip_unused_config_keys=True, - skip_null_keys=True, - ) - - # We are getting an type error, because the get_config_from_file is not correctly typed, + # Create or use default lighteval config + if lighteval_config_path is not None: lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore - nanotron_config = FullNanotronConfig(lighteval_config, model_config) + else: + # Create default config with minimal required parameters + default_logging = LightEvalLoggingArgs( + output_dir="./eval_results" + ) + default_tasks = LightEvalTasksArgs( + tasks="lighteval|agieval:aqua-rat|5|0" + ) + default_parallelism = ParallelismArgs(dp=1, pp=1, tp=1) + lighteval_config = LightEvalConfig( + logging=default_logging, + tasks=default_tasks, + parallelism=default_parallelism + ) + + nanotron_config = FullNanotronConfig(lighteval_config, model_config) evaluation_tracker = EvaluationTracker( output_dir=lighteval_config.logging.output_dir, diff --git a/src/lighteval/models/__init__.py b/src/lighteval/models/__init__.py new file mode 100644 index 000000000..064e2842d --- /dev/null +++ b/src/lighteval/models/__init__.py @@ -0,0 +1,21 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. \ No newline at end of file diff --git a/src/lighteval/models/nanotron/__init__.py b/src/lighteval/models/nanotron/__init__.py new file mode 100644 index 000000000..064e2842d --- /dev/null +++ b/src/lighteval/models/nanotron/__init__.py @@ -0,0 +1,21 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. \ No newline at end of file diff --git a/src/lighteval/models/nanotron/nanotron_model.py b/src/lighteval/models/nanotron/nanotron_model.py index 5d5bb934c..b785eeb0b 100644 --- a/src/lighteval/models/nanotron/nanotron_model.py +++ b/src/lighteval/models/nanotron/nanotron_model.py @@ -343,7 +343,14 @@ def tok_decode(self, tokens: torch.LongTensor) -> List[str]: return self.tokenizer.batch_decode(tokens, skip_special_tokens=True) def _model_call(self, inputs: torch.Tensor) -> torch.Tensor: - return self.model(inputs) + position_ids = ( + torch.arange( + inputs.shape[1], device=inputs.device, dtype=torch.int32 + ) + .unsqueeze(0) + .repeat(inputs.shape[0], 1) + ) + return self.model(inputs, position_ids) def homogeneize_ending_conditions(self, ending_condition: tuple | dict | list | str) -> tuple[list, int]: """Ending conditions are submitted in several possible formats. 
@@ -711,14 +718,14 @@ def _loglikelihood_single_token(
                 inputs, padding_length=max_context, max_context=max_context, full_attention_masks=True
             )  # batched_inputs, batch_attention, input_lengths, truncated, padded
 
-
-            out = self.model(input_ids=batch_model.input_ids, input_mask=batch_model.input_mask)
+            position_ids = torch.arange(batch_model.input_ids.shape[1], device=self.device, dtype=torch.int32).unsqueeze(0).repeat(batch_model.input_ids.shape[0], 1)
+            out = self.model(input_ids=batch_model.input_ids, position_ids=position_ids)
 
             if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
                 # This process got outputs
 
-                # Gather all the output across TP
-                out = out.transpose(0, 1).contiguous()  # [batch, seq_length, vocab]
+                # Gather all the output across TP
+                out = out.view(*batch_model.input_ids.shape, -1).contiguous()  # [batch, seq_length, vocab]
 
                 gathered_out = [torch.zeros_like(out) for _ in range(self.parallel_context.tp_pg.size())]
                 dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False)
@@ -944,7 +951,8 @@ def _loglikelihood_tokens(
             )  # batched_inputs, batch_attention, input_lengths, truncated, padded
 
             with torch.no_grad():
-                out = self.model(input_ids=batch_model.input_ids, input_mask=batch_model.input_mask)
+                position_ids = torch.arange(batch_model.input_ids.shape[1], device=self.device, dtype=torch.int32).unsqueeze(0).repeat(batch_model.input_ids.shape[0], 1)
+                out = self.model(input_ids=batch_model.input_ids, position_ids=position_ids)
 
             if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
                 # This process got outputs
@@ -954,7 +962,7 @@
                 dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False)
 
                 out = torch.cat(gathered_out, dim=-1)
-                out = out.transpose(0, 1)  # [batch, seq_length, vocab]
+                out = out.view(*batch_model.input_ids.shape, -1)  # [batch, seq_length, vocab]
                 multi_logits = F.log_softmax(out, dim=-1)  # [batch, padding_length, vocab]
 
                 logits_sum = []
diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py
new file mode 100644
index 000000000..4a1ed72c6
--- /dev/null
+++ b/src/lighteval/models/nanotron_model.py
@@ -0,0 +1,26 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+ +# Import and re-export the NanotronLightevalModel class from the nanotron module +from lighteval.models.nanotron.nanotron_model import NanotronLightevalModel + +__all__ = ["NanotronLightevalModel"] \ No newline at end of file diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 39e007b33..68ae71920 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -72,7 +72,7 @@ from nanotron.parallel.context import ParallelContext from nanotron.utils import local_ranks_zero_first - from lighteval.models.nanotron_model import NanotronLightevalModel + # from lighteval.models.nanotron import NanotronLightevalModel import logging @@ -187,15 +187,18 @@ def _init_model(self, model_config, model): logger.info("--- LOADING MODEL ---") if model_config is not None: if self.parallel_context: + from lighteval.models.nanotron_model import NanotronLightevalModel + return NanotronLightevalModel( checkpoint_path=os.path.dirname(self.pipeline_parameters.nanotron_checkpoint_path) if self.pipeline_parameters.nanotron_checkpoint_path else "", - nanotron_config=self.model_config, + nanotron_config=model_config, parallel_context=self.parallel_context, debug_one_layer_model=False, model_class=None, ) + # return None else: return load_model(config=model_config) if isinstance(model, TransformersModel): From aad905096ee7e9cf076fdc9c118bc3009bfe8bc0 Mon Sep 17 00:00:00 2001 From: nouamanetazi Date: Sat, 22 Mar 2025 11:31:06 +0000 Subject: [PATCH 03/13] . --- src/lighteval/main_nanotron.py | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 1b973a112..22755997a 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -42,7 +42,7 @@ def nanotron( checkpoint_config_path: Annotated[ str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.") ], - lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")] = None, + lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")], cache_dir: Annotated[str, Option(help="Cache directory for datasets and models.")] = CACHE_DIR, ): """ @@ -74,23 +74,8 @@ def nanotron( skip_null_keys=True, ) - # Create or use default lighteval config - if lighteval_config_path is not None: - lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore - else: - # Create default config with minimal required parameters - default_logging = LightEvalLoggingArgs( - output_dir="./eval_results" - ) - default_tasks = LightEvalTasksArgs( - tasks="lighteval|agieval:aqua-rat|5|0" - ) - default_parallelism = ParallelismArgs(dp=1, pp=1, tp=1) - lighteval_config = LightEvalConfig( - logging=default_logging, - tasks=default_tasks, - parallelism=default_parallelism - ) + # Load lighteval config + lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore nanotron_config = FullNanotronConfig(lighteval_config, model_config) From 7995fa63a92146d5796bf9cb73c5da41bddcb447 Mon Sep 17 00:00:00 2001 From: nouamanetazi Date: Sat, 22 Mar 2025 11:37:11 +0000 Subject: [PATCH 04/13] . 
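Clean up the temporary modules added in PATCH 02/13: delete
src/lighteval/models/__init__.py, src/lighteval/models/nanotron/__init__.py
and the src/lighteval/models/nanotron_model.py re-export shim, and restore
the direct module-level import in pipeline.py:

    from lighteval.models.nanotron.nanotron_model import NanotronLightevalModel
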
--- src/lighteval/models/__init__.py | 21 ------------------ src/lighteval/models/nanotron/__init__.py | 21 ------------------ src/lighteval/models/nanotron_model.py | 26 ----------------------- src/lighteval/pipeline.py | 7 ++---- 4 files changed, 2 insertions(+), 73 deletions(-) delete mode 100644 src/lighteval/models/__init__.py delete mode 100644 src/lighteval/models/nanotron/__init__.py delete mode 100644 src/lighteval/models/nanotron_model.py diff --git a/src/lighteval/models/__init__.py b/src/lighteval/models/__init__.py deleted file mode 100644 index 064e2842d..000000000 --- a/src/lighteval/models/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. \ No newline at end of file diff --git a/src/lighteval/models/nanotron/__init__.py b/src/lighteval/models/nanotron/__init__.py deleted file mode 100644 index 064e2842d..000000000 --- a/src/lighteval/models/nanotron/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
\ No newline at end of file diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py deleted file mode 100644 index 4a1ed72c6..000000000 --- a/src/lighteval/models/nanotron_model.py +++ /dev/null @@ -1,26 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# Import and re-export the NanotronLightevalModel class from the nanotron module -from lighteval.models.nanotron.nanotron_model import NanotronLightevalModel - -__all__ = ["NanotronLightevalModel"] \ No newline at end of file diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 68ae71920..bc0f5c819 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -72,7 +72,7 @@ from nanotron.parallel.context import ParallelContext from nanotron.utils import local_ranks_zero_first - # from lighteval.models.nanotron import NanotronLightevalModel + from lighteval.models.nanotron.nanotron_model import NanotronLightevalModel import logging @@ -186,9 +186,7 @@ def _init_parallelism_manager(self): def _init_model(self, model_config, model): logger.info("--- LOADING MODEL ---") if model_config is not None: - if self.parallel_context: - from lighteval.models.nanotron_model import NanotronLightevalModel - + if self.parallel_context: return NanotronLightevalModel( checkpoint_path=os.path.dirname(self.pipeline_parameters.nanotron_checkpoint_path) if self.pipeline_parameters.nanotron_checkpoint_path @@ -198,7 +196,6 @@ def _init_model(self, model_config, model): debug_one_layer_model=False, model_class=None, ) - # return None else: return load_model(config=model_config) if isinstance(model, TransformersModel): From eca97ebba0c6c91c4df33c399536547584496fae Mon Sep 17 00:00:00 2001 From: Jason Stillerman Date: Wed, 26 Mar 2025 00:53:13 +0000 Subject: [PATCH 05/13] allow extra keywords in LightevalTaskConfig --- src/lighteval/tasks/lighteval_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index da09ec000..2681e779d 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -107,6 +107,7 @@ class LightevalTaskConfig: few_shots_select: Optional[str] = None # Generation args + output_regex: Optional[str] = None generation_size: Optional[int] = None generation_grammar: Optional[TextGenerationInputGrammarType] = None stop_sequence: Optional[ListLike[str]] = None @@ -120,6 +121,7 @@ class 
LightevalTaskConfig: must_remove_duplicate_docs: bool = False version: int = 0 + frozen: bool = False def __post_init__(self): # If we got a Metrics enums instead of a Metric, we convert From 70f7f9ee81aba20a80d77ac20b82f2a618f3b75d Mon Sep 17 00:00:00 2001 From: "T. Duy Nguyen-Hien" Date: Tue, 6 May 2025 23:59:40 +0800 Subject: [PATCH 06/13] removed EnvConfig for nanotron --- src/lighteval/main_nanotron.py | 14 +++--------- .../models/nanotron/nanotron_model.py | 22 +++++++------------ src/lighteval/pipeline.py | 2 +- 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 345be1fe9..2bf951549 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -26,9 +26,6 @@ from typer import Option from typing_extensions import Annotated - -CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") - HELP_PANEL_NAME_1 = "Common Parameters" HELP_PANEL_NAME_2 = "Logging Parameters" HELP_PANEL_NAME_3 = "Debug Parameters" @@ -42,8 +39,7 @@ def nanotron( checkpoint_config_path: Annotated[ str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.") ], - lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")], - cache_dir: Annotated[str, Option(help="Cache directory for datasets and models.")] = CACHE_DIR, + lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")] ): """ Evaluate models using nanotron as backend. @@ -54,9 +50,6 @@ def nanotron( from lighteval.logging.evaluation_tracker import EvaluationTracker from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters from lighteval.utils.imports import NO_NANOTRON_ERROR_MSG, is_nanotron_available - from lighteval.utils.utils import EnvConfig - - env_config = EnvConfig(token=os.getenv("HF_TOKEN"), cache_dir=cache_dir) if not is_nanotron_available(): raise ImportError(NO_NANOTRON_ERROR_MSG) @@ -75,7 +68,7 @@ def nanotron( # We are getting an type error, because the get_config_from_file is not correctly typed, lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore nanotron_config = FullNanotronConfig(lighteval_config, model_config) - + evaluation_tracker = EvaluationTracker( output_dir=lighteval_config.logging.output_dir, hub_results_org=lighteval_config.logging.results_org, @@ -89,12 +82,11 @@ def nanotron( pipeline_parameters = PipelineParameters( launcher_type=ParallelismManager.NANOTRON, - env_config=env_config, job_id=os.environ.get("SLURM_JOB_ID", 0), nanotron_checkpoint_path=checkpoint_config_path, dataset_loading_processes=lighteval_config.tasks.dataset_loading_processes, custom_tasks_directory=lighteval_config.tasks.custom_tasks, - override_batch_size=lighteval_config.batch_size, + # override_batch_size=lighteval_config.batch_size, num_fewshot_seeds=1, max_samples=lighteval_config.tasks.max_samples, use_chat_template=False, diff --git a/src/lighteval/models/nanotron/nanotron_model.py b/src/lighteval/models/nanotron/nanotron_model.py index 5d5bb934c..70d37122a 100644 --- a/src/lighteval/models/nanotron/nanotron_model.py +++ b/src/lighteval/models/nanotron/nanotron_model.py @@ -56,7 +56,7 @@ ) from lighteval.utils.imports import is_nanotron_available from lighteval.utils.parallelism import find_executable_batch_size -from lighteval.utils.utils import EnvConfig, as_list +from lighteval.utils.utils import as_list logger = 
logging.getLogger(__name__)
@@ -101,7 +101,6 @@ def __init__(
         trust_remote_code: bool = False,
         debug_one_layer_model: bool = False,
         model_class: Optional[Type] = None,
-        env_config: EnvConfig = None,
     ):
         """Initializes a nanotron model for evaluation.
         Args:
@@ -138,7 +137,6 @@ def __init__(
         self._add_special_tokens = add_special_tokens
         self._tokenizer = self._create_auto_tokenizer(
             pretrained=tokenizer.tokenizer_name_or_path,
-            env_config=env_config,
             trust_remote_code=trust_remote_code,
         )
         self._tokenizer.model_max_length = self.max_length
@@ -230,7 +228,6 @@ def _create_auto_tokenizer(
         *,
         pretrained: str,
         tokenizer: Optional[str] = None,
-        env_config: EnvConfig = None,
         trust_remote_code: bool = False,
     ) -> transformers.PreTrainedTokenizer:
         """Returns a pre-trained tokenizer from a pre-trained tokenizer configuration."""
@@ -238,15 +235,11 @@
         try:
             tokenizer = AutoTokenizer.from_pretrained(
                 pretrained if tokenizer is None else tokenizer,
-                cache_dir=env_config.cache_dir,
-                token=env_config.token,
                 trust_remote_code=trust_remote_code,
             )
         except RecursionError:
             tokenizer = AutoTokenizer.from_pretrained(
                 pretrained if tokenizer is None else tokenizer,
-                cache_dir=env_config.cache_dir,
-                token=env_config.token,
                 unk_token="",
                 trust_remote_code=trust_remote_code,
             )
@@ -711,14 +704,14 @@ def _loglikelihood_single_token(
                 inputs, padding_length=max_context, max_context=max_context, full_attention_masks=True
             )  # batched_inputs, batch_attention, input_lengths, truncated, padded
 
-
-            out = self.model(input_ids=batch_model.input_ids, input_mask=batch_model.input_mask)
+            position_ids = torch.arange(batch_model.input_ids.shape[1], device=self.device, dtype=torch.int32).unsqueeze(0).repeat(batch_model.input_ids.shape[0], 1)
+            out = self.model(input_ids=batch_model.input_ids, position_ids=position_ids)
 
             if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
                 # This process got outputs
 
-                # Gather all the output across TP
-                out = out.transpose(0, 1).contiguous()  # [batch, seq_length, vocab]
+                # Gather all the output across TP
+                out = out.view(*batch_model.input_ids.shape, -1).contiguous()  # [batch, seq_length, vocab]
 
                 gathered_out = [torch.zeros_like(out) for _ in range(self.parallel_context.tp_pg.size())]
                 dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False)
@@ -944,7 +937,8 @@ def _loglikelihood_tokens(
             )  # batched_inputs, batch_attention, input_lengths, truncated, padded
 
             with torch.no_grad():
-                out = self.model(input_ids=batch_model.input_ids, input_mask=batch_model.input_mask)
+                position_ids = torch.arange(batch_model.input_ids.shape[1], device=self.device, dtype=torch.int32).unsqueeze(0).repeat(batch_model.input_ids.shape[0], 1)
+                out = self.model(input_ids=batch_model.input_ids, position_ids=position_ids)
 
             if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
                 # This process got outputs
@@ -954,7 +948,7 @@
                 dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False)
 
                 out = torch.cat(gathered_out, dim=-1)
-                out = out.transpose(0, 1)  # [batch, seq_length, vocab]
+                out = out.view(*batch_model.input_ids.shape, -1)  # [batch, seq_length, vocab]
                 multi_logits = F.log_softmax(out, dim=-1)  # [batch, padding_length, vocab]
 
                 logits_sum = []
diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py
index 439286f10..f24021b99 100644
--- a/src/lighteval/pipeline.py
+++ b/src/lighteval/pipeline.py
@@ -155,7 +155,7 @@ def __init__(
         self.accelerator, self.parallel_context =
self._init_parallelism_manager() self.model = self._init_model(model_config, model) - generation_parameters = model_config.generation_parameters.model_dump() if model_config else {} + generation_parameters = model_config.generation_parameters.model_dump() if model_config and hasattr(model_config, "generation_parameters") else {} self.evaluation_tracker.general_config_logger.log_model_info(generation_parameters, self.model.model_info) self._init_random_seeds() From c4c264c9480670110356727a111f4a350a6b3371 Mon Sep 17 00:00:00 2001 From: "T. Duy Nguyen-Hien" Date: Wed, 7 May 2025 00:30:29 +0800 Subject: [PATCH 07/13] used asdict instead of model_dump --- src/lighteval/pipeline.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 222210ded..a55405f44 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -27,7 +27,7 @@ import re import shutil from contextlib import nullcontext -from dataclasses import dataclass +from dataclasses import asdict, dataclass from datetime import timedelta from enum import Enum, auto @@ -154,8 +154,7 @@ def __init__( self._metric_options = metric_options or {} self.accelerator, self.parallel_context = self._init_parallelism_manager() self.model = self._init_model(model_config, model) - - generation_parameters = model_config.generation_parameters.model_dump() if model_config and hasattr(model_config, "generation_parameters") else {} + generation_parameters = asdict(model_config.generation_parameters) if model_config and hasattr(model_config, "generation_parameters") else {} self.evaluation_tracker.general_config_logger.log_model_info(generation_parameters, self.model.model_info) self._init_random_seeds() From 4c7a1e7458776971fa38dc895d76e530cf7ab81f Mon Sep 17 00:00:00 2001 From: "T. Duy Nguyen-Hien" Date: Wed, 7 May 2025 00:31:42 +0800 Subject: [PATCH 08/13] added input_mask for nanotron models' forward --- src/lighteval/models/nanotron/nanotron_model.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/lighteval/models/nanotron/nanotron_model.py b/src/lighteval/models/nanotron/nanotron_model.py index d0b45c7da..dc285e353 100644 --- a/src/lighteval/models/nanotron/nanotron_model.py +++ b/src/lighteval/models/nanotron/nanotron_model.py @@ -336,14 +336,9 @@ def tok_decode(self, tokens: torch.LongTensor) -> List[str]: return self.tokenizer.batch_decode(tokens, skip_special_tokens=True) def _model_call(self, inputs: torch.Tensor) -> torch.Tensor: - position_ids = ( - torch.arange( - inputs.shape[1], device=inputs.device, dtype=torch.int32 - ) - .unsqueeze(0) - .repeat(inputs.shape[0], 1) - ) - return self.model(inputs, position_ids) + # This is only called for detecting the batch size so we just need a mock input_mask + input_mask = torch.ones_like(inputs) + return self.model(inputs, input_mask) def homogeneize_ending_conditions(self, ending_condition: tuple | dict | list | str) -> tuple[list, int]: """Ending conditions are submitted in several possible formats. 
@@ -711,8 +706,7 @@ def _loglikelihood_single_token( inputs, padding_length=max_context, max_context=max_context, full_attention_masks=True ) # batched_inputs, batch_attention, input_lengths, truncated, padded - position_ids = torch.arange(batch_model.input_ids.shape[1], device=self.device, dtype=torch.int32).unsqueeze(0).repeat(batch_model.input_ids.shape[0], 1) - out = self.model(input_ids=batch_model.input_ids, position_ids=position_ids) + out = self.model(input_ids=batch_model.input_ids, input_mask=batch_model.input_mask) if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank: # This process got outputs @@ -944,8 +938,7 @@ def _loglikelihood_tokens( ) # batched_inputs, batch_attention, input_lengths, truncated, padded with torch.no_grad(): - position_ids = torch.arange(batch_model.input_ids.shape[1], device=self.device, dtype=torch.int32).unsqueeze(0).repeat(batch_model.input_ids.shape[0], 1) - out = self.model(input_ids=batch_model.input_ids, position_ids=position_ids) + out = self.model(input_ids=batch_model.input_ids, input_mask=batch_model.input_mask) if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank: # This process got outputs From cb63773d1df756301650359c5b83003c44cbe3cd Mon Sep 17 00:00:00 2001 From: "T. Duy Nguyen-Hien" Date: Wed, 7 May 2025 06:45:33 +0800 Subject: [PATCH 09/13] removed override_bs, use batch_size from lighteval_config --- .../models/nanotron/nanotron_model.py | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/src/lighteval/models/nanotron/nanotron_model.py b/src/lighteval/models/nanotron/nanotron_model.py index dc285e353..c03c24793 100644 --- a/src/lighteval/models/nanotron/nanotron_model.py +++ b/src/lighteval/models/nanotron/nanotron_model.py @@ -114,6 +114,10 @@ def __init__( self._max_length = max_length self.parallel_config = parallel_config self.parallel_context = parallel_context + if hasattr(lighteval_config, "batch_size"): + self.batch_size = lighteval_config.batch_size + else: + self.batch_size = None if parallel_config.pp > 1: # To implement PP parallelism we need to think about how we want to sync the output for the PP ranks without outputs @@ -298,9 +302,9 @@ def max_length(self) -> int: def device(self) -> Union[int, str, torch.device]: return "cuda" - def _get_batch_size(self, max_input_length: int, override_bs: int = 0, starting_batch_size: int = 512) -> int: - if override_bs: - return override_bs + def _get_batch_size(self, max_input_length: int, starting_batch_size: int = 512) -> int: + if self.batch_size is not None: + return self.batch_size logger.warning("Detecting largest batch size") @find_executable_batch_size( @@ -395,7 +399,7 @@ def _check_continuations_start_space(self, continuation: str) -> str: return continuation def loglikelihood_single_token( - self, requests: List[Tuple[str, dict]], override_bs=0 + self, requests: List[Tuple[str, dict]], ) -> List[LoglikelihoodSingleTokenResponse]: """Tokenize the context and continuation and compute the log likelihood of those tokenized sequences. 
@@ -428,11 +432,10 @@ def loglikelihood_single_token( return self._loglikelihood_single_token( requests, - override_bs=override_bs, disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0), ) - def loglikelihood(self, requests: List[LoglikelihoodRequest], override_bs=None) -> List[LoglikelihoodResponse]: + def loglikelihood(self, requests: List[LoglikelihoodRequest]) -> List[LoglikelihoodResponse]: """Tokenize the context and continuation and compute the log likelihood of those tokenized sequences. """ @@ -450,12 +453,11 @@ def loglikelihood(self, requests: List[LoglikelihoodRequest], override_bs=None) return self._loglikelihood_tokens( requests, - override_bs=override_bs, disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0), ) def loglikelihood_rolling( - self, requests: List[LoglikelihoodRollingRequest], override_bs: int = 0 + self, requests: List[LoglikelihoodRollingRequest], ) -> List[LoglikelihoodResponse]: """This function is used to compute the log likelihood of the context for perplexity metrics.""" for request in tqdm( @@ -466,7 +468,6 @@ def loglikelihood_rolling( results = self._loglikelihood_tokens( requests, - override_bs=override_bs, disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0), return_bool_score=False, ) @@ -632,7 +633,7 @@ def _get_subsets(self, dataset, num_dataset_splits): @torch.inference_mode() def _loglikelihood_single_token( - self, requests, disable_tqdm: bool = False, override_bs: int = 0, num_dataset_splits: int = 1 + self, requests, disable_tqdm: bool = False, num_dataset_splits: int = 1 ) -> List[LoglikelihoodSingleTokenResponse]: dataset = LoglikelihoodSingleTokenDataset(requests=requests) res = [] @@ -660,7 +661,7 @@ def _loglikelihood_single_token( context_enc = dataset[0].tokenized_context max_context = len(context_enc[-self.max_length :]) batch_size = self._get_batch_size( - override_bs=override_bs, max_input_length=max_context, starting_batch_size=starting_batch_size + max_input_length=max_context, starting_batch_size=starting_batch_size ) starting_batch_size = batch_size * 2 # for the next round @@ -860,7 +861,6 @@ def _loglikelihood_tokens( self, requests, disable_tqdm: bool = False, - override_bs: int = -1, num_dataset_splits: int = 1, return_bool_score: bool = True, ) -> List[LoglikelihoodResponse]: @@ -892,7 +892,7 @@ def _loglikelihood_tokens( max_context = len((context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]) batch_size = self._get_batch_size( - override_bs=override_bs, max_input_length=max_context, starting_batch_size=starting_batch_size + max_input_length=max_context, starting_batch_size=starting_batch_size ) starting_batch_size = batch_size * 2 # for the next round @@ -1094,7 +1094,6 @@ def greedy_until( self, requests: List[GreedyUntilRequest], disable_tqdm: bool = False, - override_bs: int = -1, num_dataset_splits: int = 1, ) -> List[GenerativeResponse]: """Greedy generation until a stop token is generated.""" @@ -1134,7 +1133,6 @@ def greedy_until( max_input_length = min(len(context_enc) + max_gen, self.max_length) batch_size = self._get_batch_size( - override_bs=override_bs, max_input_length=max_input_length, starting_batch_size=starting_batch_size, ) From b7feb729f2b04e14dc97faa573ab4481afd9599c Mon Sep 17 00:00:00 2001 From: "T. 
Duy Nguyen-Hien" Date: Tue, 20 May 2025 11:03:03 +0800 Subject: [PATCH 10/13] fixed dataclass & pydantic dual compat in pipeline.py --- src/lighteval/config/lighteval_config.py | 2 +- src/lighteval/main_nanotron.py | 11 +++++++---- src/lighteval/models/nanotron/nanotron_model.py | 14 ++++++-------- src/lighteval/pipeline.py | 14 +++++++++++--- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/lighteval/config/lighteval_config.py b/src/lighteval/config/lighteval_config.py index 0e8217afe..100ab5431 100644 --- a/src/lighteval/config/lighteval_config.py +++ b/src/lighteval/config/lighteval_config.py @@ -101,7 +101,7 @@ class LightEvalConfig: class FullNanotronConfig: lighteval_config: LightEvalConfig nanotron_config: "Config" - + @property def generation_parameters(self): # Return the generation parameters from the lighteval config diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index d10483131..d263090db 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -26,6 +26,7 @@ from typer import Option from typing_extensions import Annotated + HELP_PANEL_NAME_1 = "Common Parameters" HELP_PANEL_NAME_2 = "Logging Parameters" HELP_PANEL_NAME_3 = "Debug Parameters" @@ -39,15 +40,17 @@ def nanotron( checkpoint_config_path: Annotated[ str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.") ], - lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")] + lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")], ): """ Evaluate models using nanotron as backend. """ from nanotron.config import Config, get_config_from_file - from nanotron.config.parallelism_config import ParallelismArgs - from lighteval.config.lighteval_config import FullNanotronConfig, LightEvalConfig, LightEvalLoggingArgs, LightEvalTasksArgs + from lighteval.config.lighteval_config import ( + FullNanotronConfig, + LightEvalConfig, + ) from lighteval.logging.evaluation_tracker import EvaluationTracker from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters from lighteval.utils.imports import NO_NANOTRON_ERROR_MSG, is_nanotron_available @@ -76,7 +79,7 @@ def nanotron( # We are getting an type error, because the get_config_from_file is not correctly typed, lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore nanotron_config = FullNanotronConfig(lighteval_config, model_config) - + evaluation_tracker = EvaluationTracker( output_dir=lighteval_config.logging.output_dir, hub_results_org=lighteval_config.logging.results_org, diff --git a/src/lighteval/models/nanotron/nanotron_model.py b/src/lighteval/models/nanotron/nanotron_model.py index c03c24793..137caa8a9 100644 --- a/src/lighteval/models/nanotron/nanotron_model.py +++ b/src/lighteval/models/nanotron/nanotron_model.py @@ -399,7 +399,8 @@ def _check_continuations_start_space(self, continuation: str) -> str: return continuation def loglikelihood_single_token( - self, requests: List[Tuple[str, dict]], + self, + requests: List[Tuple[str, dict]], ) -> List[LoglikelihoodSingleTokenResponse]: """Tokenize the context and continuation and compute the log likelihood of those tokenized sequences. 
@@ -457,7 +458,8 @@ def loglikelihood(self, requests: List[LoglikelihoodRequest]) -> List[Loglikelih ) def loglikelihood_rolling( - self, requests: List[LoglikelihoodRollingRequest], + self, + requests: List[LoglikelihoodRollingRequest], ) -> List[LoglikelihoodResponse]: """This function is used to compute the log likelihood of the context for perplexity metrics.""" for request in tqdm( @@ -660,9 +662,7 @@ def _loglikelihood_single_token( # pull longest context sample from request context_enc = dataset[0].tokenized_context max_context = len(context_enc[-self.max_length :]) - batch_size = self._get_batch_size( - max_input_length=max_context, starting_batch_size=starting_batch_size - ) + batch_size = self._get_batch_size(max_input_length=max_context, starting_batch_size=starting_batch_size) starting_batch_size = batch_size * 2 # for the next round @@ -891,9 +891,7 @@ def _loglikelihood_tokens( max_context = len((context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]) - batch_size = self._get_batch_size( - max_input_length=max_context, starting_batch_size=starting_batch_size - ) + batch_size = self._get_batch_size(max_input_length=max_context, starting_batch_size=starting_batch_size) starting_batch_size = batch_size * 2 # for the next round # For the DP replicas diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index a55405f44..503e2dd57 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -27,7 +27,7 @@ import re import shutil from contextlib import nullcontext -from dataclasses import asdict, dataclass +from dataclasses import asdict, dataclass, is_dataclass from datetime import timedelta from enum import Enum, auto @@ -154,7 +154,15 @@ def __init__( self._metric_options = metric_options or {} self.accelerator, self.parallel_context = self._init_parallelism_manager() self.model = self._init_model(model_config, model) - generation_parameters = asdict(model_config.generation_parameters) if model_config and hasattr(model_config, "generation_parameters") else {} + + if model_config and hasattr(model_config, "generation_parameters"): + generation_parameters = ( + asdict(model_config.generation_parameters) + if is_dataclass(model_config.generation_parameters) + else model_config.generation_parameters.model_dump() + ) + else: + generation_parameters = {} self.evaluation_tracker.general_config_logger.log_model_info(generation_parameters, self.model.model_info) self._init_random_seeds() @@ -185,7 +193,7 @@ def _init_parallelism_manager(self): def _init_model(self, model_config, model): logger.info("--- LOADING MODEL ---") if model_config is not None: - if self.parallel_context: + if self.parallel_context: return NanotronLightevalModel( checkpoint_path=os.path.dirname(self.pipeline_parameters.nanotron_checkpoint_path) if self.pipeline_parameters.nanotron_checkpoint_path From 309d2b2e78fe5015aea9bbb503f64411faa919f2 Mon Sep 17 00:00:00 2001 From: "T. 
Duy Nguyen-Hien" Date: Wed, 21 May 2025 00:03:03 +0800 Subject: [PATCH 11/13] let nanotron return results --- src/lighteval/main_nanotron.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index d263090db..7eed1284b 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -68,13 +68,6 @@ def nanotron( skip_unused_config_keys=True, skip_null_keys=True, ) - model_config = get_config_from_file( - checkpoint_config_path, - config_class=Config, - model_config_class=None, - skip_unused_config_keys=True, - skip_null_keys=True, - ) # We are getting an type error, because the get_config_from_file is not correctly typed, lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore @@ -115,4 +108,8 @@ def nanotron( pipeline.show_results() + results = pipeline.get_results() + pipeline.save_and_push_results() + + return results From 4ca30193b87848ffc66800d5c128edf4c5366600 Mon Sep 17 00:00:00 2001 From: "T. Duy Nguyen-Hien" Date: Wed, 21 May 2025 00:04:30 +0800 Subject: [PATCH 12/13] added nanotron deps, addd [nanotron] option to dev --- pyproject.toml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4b2c4c768..7e3a18e9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,9 @@ build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] where = ["src"] +[tool.uv] +no-build-isolation-package = ['flash-attn'] + [project] name = "lighteval" version = "0.9.1.dev0" @@ -88,14 +91,18 @@ optimum = ["optimum==1.12.0"] quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"] adapters = ["peft==0.3.0"] nanotron = [ - "nanotron", - "tensorboardX" + "nanotron@git+https://github.com/huggingface/nanotron@v0.5", + "tensorboardX", + "ninja", + "triton", + "flash-attn>=2.5.0,<2.7.0", + "datatrove[io]" ] tensorboardX = ["tensorboardX"] vllm = ["vllm>=0.7.0", "ray", "more_itertools"] quality = ["ruff==v0.2.2","pre-commit"] tests = ["pytest==7.4.0","deepdiff"] -dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"] +dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm,nanotron]"] docs = ["hf-doc-builder", "watchdog"] extended_tasks = [ "langdetect", # ifeval From c2dd504d846fed71d733c1b1c5b0bff2a8089bfe Mon Sep 17 00:00:00 2001 From: "T. 
Duy Nguyen-Hien" Date: Wed, 21 May 2025 00:09:23 +0800 Subject: [PATCH 13/13] fixed styling --- ...hteval_config_override_nanotron_tests.yaml | 24 +++ src/lighteval/config/lighteval_config.py | 6 + .../models/nanotron/nanotron_model.py | 1 + ...molLM2-1.7B-Instruct-results-nanotron.json | 3 + tests/slow_tests/test_nanotron_model.py | 159 ++++++++++++++++++ 5 files changed, 193 insertions(+) create mode 100644 examples/lighteval_config_override_nanotron_tests.yaml create mode 100644 tests/reference_scores/SmolLM2-1.7B-Instruct-results-nanotron.json create mode 100644 tests/slow_tests/test_nanotron_model.py diff --git a/examples/lighteval_config_override_nanotron_tests.yaml b/examples/lighteval_config_override_nanotron_tests.yaml new file mode 100644 index 000000000..8fdd8227a --- /dev/null +++ b/examples/lighteval_config_override_nanotron_tests.yaml @@ -0,0 +1,24 @@ +# As of right now auto batch size doesn't work, so we use some default +batch_size: 8 +generation: null +logging: + output_dir: "tests/nanotron_logs" + save_details: false + push_to_hub: false + public_run: false + results_org: null + tensorboard_metric_prefix: "eval" +parallelism: + dp: 1 + pp: 1 + pp_engine: 1f1b + tp: 1 + tp_linear_async_communication: false + tp_mode: ALL_REDUCE +tasks: + dataset_loading_processes: 8 + max_samples: 10 + multichoice_continuations_start_space: null + num_fewshot_seeds: null + tasks: leaderboard|arc:challenge|25|0,leaderboard|truthfulqa:mc|0|0,leaderboard|hellaswag|10|0,leaderboard|mmlu:college_chemistry|5|0,leaderboard|mmlu:us_foreign_policy|5|0,lighteval|agieval:aqua-rat|0|0,lighteval|agieval:logiqa-en|0|0,lighteval|agieval:lsat-ar|0|0,lighteval|agieval:lsat-lr|0|0,lighteval|agieval:lsat-rc|0|0,lighteval|agieval:sat-en-without-passage|0|0,lighteval|agieval:sat-en|0|0,lighteval|bigbench:causal_judgment|3|0,lighteval|bigbench:date_understanding|3|0,lighteval|bigbench:disambiguation_qa|3|0,lighteval|bigbench:geometric_shapes|3|0,lighteval|bigbench:logical_deduction_five_objects|3|0,lighteval|bigbench:logical_deduction_seven_objects|3|0,lighteval|bigbench:movie_recommendation|3|0,lighteval|bigbench:navigate|3|0,lighteval|bigbench:ruin_names|3|0,lighteval|bigbench:salient_translation_error_detection|3|0,lighteval|bigbench:snarks|3|0,lighteval|bigbench:temporal_sequences|3|0,lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0,lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0,test|gsm8k|0|1 + custom_tasks: examples/custom_tasks_tests.py diff --git a/src/lighteval/config/lighteval_config.py b/src/lighteval/config/lighteval_config.py index 100ab5431..9e3fafd68 100644 --- a/src/lighteval/config/lighteval_config.py +++ b/src/lighteval/config/lighteval_config.py @@ -109,3 +109,9 @@ def generation_parameters(self): if self.lighteval_config.generation: return self.lighteval_config.generation return GenerationArgs() + + def __getattr__(self, name): + # Delegate attribute access to nanotron_config if not found in FullNanotronConfig + if hasattr(self.nanotron_config, name): + return getattr(self.nanotron_config, name) + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") diff --git a/src/lighteval/models/nanotron/nanotron_model.py b/src/lighteval/models/nanotron/nanotron_model.py index 137caa8a9..dda515311 100644 --- a/src/lighteval/models/nanotron/nanotron_model.py +++ b/src/lighteval/models/nanotron/nanotron_model.py @@ -1236,6 +1236,7 @@ def greedy_until( max_micro_batch_size=batch_size, # ok for PP=1 for PP>1 we'll need to split the batch 
                returns_logits=returns_logits,
                generation_config=self.generation_config,
+                # tokenizer=self.tokenizer  # NOTE[duynht]: This is needed for the current nanotron@main, but that is not compatible with HuggingFaceTB/SmolLM2-nanotron-ckpt
             )
             dist.barrier()  # Got everyone to send their stuff
             outputs = list(outputs)
diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-nanotron.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-nanotron.json
new file mode 100644
index 000000000..3546d526c
--- /dev/null
+++ b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-nanotron.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdaf63946ad703af4cefaba86769b91b61847abbbc19fe48abfb68fcfb6e023e
+size 50151
diff --git a/tests/slow_tests/test_nanotron_model.py b/tests/slow_tests/test_nanotron_model.py
new file mode 100644
index 000000000..fe4e9d39a
--- /dev/null
+++ b/tests/slow_tests/test_nanotron_model.py
@@ -0,0 +1,159 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import json +import os +from functools import lru_cache, partial +from typing import Callable, Tuple + +import nanotron.constants as nanotron_constants # Add this import +import pytest +import yaml +from deepdiff import DeepDiff +from huggingface_hub import snapshot_download +from packaging.version import Version + +from lighteval.main_nanotron import nanotron # noqa: E402 + + +# Set env var for deterministic run of models +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + + +# Download the model checkpoint +@pytest.fixture(scope="session", autouse=True) +def download_model(): + snapshot_download( + repo_id="HuggingFaceTB/SmolLM2-nanotron-ckpt", + allow_patterns=["1700M/final/*"], + local_dir="./SmolLM2-nanotron-ckpt/", + ) + + +MODELS_ARGS = [ + # {"model_name": "gpt2", "use_chat_template": False, "revision": "main", "results_file": "tests/reference_scores/gpt2-results.json"}, + { + "model_name": "SmolLM2-nanotron-ckpt/1700M/final/config.yaml", + "lighteval_config_path": "examples/lighteval_config_override_nanotron_tests.yaml", + "results_file": "tests/reference_scores/SmolLM2-1.7B-Instruct-results-nanotron.json", + } +] +TASKS_PATH = "examples/test_tasks.txt" +CUSTOM_TASKS_PATH = "examples/custom_tasks_tests.py" + +ModelInput = Tuple[str, Callable[[], dict]] + + +# Set data_stages to null in config.yaml before running tests +def set_data_stages_to_null(config_path): + with open(config_path, "r") as f: + config = yaml.safe_load(f) + keys_to_keep = ["model", "tokenizer", "general", "parallelism"] + keys_to_delete = [key for key in config.keys() if key not in keys_to_keep] + for key in keys_to_delete: + del config[key] + if "parallelism" in config and config["parallelism"] is not None: + if "tp_recompute_allgather" in config["parallelism"]: + del config["parallelism"]["tp_recompute_allgather"] + if "recompute_layer" in config["parallelism"]: + del config["parallelism"]["recompute_layer"] + if "model" in config and config["model"] is not None: + if "model_config" in config["model"]: + if "rope_theta" in config["model"]["model_config"]: + del config["model"]["model_config"]["rope_theta"] + if "rope_interleaved" in config["model"]["model_config"]: + del config["model"]["model_config"]["rope_interleaved"] + # config["data_stages"] = None + # if "checkpoints" in config and config["checkpoints"] is not None: + # if "save_final_state" in config["checkpoints"]: + # del config["checkpoints"]["save_final_state"] + # if "optimizer" in config and config["optimizer"] is not None: + # if "optimizer_factory" in config["optimizer"]: + # del config["optimizer"]["optimizer_factory"] + with open(config_path, "w") as f: + yaml.safe_dump(config, f) + + +@lru_cache(maxsize=len(MODELS_ARGS)) +def run_model(checkpoint_config_path: str, lighteval_config_path: str): + """Runs the full main as a black box, using the input model and tasks, on 10 samples without parallelism""" + # Emulate torchrun launch + if "MASTER_ADDR" not in os.environ: + os.environ["MASTER_ADDR"] = "localhost" + if "MASTER_PORT" not in os.environ: + os.environ["MASTER_PORT"] = "60000" # Or any other free port + if "WORLD_SIZE" not in os.environ: + os.environ["WORLD_SIZE"] = "1" + if "RANK" not in os.environ: + os.environ["RANK"] = "0" + if "LOCAL_RANK" not in os.environ: + os.environ["LOCAL_RANK"] = "0" + + results = nanotron( + checkpoint_config_path=checkpoint_config_path, + lighteval_config_path=lighteval_config_path, + ) + return results + + +def generate_tests() -> list[ModelInput]: + """Generate test parameters for all models and 
tasks.""" + + tests = [] + for model_args in MODELS_ARGS: + predictions_lite = partial(run_model, model_args["model_name"], model_args["lighteval_config_path"]) + tests.append((model_args, predictions_lite)) + + return tests + + +# generates the model predictions parameters at test collection time +tests: list[ModelInput] = generate_tests() +ids = [f"{model_input[0]['model_name']}" for model_input in tests] + + +@pytest.mark.slow +@pytest.mark.parametrize("tests", tests, ids=ids) +def test_nanotron_model(tests: list[ModelInput], monkeypatch): # Add monkeypatch fixture + """Evaluates a model on a full task - is parametrized using pytest_generate_test""" + model_args, get_predictions = tests + + # Set data_stages to null in config.yaml before running tests + set_data_stages_to_null(model_args["model_name"]) + + # Monkeypatch CHECKPOINT_VERSION to bypass version check + monkeypatch.setattr(nanotron_constants, "CHECKPOINT_VERSION", Version("1.4")) + + predictions = get_predictions()["results"] + + # Load the reference results + with open(model_args["results_file"], "r") as f: + reference_results = json.load(f)["results"] + + # Change the key names, replace '|' with ':' + reference_results = {k.replace("|", ":"): v for k, v in reference_results.items()} + + # Convert defaultdict values to regular dict for comparison + predictions_dict = {k: dict(v) if hasattr(v, "default_factory") else v for k, v in predictions.items()} + + diff = DeepDiff(reference_results, predictions_dict, ignore_numeric_type_changes=True, math_epsilon=0.05) + + assert diff == {}, f"Differences found: {diff}"