Skip to content

Commit 6b61a37

Browse files
fix deepspeed regional compilation (#3609)
1 parent 682691d commit 6b61a37

File tree

4 files changed

+77
-22
lines changed

4 files changed

+77
-22
lines changed

src/accelerate/accelerator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@
125125
PROFILE_PATTERN_NAME,
126126
)
127127
from .utils.modeling import get_state_dict_offloaded_model
128-
from .utils.other import compile_regions, is_compiled_module
128+
from .utils.other import compile_regions, compile_regions_deepspeed, is_compiled_module
129129

130130

131131
if is_deepspeed_available():
@@ -2030,7 +2030,7 @@ def _prepare_deepspeed(self, *args):
20302030
if compare_versions("deepspeed", ">=", "0.14.4") and self.state.dynamo_plugin.backend != DynamoBackend.NO:
20312031
compile_kwargs = self.state.dynamo_plugin.to_kwargs()
20322032
if self.state.dynamo_plugin.use_regional_compilation:
2033-
engine.module = compile_regions(engine.module, **compile_kwargs)
2033+
compile_regions_deepspeed(engine.module, **compile_kwargs)
20342034
else:
20352035
engine.compile(backend=compile_kwargs.pop("backend"), compile_kwargs=compile_kwargs)
20362036
if optimizer is not None:

src/accelerate/utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@
270270
check_os_kernel,
271271
clean_state_dict_for_safetensors,
272272
compile_regions,
273+
compile_regions_deepspeed,
273274
convert_bytes,
274275
extract_model_from_parallel,
275276
get_module_children_bottom_up,

src/accelerate/utils/other.py

Lines changed: 66 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def is_compiled_module(module: torch.nn.Module) -> bool:
6262

6363
def has_compiled_regions(module: torch.nn.Module) -> bool:
6464
"""
65-
Check whether the module has submodules that were compiled with torch.compile()
65+
Check whether the module has submodules that were compiled with `torch.compile()`.
6666
"""
6767
if not hasattr(torch, "_dynamo"):
6868
return False
@@ -75,6 +75,29 @@ def has_compiled_regions(module: torch.nn.Module) -> bool:
7575
return False
7676

7777

78+
def is_repeated_blocks(module: torch.nn.Module) -> bool:
79+
"""
80+
Check whether the module is a repeated block, i.e. `torch.nn.ModuleList` with all children of the same class. This
81+
is useful to determine whether we should apply regional compilation to the module.
82+
"""
83+
84+
return isinstance(module, torch.nn.ModuleList) and all(isinstance(m, module[0].__class__) for m in module)
85+
86+
87+
def has_repeated_blocks(module: torch.nn.Module) -> bool:
88+
"""
89+
Check whether the module has repeated blocks, i.e. `torch.nn.ModuleList` with all children of the same class, at
90+
any level of the module hierarchy. This is useful to determine whether we should apply regional compilation to the
91+
module.
92+
"""
93+
if module._modules:
94+
for submodule in module.modules():
95+
if is_repeated_blocks(submodule):
96+
return True
97+
98+
return False
99+
100+
78101
def compile_regions(module: torch.nn.Module, **compile_kwargs) -> torch.nn.Module:
79102
"""
80103
Performs regional compilation where we target repeated blocks of the same class and compile them sequentially to
@@ -123,33 +146,54 @@ def compile_regions(module: torch.nn.Module, **compile_kwargs) -> torch.nn.Modul
123146
"""
124147

125148
def _compile_regions(module: torch.nn.Module, **compile_kwargs) -> torch.nn.Module:
126-
if isinstance(module, torch.nn.ModuleList):
127-
if all(isinstance(submodule, module[0].__class__) for submodule in module):
128-
new_module = torch.nn.ModuleList()
129-
for submodule in module:
130-
new_module.append(torch.compile(submodule, **compile_kwargs))
131-
else:
132-
new_module = torch.compile(module, **compile_kwargs)
133-
elif module._modules: # Non-leaf node
149+
if is_repeated_blocks(module):
150+
new_module = torch.nn.ModuleList()
151+
for submodule in module:
152+
new_module.append(torch.compile(submodule, **compile_kwargs))
153+
elif has_repeated_blocks(module):
134154
new_module = module.__class__.__new__(module.__class__)
135155
new_module.__dict__.update(module.__dict__)
136156
new_module._modules = {}
137157
for name, submodule in module.named_children():
138158
new_module.add_module(name, _compile_regions(submodule, **compile_kwargs))
139-
else: # Leaf node
159+
else:
140160
new_module = torch.compile(module, **compile_kwargs)
141161

142162
return new_module
143163

144164
new_module = _compile_regions(module, **compile_kwargs)
145165

146-
if not hasattr(new_module, "_orig_mod"):
166+
if "_orig_mod" not in new_module.__dict__:
147167
# Keeps a reference to the original module to decompile/unwrap it later
148168
new_module.__dict__["_orig_mod"] = module
149169

150170
return new_module
151171

152172

173+
def compile_regions_deepspeed(module: torch.nn.Module, **compile_kwargs):
174+
"""
175+
Performs regional compilation the same way as `compile_regions`, but specifically for `DeepSpeedEngine.module`.
176+
Since the model is wrapped in a `DeepSpeedEngine` and has many added hooks, offloaded parameters, etc. that
177+
`torch.compile(...)` interferes with, this version of regional compilation uses the in-place `module.compile()` method
178+
instead.
179+
180+
Args:
181+
module (`torch.nn.Module`):
182+
The model to compile.
183+
**compile_kwargs:
184+
Additional keyword arguments to pass to `module.compile()`.
185+
"""
186+
187+
if is_repeated_blocks(module):
188+
for submodule in module:
189+
submodule.compile(**compile_kwargs)
190+
elif has_repeated_blocks(module):
191+
for child in module.children():
192+
compile_regions_deepspeed(child, **compile_kwargs)
193+
else: # leaf node
194+
module.compile(**compile_kwargs)
195+
196+
153197
def extract_model_from_parallel(
154198
model, keep_fp32_wrapper: bool = True, keep_torch_compile: bool = True, recursive: bool = False
155199
):
@@ -175,9 +219,12 @@ def extract_model_from_parallel(
175219
is_compiled = is_compiled_module(model)
176220
has_compiled = has_compiled_regions(model)
177221

178-
if is_compiled or has_compiled:
222+
if is_compiled:
179223
compiled_model = model
180224
model = model._orig_mod
225+
elif has_compiled:
226+
compiled_model = model
227+
model = model.__dict__["_orig_mod"]
181228

182229
if is_deepspeed_available():
183230
from deepspeed import DeepSpeedEngine
@@ -221,9 +268,13 @@ def _recursive_unwrap(module):
221268
if getattr(model, "_converted_to_transformer_engine", False):
222269
convert_model(model, to_transformer_engine=False)
223270

224-
if keep_torch_compile and (is_compiled or has_compiled):
225-
compiled_model._orig_mod = model
226-
model = compiled_model
271+
if keep_torch_compile:
272+
if is_compiled:
273+
compiled_model._orig_mod = model
274+
model = compiled_model
275+
elif has_compiled:
276+
compiled_model.__dict__["_orig_mod"] = model
277+
model = compiled_model
227278

228279
return model
229280

tests/test_compile.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import torch
1717
from torch.utils.benchmark import Timer
1818

19-
from accelerate.test_utils import require_huggingface_suite, require_non_cpu, require_non_hpu, torch_device
19+
from accelerate.test_utils import require_huggingface_suite, require_non_cpu, require_non_hpu, slow, torch_device
2020
from accelerate.utils import compile_regions, extract_model_from_parallel, release_memory
2121

2222

@@ -58,6 +58,8 @@ def test_regions_are_compiled(self):
5858
# Check that the compiled_model.transformer.h[i] and compiled_model.lm_head are compiled separately
5959
assert isinstance(compiled_model.transformer.h[0], torch._dynamo.eval_frame.OptimizedModule)
6060
assert isinstance(compiled_model.lm_head, torch._dynamo.eval_frame.OptimizedModule)
61+
assert compiled_model.transformer.h[0]._orig_mod is model.transformer.h[0]
62+
assert compiled_model.lm_head._orig_mod is model.lm_head
6163

6264
def test_extract_model_keep_torch_compile(self):
6365
model, _ = self._get_model_and_inputs()
@@ -84,14 +86,14 @@ def test_extract_model_remove_torch_compile(self):
8486
def test_regional_compilation_cold_start(self):
8587
model, input_ids = self._get_model_and_inputs()
8688

87-
regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
89+
regional_compilation_model = compile_regions(model, backend=backend)
8890
regional_compilation_cold_start = (
8991
Timer(stmt=COMPILE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
9092
.timeit(COMPILE_ITERS)
9193
.median
9294
)
9395

94-
full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
96+
full_compilation_model = torch.compile(model, backend=backend)
9597
full_compilation_cold_start = (
9698
Timer(stmt=COMPILE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
9799
.timeit(COMPILE_ITERS)
@@ -106,6 +108,7 @@ def test_regional_compilation_cold_start(self):
106108

107109
release_memory(model, full_compilation_model, regional_compilation_model)
108110

111+
@slow
109112
@require_non_cpu
110113
@require_huggingface_suite
111114
def test_regional_compilation_inference_speedup(self):
@@ -115,14 +118,14 @@ def test_regional_compilation_inference_speedup(self):
115118
Timer(stmt=INFRENCE_STMT, globals={"model": model, "input_ids": input_ids}).timeit(INFERENCE_ITERS).median
116119
)
117120

118-
regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
121+
regional_compilation_model = compile_regions(model, backend=backend)
119122
regional_compilation_inference_latency = (
120123
Timer(stmt=INFRENCE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
121124
.timeit(INFERENCE_ITERS)
122125
.median
123126
)
124127

125-
full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
128+
full_compilation_model = torch.compile(model, backend=backend)
126129
full_compilation_inference_latency = (
127130
Timer(stmt=INFRENCE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
128131
.timeit(INFERENCE_ITERS)

0 commit comments

Comments
 (0)