
Commit 3d8a611

[Feature] Add support for the musa device (#1453)
1 parent 109cd44 commit 3d8a611

22 files changed: 253 additions and 43 deletions


mmengine/device/__init__.py

Lines changed: 6 additions & 4 deletions
@@ -1,10 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from .utils import (get_device, get_max_cuda_memory, is_cuda_available,
-                    is_dipu_available, is_mlu_available, is_mps_available,
-                    is_npu_available, is_npu_support_full_precision)
+from .utils import (get_device, get_max_cuda_memory, get_max_musa_memory,
+                    is_cuda_available, is_dipu_available, is_mlu_available,
+                    is_mps_available, is_musa_available, is_npu_available,
+                    is_npu_support_full_precision)
 
 __all__ = [
     'get_max_cuda_memory', 'get_device', 'is_cuda_available',
     'is_mlu_available', 'is_mps_available', 'is_npu_available',
-    'is_dipu_available', 'is_npu_support_full_precision'
+    'is_dipu_available', 'get_max_musa_memory', 'is_musa_available',
+    'is_npu_support_full_precision'
 ]

mmengine/device/utils.py

Lines changed: 37 additions & 1 deletion
@@ -22,6 +22,12 @@
 except Exception:
     IS_DIPU_AVAILABLE = False
 
+try:
+    import torch_musa  # noqa: F401
+    IS_MUSA_AVAILABLE = True
+except Exception:
+    IS_MUSA_AVAILABLE = False
+
 
 def get_max_cuda_memory(device: Optional[torch.device] = None) -> int:
     """Returns the maximum GPU memory occupied by tensors in megabytes (MB) for
@@ -73,6 +79,34 @@ def is_dipu_available() -> bool:
     return IS_DIPU_AVAILABLE
 
 
+def get_max_musa_memory(device: Optional[torch.device] = None) -> int:
+    """Returns the maximum GPU memory occupied by tensors in megabytes (MB) for
+    a given device. By default, this returns the peak allocated memory since
+    the beginning of this program.
+
+    Args:
+        device (torch.device, optional): selected device. Returns
+            statistic for the current device, given by
+            :func:`~torch.musa.current_device`, if ``device`` is None.
+            Defaults to None.
+
+    Returns:
+        int: The maximum GPU memory occupied by tensors in megabytes
+        for a given device.
+    """
+    mem = torch.musa.max_memory_allocated(device=device)
+    mem_mb = torch.tensor([int(mem) // (1024 * 1024)],
+                          dtype=torch.int,
+                          device=device)
+    # TODO:[email protected]: This function is not supported by musa yet.
+    # torch.musa.reset_peak_memory_stats()
+    return int(mem_mb.item())
+
+
+def is_musa_available() -> bool:
+    return IS_MUSA_AVAILABLE
+
+
 def is_npu_support_full_precision() -> bool:
     """Returns True if npu devices support full precision training."""
     version_of_support_full_precision = 220
@@ -91,12 +125,14 @@ def is_npu_support_full_precision() -> bool:
     DEVICE = 'mps'
 elif is_dipu_available():
     DEVICE = 'dipu'
+elif is_musa_available():
+    DEVICE = 'musa'
 
 
 def get_device() -> str:
     """Returns the currently existing device type.
 
     Returns:
-        str: cuda | npu | mlu | mps | cpu.
+        str: cuda | npu | mlu | mps | musa | cpu.
     """
     return DEVICE
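
Taken together, the utils.py changes let user code query MUSA the same way it queries CUDA. A minimal usage sketch, assuming torch_musa is installed and a MUSA device is visible; only get_device, is_musa_available and get_max_musa_memory come from the diff above, the tensor work is illustrative:

import torch

from mmengine.device import (get_device, get_max_musa_memory,
                             is_musa_available)

if is_musa_available():
    # get_device() now reports 'musa' once torch_musa imports successfully.
    assert get_device() == 'musa'
    x = torch.randn(1024, 1024, device='musa')
    y = x @ x
    # Peak allocated memory in MB. The diff keeps
    # torch.musa.reset_peak_memory_stats() commented out because torch_musa
    # does not support it yet, so the peak is never reset.
    print(f'peak MUSA memory: {get_max_musa_memory()} MB')
else:
    print('MUSA not available, current device:', get_device())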

mmengine/dist/dist.py

Lines changed: 17 additions & 0 deletions
@@ -415,12 +415,16 @@ def _broadcast_object_list(object_list: List[Any],
     current_device = torch.device('cpu')
     is_hccl_backend = group_backend == 'hccl'
     is_cncl_backend = group_backend == 'cncl'
+    is_mccl_backend = group_backend == 'mccl'
     if is_hccl_backend:
         current_device = torch.device('npu', torch.npu.current_device())
         object_sizes_tensor = object_sizes_tensor.to(current_device)
     elif is_cncl_backend:
         current_device = torch.device('mlu', torch.mlu.current_device())
         object_sizes_tensor = object_sizes_tensor.to(current_device)
+    elif is_mccl_backend:
+        current_device = torch.device('musa', torch.musa.current_device())
+        object_sizes_tensor = object_sizes_tensor.to(current_device)
     elif is_nccl_backend:
         # See note about using torch.cuda.current_device() here in
         # docstring. We cannot simply use my_rank since rank == device is
@@ -624,13 +628,21 @@ def _all_gather_object(object_list: List[Any],
     group_backend = get_backend(group)
     current_device = torch.device('cpu')
     is_nccl_backend = group_backend == torch_dist.Backend.NCCL
+    is_mccl_backend = group_backend == 'mccl'
     if is_nccl_backend:
         # See note about using torch.cuda.current_device() here in docstring.
         # We cannot simply use my_rank since rank == device is not necessarily
         # true.
         current_device = torch.device('cuda', torch.cuda.current_device())
         input_tensor = input_tensor.to(current_device)
         local_size = local_size.to(current_device)
+    elif is_mccl_backend:
+        # See note about using torch.musa.current_device() here in docstring.
+        # We cannot simply use my_rank since rank == device is not necessarily
+        # true.
+        current_device = torch.device('musa', torch.musa.current_device())
+        input_tensor = input_tensor.to(current_device)
+        local_size = local_size.to(current_device)
     # Gather all local sizes. This is so that we can find the max size, and
     # index until the correct size when deserializing the tensors.
     group_size = get_world_size(group=group)
@@ -776,10 +788,15 @@ def _gather_object(obj: Any,
     group_backend = get_backend(group)
     current_device = torch.device('cpu')
     is_nccl_backend = group_backend == torch_dist.Backend.NCCL
+    is_mccl_backend = group_backend == 'mccl'
     if is_nccl_backend:
         current_device = torch.device('cuda', torch.cuda.current_device())
         input_tensor = input_tensor.to(current_device)
         local_size = local_size.to(current_device)
+    elif is_mccl_backend:
+        current_device = torch.device('musa', torch.musa.current_device())
+        input_tensor = input_tensor.to(current_device)
+        local_size = local_size.to(current_device)
     # Gather all local sizes. This is so that we can find the max size, and
     # index until the correct size when deserializing the tensors.
     group_size = get_world_size(group=group)
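
The three helpers patched above back the public object collectives, so on an 'mccl' process group their temporary tensors now land on the 'musa' device automatically. A hedged sketch from the caller's side, assuming the process group was already initialized with the mccl backend (see dist/utils.py below); the payload is illustrative:

from mmengine import dist

payload = {'rank': dist.get_rank()}
# all_gather_object and broadcast_object_list reuse the patched private
# helpers, so no musa-specific code is needed on the caller's side.
gathered = dist.all_gather_object(payload)
dist.broadcast_object_list([payload], src=0)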

mmengine/dist/utils.py

Lines changed: 13 additions & 1 deletion
@@ -11,7 +11,8 @@
 from torch import Tensor
 from torch import distributed as torch_dist
 from torch.distributed import ProcessGroup
-from mmengine.device import is_mlu_available, is_npu_available
+from mmengine.device import (is_mlu_available, is_npu_available,
+                             is_musa_available)
 
 from collections.abc import Iterable, Mapping
 
@@ -117,6 +118,14 @@ def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None:
             rank=rank,
             world_size=int(os.environ['WORLD_SIZE']),
             **kwargs)
+    elif is_musa_available():
+        import torch_musa  # noqa: F401
+        torch.musa.set_device(rank)
+        torch_dist.init_process_group(
+            backend='mccl',
+            rank=rank,
+            world_size=int(os.environ['WORLD_SIZE']),
+            **kwargs)
     else:
         torch.cuda.set_device(local_rank)
 
@@ -527,6 +536,9 @@ def get_comm_device(group: Optional[ProcessGroup] = None) -> torch.device:
         return torch.device('mlu', torch.mlu.current_device())
     elif backend == 'smddp':
         return torch.device('cuda', torch.cuda.current_device())
+    elif backend == 'mccl':
+        import torch_musa
+        return torch.device('musa', torch_musa.current_device())
     else:
         # GLOO and MPI backends use cpu device by default
         return torch.device('cpu')
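
With the branch above, a torchrun launch on a MUSA host initializes the 'mccl' backend without user-side changes. A hedged sketch of what that looks like, assuming torchrun has set RANK, LOCAL_RANK and WORLD_SIZE and torch_musa is installed (script name and flags are illustrative):

# Launched e.g. with: torchrun --nproc_per_node=8 train.py
from mmengine.dist import get_comm_device, init_dist

# _init_dist_pytorch() checks is_musa_available() before the CUDA fallback,
# so the default backend argument is effectively replaced by 'mccl'.
init_dist(launcher='pytorch')
print(get_comm_device())  # expected: device(type='musa', index=...)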

mmengine/hooks/empty_cache_hook.py

Lines changed: 13 additions & 3 deletions
@@ -4,6 +4,7 @@
 import torch
 
 from mmengine.registry import HOOKS
+from ..device import is_cuda_available, is_musa_available
 from .hook import Hook
 
 DATA_BATCH = Optional[Union[dict, tuple, list]]
@@ -49,7 +50,10 @@ def _after_iter(self,
             mode (str): Current mode of runner. Defaults to 'train'.
         """
         if self._do_after_iter:
-            torch.cuda.empty_cache()
+            if is_cuda_available():
+                torch.cuda.empty_cache()
+            elif is_musa_available():
+                torch.musa.empty_cache()
 
     def _before_epoch(self, runner, mode: str = 'train') -> None:
         """Empty cache before an epoch.
@@ -59,7 +63,10 @@ def _before_epoch(self, runner, mode: str = 'train') -> None:
             mode (str): Current mode of runner. Defaults to 'train'.
         """
         if self._do_before_epoch:
-            torch.cuda.empty_cache()
+            if is_cuda_available():
+                torch.cuda.empty_cache()
+            elif is_musa_available():
+                torch.musa.empty_cache()
 
     def _after_epoch(self, runner, mode: str = 'train') -> None:
         """Empty cache after an epoch.
@@ -69,4 +76,7 @@ def _after_epoch(self, runner, mode: str = 'train') -> None:
             mode (str): Current mode of runner. Defaults to 'train'.
         """
         if self._do_after_epoch:
-            torch.cuda.empty_cache()
+            if is_cuda_available():
+                torch.cuda.empty_cache()
+            elif is_musa_available():
+                torch.musa.empty_cache()
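
Because the hook now dispatches on the available device, existing configs keep working unchanged on MUSA. A hedged config fragment for completeness (the three flags are the hook's existing options; the surrounding config is assumed):

# EmptyCacheHook now calls torch.musa.empty_cache() instead of
# torch.cuda.empty_cache() when only a MUSA device is present.
custom_hooks = [
    dict(type='EmptyCacheHook',
         before_epoch=False,
         after_epoch=True,
         after_iter=False),
]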

mmengine/logging/logger.py

Lines changed: 31 additions & 15 deletions
@@ -398,22 +398,38 @@ def _get_device_id():
     except ImportError:
         return 0
     else:
-        local_rank = int(os.getenv('LOCAL_RANK', '0'))
-        # TODO: return device id of npu and mlu.
-        if not torch.cuda.is_available():
-            return local_rank
-        cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
-        if cuda_visible_devices is None:
-            num_device = torch.cuda.device_count()
-            cuda_visible_devices = list(range(num_device))
-        else:
-            cuda_visible_devices = cuda_visible_devices.split(',')
+        MUSA_AVAILABLE = False
         try:
-            return int(cuda_visible_devices[local_rank])
-        except ValueError:
-            # handle case for Multi-Instance GPUs
-            # see #1148 for details
-            return cuda_visible_devices[local_rank]
+            import torch_musa
+            MUSA_AVAILABLE = True
+        except ImportError:
+            pass
+        if MUSA_AVAILABLE:
+            local_rank = int(os.getenv('LOCAL_RANK', '0'))
+            musa_visible_devices = os.getenv('MUSA_VISIBLE_DEVICES', None)
+            if musa_visible_devices is None:
+                num_device = torch_musa.device_count()
+                musa_visible_devices = list(range(num_device))
+            else:
+                musa_visible_devices = musa_visible_devices.split(',')
+            return int(musa_visible_devices[local_rank])
+        else:
+            local_rank = int(os.getenv('LOCAL_RANK', '0'))
+            # TODO: return device id of npu and mlu.
+            if not torch.cuda.is_available():
+                return local_rank
+            cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+            if cuda_visible_devices is None:
+                num_device = torch.cuda.device_count()
+                cuda_visible_devices = list(range(num_device))
+            else:
+                cuda_visible_devices = cuda_visible_devices.split(',')
+            try:
+                return int(cuda_visible_devices[local_rank])
+            except ValueError:
+                # handle case for Multi-Instance GPUs
+                # see #1148 for details
+                return cuda_visible_devices[local_rank]
 
 
 def _get_host_info() -> str:

mmengine/model/base_model/base_model.py

Lines changed: 15 additions & 0 deletions
@@ -222,6 +222,21 @@ def cuda(
         self._set_device(torch.device(device))
         return super().cuda(device)
 
+    def musa(
+        self,
+        device: Optional[Union[int, str, torch.device]] = None,
+    ) -> nn.Module:
+        """Overrides this method to call :meth:`BaseDataPreprocessor.musa`
+        additionally.
+
+        Returns:
+            nn.Module: The model itself.
+        """
+        if device is None or isinstance(device, int):
+            device = torch.device('musa', index=device)
+        self._set_device(torch.device(device))
+        return super().musa(device)
+
     def mlu(
         self,
         device: Union[int, str, torch.device, None] = None,

mmengine/model/base_model/data_preprocessor.py

Lines changed: 9 additions & 0 deletions
@@ -113,6 +113,15 @@ def cuda(self, *args, **kwargs) -> nn.Module:
         self._device = torch.device(torch.cuda.current_device())
         return super().cuda()
 
+    def musa(self, *args, **kwargs) -> nn.Module:
+        """Overrides this method to set the :attr:`device`
+
+        Returns:
+            nn.Module: The model itself.
+        """
+        self._device = torch.device(torch.musa.current_device())
+        return super().musa()
+
     def npu(self, *args, **kwargs) -> nn.Module:
         """Overrides this method to set the :attr:`device`
 
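
The two musa() overrides above mirror the existing cuda()/npu()/mlu() methods, so moving a model and its data preprocessor to a MUSA device looks the same as moving it to CUDA. A hedged sketch, with a hypothetical ToyModel standing in for any BaseModel subclass:

import torch
import torch.nn as nn

from mmengine.model import BaseModel


class ToyModel(BaseModel):
    """Hypothetical model used only to illustrate .musa()."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 2)

    def forward(self, inputs, data_samples=None, mode='tensor'):
        return self.linear(inputs)


# Per the new docstring, BaseModel.musa() also takes care of the attached
# BaseDataPreprocessor (see the data_preprocessor.py change above).
model = ToyModel().musa()
out = model(torch.randn(3, 4, device='musa'))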

mmengine/optim/optimizer/amp_optimizer_wrapper.py

Lines changed: 4 additions & 3 deletions
@@ -6,7 +6,7 @@
 import torch.nn as nn
 
 from mmengine.device import (is_cuda_available, is_mlu_available,
-                             is_npu_available)
+                             is_musa_available, is_npu_available)
 from mmengine.registry import OPTIM_WRAPPERS
 from mmengine.utils import digit_version
 from mmengine.utils.dl_utils import TORCH_VERSION
@@ -74,8 +74,9 @@ def __init__(self,
         assert digit_version(TORCH_VERSION) >= digit_version('1.6.0'), (
             '`torch.cuda.amp` is only available when pytorch version >= 1.6')
         assert is_cuda_available() or is_npu_available() or is_mlu_available(
-        ), ('``AmpOptimizerWrapper`` is only available training '
-            'on gpu, npu or mlu')
+        ) or is_musa_available(), (
+            '``AmpOptimizerWrapper`` is only available training '
+            'on gpu, npu, mlu or musa')
         super().__init__(**kwargs)
         self._scale_update_param = None
 
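
With the relaxed assertion, mixed-precision training can be requested on a MUSA-only machine the same way as on CUDA. A hedged config sketch (the optimizer settings are placeholders):

# Assumed config fragment; the AMP wrapper registered in OPTIM_WRAPPERS by
# this module no longer raises when MUSA is the only accelerator available.
optim_wrapper = dict(
    type='AmpOptimWrapper',
    optimizer=dict(type='SGD', lr=0.01, momentum=0.9))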

mmengine/runner/amp.py

Lines changed: 7 additions & 1 deletion
@@ -135,7 +135,13 @@ def autocast(device_type: Optional[str] = None,
 
     elif device_type == 'npu':
         pass
-
+    elif device_type == 'musa':
+        if dtype is None:
+            dtype = torch.get_autocast_gpu_dtype()
+        with torch.musa.amp.autocast(
+                enabled=enabled, dtype=dtype, cache_enabled=cache_enabled):
+            yield
+        return
     else:
         # Device like MPS does not support fp16 training or testing.
         # If an inappropriate device is set and fp16 is enabled, an error
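
From the caller's perspective the new branch keeps the existing autocast interface; it simply assumes torch_musa provides torch.musa.amp.autocast. A hedged usage sketch (tensor shapes are illustrative):

import torch

from mmengine.runner import autocast

# When no dtype is passed, the diff falls back to
# torch.get_autocast_gpu_dtype(), i.e. float16 by default.
with autocast(device_type='musa'):
    a = torch.randn(8, 8, device='musa')
    b = a @ a
print(b.dtype)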
