huggingface · akshatvishu · Feb 3, 2026 · Feb 4, 2026 · Feb 4, 2026
diff --git a/src/diffusers/models/_modeling_parallel.py b/src/diffusers/models/_modeling_parallel.py
@@ -290,7 +290,15 @@ def gather_size_by_comm(size: int, group: dist.ProcessGroup) -> List[int]:
     # HACK: Use Gloo backend for all_gather to avoid H2D and D2H overhead
     comm_backends = str(dist.get_backend(group=group))
     # NOTE: e.g., dist.init_process_group(backend="cpu:gloo,cuda:nccl")
-    gather_device = "cpu" if "cpu" in comm_backends else torch.accelerator.current_accelerator()
+    if "cpu" in comm_backends:
+        gather_device = "cpu"
+    elif hasattr(torch, "accelerator"):
+        acc = torch.accelerator.current_accelerator()
+        # Fall back to CUDA when no accelerator is returned.
+        gather_device = acc if acc is not None else torch.device("cuda")
+    else:
+        gather_device = torch.device("cuda")
+
     gathered_sizes = [torch.empty((1,), device=gather_device, dtype=torch.int64) for _ in range(world_size)]
     dist.all_gather(
         gathered_sizes,