
Commit 69c0518

Change the name of grad_recv_indices, fix the VPP hang
1 parent f1c40e1 commit 69c0518

2 files changed: +13 -11 lines


python/paddle/distributed/auto_parallel/pipelining/schedules.py

Lines changed: 7 additions & 7 deletions
@@ -1008,10 +1008,8 @@ def _get_1f1b_rank_ops(
     # earliest time step of first backward = [local_stages * group_size + 2 * (group_size - 1 - rank)]
     # warmup_ops = calculated above
     post_warmup_ops = (
-        (n_local_stages * pp_group_size + 2 * (pp_group_size - 1 - rank))
-        - (warmup_ops + rank)
-        - 1
-    )
+        n_local_stages * pp_group_size + 2 * (pp_group_size - 1 - rank)
+    ) - (warmup_ops + rank)
 
     if enable_zero_bubble:
         post_warmup_ops = pp_group_size - rank - 1
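
Note: the reworked expression drops the trailing `- 1`, so each rank idles one step less before its first backward. A minimal sketch of the arithmetic, with illustrative values (the real `n_local_stages`, `pp_group_size`, `rank`, and `warmup_ops` come from the schedule setup):

    # Illustrative values only; the real ones come from the schedule setup.
    n_local_stages, pp_group_size, rank, warmup_ops = 2, 4, 1, 6

    # Before this commit: one extra idle step.
    old_post_warmup_ops = (
        (n_local_stages * pp_group_size + 2 * (pp_group_size - 1 - rank))
        - (warmup_ops + rank)
        - 1
    )

    # After this commit: the first backward is scheduled one step earlier.
    new_post_warmup_ops = (
        n_local_stages * pp_group_size + 2 * (pp_group_size - 1 - rank)
    ) - (warmup_ops + rank)

    assert new_post_warmup_ops == old_post_warmup_ops + 1  # 5 vs. 4 here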
@@ -1076,11 +1074,13 @@
                 )
             )
             weight_op_count += 1
-            if op == warmup_ops + fwd_bwd_ops - 1:
-                # This is the last step in the 1F1B Phase, the bubbles are symmetrical with respect to the ending phase of the warm_up
-                rank_ops.extend([None] * post_warmup_ops)
         # Cooldown phase
         else:
+            # During cooldown phase, we need steps to align with 1f1b happening in other ranks
+            # TODO: we don't need to always append, after all 1f1b are finished we can stop appending None
+            if not enable_zero_bubble:
+                rank_ops.append(None)
+
             bwd_stage_index = backward_stage_index(op)
             bwd_stage_mb_index[bwd_stage_index] = (
                 bwd_mb_index := bwd_stage_mb_index[bwd_stage_index]

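Note: instead of padding a single block of `None` bubbles at the end of 1F1B, the cooldown loop now interleaves one idle step per backward. A toy sketch of the resulting pattern (not Paddle's actual scheduler; `rank_ops` and `enable_zero_bubble` mirror the diff, the op labels are invented):

    # Toy illustration of the cooldown padding added above; values invented.
    enable_zero_bubble = False
    rank_ops = []

    for bwd_op in ["B5", "B6", "B7"]:  # hypothetical cooldown backward ops
        if not enable_zero_bubble:
            # Idle one step so this rank's timeline lines up with ranks
            # that are still executing 1F1B forward/backward pairs.
            rank_ops.append(None)
        rank_ops.append(bwd_op)

    print(rank_ops)  # [None, 'B5', None, 'B6', None, 'B7']
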
python/paddle/distributed/auto_parallel/pipelining/stage.py

Lines changed: 6 additions & 4 deletions
@@ -205,7 +205,9 @@ def __init__(
         # Forward infra
         self.args_recv_info: dict[int, tuple[InputInfo, ...]] = {}
         self.act_send_info: dict[int, list] = {}
-        self.grad_recv_indices: dict[int, list] = {}
+        self._need_grad_indices: dict[int, list] = (
+            {}
+        )  # record the index of output that needs to receive grad from the next stage.
         # Backward infra will created lazily
         self.grad_recv_info: dict = {}
         self.grad_send_info: list | None = None
@@ -1121,10 +1123,10 @@ def _prepare_forward_infra(
             if not self.is_last:
                 self.act_send_info[idx] = [self.stage_index + 1]
                 if not outputs_meta[idx].stop_gradient:
-                    self.grad_recv_indices[idx] = [self.stage_index + 1]
+                    self._need_grad_indices[idx] = [self.stage_index + 1]
             else:
                 self.act_send_info[idx] = []
-                self.grad_recv_indices[idx] = []
+                self._need_grad_indices[idx] = []
 
         return outputs
 

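Note: to make the renamed mapping concrete, here is a hypothetical result of the loop above for a stage with `stage_index = 2` and three outputs, where output 1 has `stop_gradient=True` (all indices invented for illustration):

    # Hypothetical contents of self._need_grad_indices after the loop above.
    _need_grad_indices = {
        0: [3],  # output 0 receives its grad from stage 3 (stage_index + 1)
        2: [3],  # output 2 likewise
        # output 1 gets no entry: stop_gradient=True, so no grad is received
    }
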
@@ -1239,7 +1241,7 @@ def _create_grad_recv_info(
                     dst_list[0],
                     _make_tensor_from_meta(self.grads_meta[idx]),
                 )
-                for idx, dst_list in self.grad_recv_indices.items()
+                for idx, dst_list in self._need_grad_indices.items()
             ]
         )
         return grad_recv_info

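Note: for context, a simplified standalone sketch of how the comprehension in `_create_grad_recv_info` consumes `_need_grad_indices`; the `_make_tensor_from_meta` stub and the meta dicts below stand in for Paddle's real helpers and are illustration only:

    # Simplified stand-ins for Paddle's helpers, for illustration only.
    def _make_tensor_from_meta(meta):
        return {"shape": meta["shape"], "dtype": meta["dtype"]}

    grads_meta = {0: {"shape": [8, 16], "dtype": "float32"}}
    _need_grad_indices = {0: [3]}  # output 0 expects its grad from stage 3

    grad_recv_info = tuple(
        (dst_list[0], _make_tensor_from_meta(grads_meta[idx]))
        for idx, dst_list in _need_grad_indices.items()
    )
    print(grad_recv_info)  # ((3, {'shape': [8, 16], 'dtype': 'float32'}),)
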