fix: port wrap_data_iterator pattern from PR #4659 to fix DCP test

Phlip79 · Phlip79 · commit e04e1c2d8ba9 · 2026-05-11T18:50:24.000Z
The Phase 1 merge of #4716 took main's version of training.py per the skill's "Files to Override from Main" rule, which uses the HybridCPDataLoaderWrapper class wrapped once outside train_step. That broke the DCP test (gpt3_mcore_te_tp2_pp1_cp4_dcp) in two ways: 1. RuntimeError: Trying to resize storage that is not resizable - fixed by c3dbea7 (rename args.hybrid_context_parallel -> args.dynamic_context_parallel). 2. AssertionError: data iterator is not wrapped with RerunDataIterator - the outside-train_step wrap converted train_data_iterator from a RerunDataIterator to a plain iterator, but rerun_state_machine's should_run_forward_backward asserts the wrap. PR #4659 resolved this by keeping dev's wrap_data_iterator pattern instead of main's HybridCPDataLoaderWrapper, calling wrap_data_iterator INSIDE train_step (after should_run_forward_backward) and inside the eval loop. That keeps the original RerunDataIterator visible to the assertion and only swaps in the packed iterator for the forward_backward_func call. Port that pattern verbatim from PR #4659's training.py: - Replace HybridCPDataLoaderWrapper import with wrap_data_iterator - Remove the outside-train_step wrap (was at lines 3000-3001) - Inside train_step: add the if config.sequence_packing_scheduler is not None block before forward_backward_func, unpacking (data_iterator, num_microbatches, seqlen_sum_this_global_batch, seqlen_squared_sum_this_global_batch); pass num_microbatches=num_microbatches to forward_backward_func - Inside eval loop: add the same wrap with try/except StopIteration, using packed_data_iterator and scheduled_eval_num_microbatches. Note: this leaves HybridCPDataLoaderWrapper and its imports (Any, List, BalancedCPScheduler) as dead code in megatron/core/datasets/data_schedule.py. Cleanup of that file (and of the remaining structural diff in training.py / data_samplers.py / utils.py vs PR #4659's tree) is left to a follow-up.
diff --git a/megatron/training/training.py b/megatron/training/training.py
@@ -181,7 +181,7 @@ def set_startup_timestamps(program_start=None, main_entry=None):
 except ImportError:
     HAVE_FSDP2 = False
 
-from megatron.core.datasets.data_schedule import HybridCPDataLoaderWrapper
+from megatron.core.datasets.data_schedule import wrap_data_iterator
 from megatron.core.distributed import finalize_model_grads
 from megatron.core.enums import ModelType
 from megatron.core.inference.symmetric_memory import SymmetricMemoryManager
@@ -2030,6 +2030,27 @@ def train_step(
                     if isinstance(optim_instance, DistributedOptimizer):
                         optim_instance._copy_main_params_to_param_buffer()
 
+        if config.sequence_packing_scheduler is not None:
+            # This wrapper is designed to support DP-balanced THD and dynamic-CP.
+            # Before wrapping, the data_iterator returns either a single sequence per get_item call, or a list where each element is a sequence.
+            # The wrapper is responsible for:
+            # 1. scheduling the sequences across ranks
+            # 2. packing them into THD format
+            # 3. broadcasting FLOPs parameters and num_microbatches to TP ranks to support a variable num_microbatches
+            # 4. broadcasting metadata (cu_seqlens, cu_seqlens_padded, max_seqlen, etc.) to PP ranks
+            # 5. returning the packed data iterator and the FLOPs parameters
+            (
+                data_iterator,
+                num_microbatches,
+                seqlen_sum_this_global_batch,
+                seqlen_squared_sum_this_global_batch,
+            ) = wrap_data_iterator(data_iterator, config, get_num_microbatches())
+        else:
+            # data_iterator unchanged
+            num_microbatches = get_num_microbatches()
+            seqlen_sum_this_global_batch = args.seq_length * args.global_batch_size
+            seqlen_squared_sum_this_global_batch = args.seq_length**2 * args.global_batch_size
+
         # Forward pass.
         if save_activations_in_this_iteration:
             enable_activation_logging(model, args.save)
@@ -2041,7 +2062,7 @@ def train_step(
             forward_step_func=forward_step_func,
             data_iterator=data_iterator,
             model=model,
-            num_microbatches=get_num_microbatches(),
+            num_microbatches=num_microbatches,
             seq_length=args.seq_length,
             micro_batch_size=args.micro_batch_size,
             decoder_seq_length=args.decoder_seq_length,
@@ -2997,9 +3018,6 @@ def train(
     energy_monitor = get_energy_monitor()
     one_logger = get_one_logger()
 
-    if args.dynamic_context_parallel:
-        train_data_iterator = iter(HybridCPDataLoaderWrapper(train_data_iterator, config))
-
     if args.run_workload_inspector_server:
         try:
             import threading
@@ -3699,11 +3717,30 @@ def evaluate(
             # Don't care about timing during evaluation
             config.timers = None
             ft_integration.on_eval_step_start()
+            if config.sequence_packing_scheduler is not None:
+                # This wrapper is designed to support DP-balanced THD and dynamic-CP.
+                # Before wrapping, the data_iterator returns either a single sequence per get_item call, or a list where each element is a sequence.
+                # The wrapper is responsible for:
+                # 1. scheduling the sequences across ranks
+                # 2. packing them into THD format
+                # 3. broadcasting FLOPs parameters and num_microbatches to TP ranks to support a variable num_microbatches
+                # 4. broadcasting metadata (cu_seqlens, cu_seqlens_padded, max_seqlen, etc.) to PP ranks
+                # 5. returning the packed data iterator and the FLOPs parameters
+                try:
+                    (packed_data_iterator, scheduled_eval_num_microbatches, _, _) = (
+                        wrap_data_iterator(data_iterator, config, eval_num_microbatches)
+                    )
+                except StopIteration:
+                    # Validation data iterator exhausted, stop evaluation early.
+                    break
+            else:
+                packed_data_iterator = data_iterator
+                scheduled_eval_num_microbatches = eval_num_microbatches
             loss_dicts = forward_backward_func(
                 forward_step_func=forward_step_func,
-                data_iterator=data_iterator,
+                data_iterator=packed_data_iterator,
                 model=model,
-                num_microbatches=eval_num_microbatches,
+                num_microbatches=scheduled_eval_num_microbatches,
                 seq_length=args.seq_length,
                 micro_batch_size=eval_micro_batch_size,
                 decoder_seq_length=args.decoder_seq_length,