Skip to content

Commit b9cc7e8

Browse files
authored
[Trainer][Megatron] Sequence packing + Context Parallel for Megatron (#274)
# Overview Adds sequence packing + context parallel support for megatron backend. Note that context parallel without sequence packing is not supported. ## Correctness Check ### CP + TP + PP <img width="368" height="275" alt="image" src="https://github.com/user-attachments/assets/53fdd009-3af9-4352-8e63-7604b2dfdeee" /> ### Just Sequence Packing <img width="366" height="278" alt="image" src="https://github.com/user-attachments/assets/9a40dfdf-af8c-44e8-bc54-78e13d187daa" /> ### Just CP + Sequence Packing <img width="364" height="281" alt="image" src="https://github.com/user-attachments/assets/c69522e8-52b1-4581-8a66-a579b29bbb0d" /> ### Timing Adding CP is slower as expected, adding just sequence packing is also slightly slower for tp=2,pp=2. <img width="362" height="286" alt="image" src="https://github.com/user-attachments/assets/9109ce98-0740-46ce-8a92-de5cd8cf2ec2" /> This seems to be because of overhead in computing rotary positional embeddings - without sequence packing, it's a batched call for a well-formed tensor, while with sequence packing, it iterates over sequences one by one: #274 (comment)
1 parent c2bfe61 commit b9cc7e8

File tree

5 files changed

+282
-58
lines changed

5 files changed

+282
-58
lines changed

skyrl-train/examples/training_backends/megatron/run_megatron.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ INFERENCE_BACKEND="vllm" # currently only vllm is supported for megatron
1515

1616
MEGATRON_TP=2
1717
MEGATRON_PP=2
18+
MEGATRON_CP=1
1819

1920
uv run --isolated --extra $INFERENCE_BACKEND --extra mcore -m skyrl_train.entrypoints.main_base \
2021
data.train_data="['$DATA_DIR/train.parquet']" \
@@ -29,9 +30,11 @@ uv run --isolated --extra $INFERENCE_BACKEND --extra mcore -m skyrl_train.entryp
2930
generator.inference_engine_tensor_parallel_size=1 \
3031
megatron_config.policy.tensor_model_parallel_size=$MEGATRON_TP \
3132
megatron_config.policy.pipeline_model_parallel_size=$MEGATRON_PP \
33+
megatron_config.policy.context_parallel_size=$MEGATRON_CP \
3234
megatron_config.ref.tensor_model_parallel_size=$MEGATRON_TP \
35+
megatron_config.ref.context_parallel_size=$MEGATRON_CP \
3336
megatron_config.ref.pipeline_model_parallel_size=$MEGATRON_PP \
34-
trainer.use_sample_packing=false \
37+
trainer.use_sample_packing=true \
3538
trainer.epochs=20 \
3639
trainer.eval_batch_size=1024 \
3740
trainer.eval_before_train=false \
@@ -56,7 +59,7 @@ uv run --isolated --extra $INFERENCE_BACKEND --extra mcore -m skyrl_train.entryp
5659
generator.gpu_memory_utilization=0.6 \
5760
trainer.logger="$LOGGER" \
5861
trainer.project_name="gsm8k_megatron" \
59-
trainer.run_name="gsm8k_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_${MODEL_NAME}" \
62+
trainer.run_name="gsm8k_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_${MODEL_NAME}" \
6063
trainer.resume_mode=null \
6164
trainer.ckpt_path="$HOME/ckpts/gsm8k_megatron_ckpt" \
6265
$@

skyrl-train/skyrl_train/distributed/megatron/megatron_utils.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Utils ported from Verl
22
# https://github.com/volcengine/verl/blob/e1603dc97f3c20c58feed1f5be34acd5c72a830c/verl/utils/megatron_utils.py#L4
3+
# https://github.com/volcengine/verl/blob/dfa3933ac44b545fca1f6a8519fd07394a2cde1c/verl/models/mcore/util.py
34
# The original copyright is reproduced below:
45

56
# Copyright 2024 Bytedance Ltd. and/or its affiliates
@@ -27,6 +28,7 @@
2728
from megatron.core.optimizer import ChainedOptimizer
2829
from megatron.core import parallel_state as mpu
2930
from megatron.core.utils import get_attr_wrapped_model
31+
from megatron.core.packed_seq_params import PackedSeqParams
3032

3133
ALL_MODULE_WRAPPER_CLASSNAMES = (DDP, Float16Module)
3234

@@ -291,6 +293,148 @@ def _iter_opts(opt):
291293
torch.cuda.empty_cache()
292294

293295

296+
def preprocess_packed_seqs(
    input_ids: torch.Tensor, attention_mask: torch.Tensor, pre_process: bool = True
) -> tuple[torch.Tensor, PackedSeqParams]:
    """Pack a padded batch into a single THD-format sequence for Megatron.

    With context parallelism (CP), each sequence is split into ``cp_size * 2``
    chunks and every CP rank stores 2 of them (rank 0 gets the first and last
    chunk, rank 1 the second and second-to-last, and so on). This balances
    causal-attention work across CP ranks.
    See https://github.com/NVIDIA/TransformerEngine/issues/1368

    Args:
        input_ids: (batch_size, seq_len) token ids.
        attention_mask: (batch_size, seq_len) boolean mask of valid tokens.
        pre_process: whether this pipeline stage consumes input embeddings.
            When False, the packed tensor is not materialized and ``input_ids``
            is returned unchanged (only the PackedSeqParams are needed).

    Returns:
        Tuple of the packed input of shape ``(1, total_padded_len // cp_size, ...)``
        (or the untouched ``input_ids`` when ``pre_process`` is False) and the
        ``PackedSeqParams`` describing per-sequence boundaries in THD format.
    """
    batch_size = input_ids.shape[0]

    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    tp_size = mpu.get_tensor_model_parallel_world_size()
    cp_size = mpu.get_context_parallel_world_size()
    cp_rank = mpu.get_context_parallel_rank()
    # Each per-sequence length must be divisible by the TP size (for sequence
    # parallelism) and, when CP is on, by 2 * cp_size so the sequence can be
    # cut into 2 * cp_size equal chunks.
    align_size = tp_size * cp_size * 2 if cp_size > 1 else tp_size

    pad_size = (align_size - seqlens_in_batch % align_size) % align_size
    seqlens_in_batch_padded = seqlens_in_batch + pad_size

    # NOTE: only the padded cumulative lengths are needed downstream; the
    # unpadded cu_seqlens previously computed here was dead code and is removed.
    cu_seqlens_padded = torch.zeros(batch_size + 1, dtype=torch.int32, device=input_ids.device)
    cu_seqlens_padded[1:] = torch.cumsum(seqlens_in_batch_padded, dim=0)

    # ----------------------------------------------------------------------------
    # Move the index information needed in the subsequent loop to the CPU at once,
    # to avoid frequent .item() calls in the loop that cause D2H synchronization
    # ----------------------------------------------------------------------------
    seqlens_in_batch_cpu: list[int] = seqlens_in_batch.tolist()  # original valid lengths
    seqlens_in_batch_padded_cpu: list[int] = seqlens_in_batch_padded.tolist()  # lengths after padding
    cu_seqlens_padded_cpu: list[int] = cu_seqlens_padded.tolist()  # start positions (after padding)

    # Pure Python int calculation to avoid further synchronization
    max_seqlen_in_batch = max(seqlens_in_batch_padded_cpu)

    shape = list(input_ids.shape[1:])
    shape[0] = sum(seqlens_in_batch_padded_cpu) // cp_size
    if pre_process:
        input_ids_rmpad = torch.zeros(shape, dtype=input_ids.dtype, device=input_ids.device)
        for i in range(batch_size):
            # Use Python ints, so no GPU->CPU sync in the loop.
            if cp_size <= 1:
                # No CP: copy the valid tokens into this sequence's padded slot.
                seqlen = seqlens_in_batch_cpu[i]
                start_idx = cu_seqlens_padded_cpu[i]
                input_ids_rmpad[start_idx : start_idx + seqlen] = input_ids[i, attention_mask[i]]
                continue

            seqlen_padded_i = seqlens_in_batch_padded_cpu[i]
            seqlen = seqlen_padded_i // cp_size  # this rank's share of sequence i
            half_seqlen = seqlen // 2
            start_idx = cu_seqlens_padded_cpu[i] // cp_size
            # This rank stores 2 chunks: chunk cp_rank from the head of the
            # sequence, and its mirror chunk from the tail.
            d = input_ids[i, attention_mask[i]]
            input_ids_rmpad[start_idx : start_idx + half_seqlen] = d[
                half_seqlen * cp_rank : half_seqlen * (cp_rank + 1)
            ]

            # The tail chunk may overlap the padding region; clamp to the number
            # of real tokens (the rest of the slot stays zero, i.e. padding).
            remain_start = seqlen_padded_i - half_seqlen * (cp_rank + 1)
            remain_end = seqlen_padded_i - half_seqlen * cp_rank
            remain_end = min(remain_end, d.shape[0])
            remain_len = remain_end - remain_start
            if remain_len > 0:
                input_ids_rmpad[start_idx + half_seqlen : start_idx + half_seqlen + remain_len] = d[
                    remain_start:remain_end
                ]

    packed_seq_params = PackedSeqParams(
        qkv_format="thd",
        cu_seqlens_q=cu_seqlens_padded,
        max_seqlen_q=max_seqlen_in_batch,
        cu_seqlens_kv=cu_seqlens_padded,
        max_seqlen_kv=max_seqlen_in_batch,
        cu_seqlens_q_padded=cu_seqlens_padded,
        cu_seqlens_kv_padded=cu_seqlens_padded,
    )
    if pre_process:
        return input_ids_rmpad.unsqueeze(0), packed_seq_params
    else:
        return input_ids, packed_seq_params
377+
378+
def postprocess_packed_seqs(
    output: torch.Tensor,
    packed_seq_params: PackedSeqParams,
    attention_mask: torch.Tensor,
    batch_size: int,
    seq_len: int,
    post_process: bool = True,
) -> torch.Tensor:
    """Unpack THD-format model output back into a padded (batch, seq_len, ...) tensor.

    Inverse of ``preprocess_packed_seqs``: when cp_size > 1, gathers the
    per-CP-rank chunks, reassembles each sequence in original token order, and
    scatters valid tokens back to the positions given by ``attention_mask``.

    Args:
        output: packed model output of shape (1, packed_len, ...).
        packed_seq_params: sequence boundary metadata from ``preprocess_packed_seqs``.
        attention_mask: (batch_size, seq_len) boolean mask of valid tokens.
        batch_size: original batch size.
        seq_len: original (padded) sequence length.
        post_process: whether this pipeline stage produces final outputs; when
            False, ``output`` is returned unchanged.

    Returns:
        Tensor of shape (batch_size, seq_len, ...) with zeros at masked positions.
    """
    if not post_process:
        return output

    # -------------------------------------------------------------------------
    # Move the lengths and offsets needed for subsequent Python-level indexing
    # to the CPU in advance, to avoid a large number of .item() calls in the loop
    # -------------------------------------------------------------------------
    cu_padded_cpu: list[int] = packed_seq_params.cu_seqlens_q_padded.tolist()
    seq_lens_cpu: list[int] = attention_mask.sum(dim=1, dtype=torch.int32).cpu().tolist()

    shape = [batch_size, seq_len] + list(output.shape[2:])  # (1, packed, dim) -> (batch, seq_len, dim)
    output_new = torch.zeros(shape, dtype=output.dtype, device=output.device)

    cp_size = mpu.get_context_parallel_world_size()
    # All-gather output across the context parallel group
    if cp_size > 1:
        # output shape: [1, packed_len, hidden_dim]
        # need to gather across cp group and concatenate in sequence dimension
        output_list = [torch.empty_like(output) for _ in range(cp_size)]
        torch.distributed.all_gather(output_list, output.detach(), group=mpu.get_context_parallel_group())
        # Put the local, autograd-connected tensor back into this rank's slot
        # (the gathered copies are detached).
        output_list[mpu.get_context_parallel_rank()] = output
    else:
        output_list = [output]
    for i in range(batch_size):
        if cp_size <= 1:
            s = seq_lens_cpu[i]
            start_idx = cu_padded_cpu[i]
            output_new[i, attention_mask[i]] = output[0][start_idx : start_idx + s]
            continue
        s_len_padded_chunk = (cu_padded_cpu[i + 1] - cu_padded_cpu[i]) // cp_size
        half_seqlen = s_len_padded_chunk // 2
        s_len = seq_lens_cpu[i]
        s_len_padded = s_len_padded_chunk * cp_size
        # Fix: allocate with the output's dtype — a bare torch.empty defaults to
        # float32 and would force an implicit cast on every chunk assignment.
        tmp = torch.empty(s_len_padded, *output.shape[2:], dtype=output.dtype, device=output.device)
        # Loop-invariant: the start of sequence i inside each rank's packed buffer.
        packed_start_idx = cu_padded_cpu[i] // cp_size
        for j in range(cp_size):
            o = output_list[j][0]
            # Rank j holds 2 chunks of sequence i: chunk j from the head and its
            # mirror chunk from the tail (load-balanced causal split).
            o0, o1 = (
                o[packed_start_idx : packed_start_idx + half_seqlen],
                o[packed_start_idx + half_seqlen : packed_start_idx + s_len_padded_chunk],
            )
            tmp[j * half_seqlen : (j + 1) * half_seqlen] = o0
            tmp[s_len_padded - (j + 1) * half_seqlen : s_len_padded - j * half_seqlen] = o1
        # Drop the alignment padding and scatter back to padded positions.
        output_new[i, attention_mask[i]] = tmp[:s_len]

    return output_new
436+
437+
294438
def remove_left_padding(
295439
input_ids: torch.Tensor,
296440
attention_mask: torch.Tensor,

skyrl-train/skyrl_train/utils/utils.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -120,14 +120,19 @@ def validate_megatron_cfg(cfg: DictConfig):
120120
assert cfg.generator.backend == "vllm", "only vllm is supported for with megatron"
121121
assert cfg.trainer.placement.colocate_all, "only colocate_all=True is supported for megatron training"
122122
assert cfg.trainer.critic.model.path is None, "only GRPO training is currently supported for megatron"
123-
assert not cfg.trainer.use_sample_packing, "sample packing is not yet supported for megatron"
123+
124+
if cfg.trainer.flash_attn:
125+
import flash_attn
126+
127+
version = flash_attn.__version__
128+
if version > "2.7.4.post1":
129+
raise ValueError("flash_attn <= 2.7.4.post1 is required for using the megatron backend with flash_attn")
124130

125131
worker_configs = [(cfg.trainer.policy, "policy"), (cfg.trainer.ref, "ref")]
126132
for config, worker_type in worker_configs:
127-
# context, expert, and export tensor parallel are not yet supported for megatron
128-
assert (
129-
config.megatron_config.context_parallel_size == 1
130-
), f"found {worker_type}.context_parallel_size > 1, context parallel is not yet supported for megatron"
133+
# context, expert, and expert tensor parallel are not yet supported for megatron
134+
if config.megatron_config.context_parallel_size > 1:
135+
assert cfg.trainer.use_sample_packing, "context parallel is only supported with sample packing"
131136
assert (
132137
config.megatron_config.expert_model_parallel_size == 1
133138
), f"found {worker_type}.expert_model_parallel_size > 1, expert model parallel is not yet supported for megatron"

skyrl-train/skyrl_train/workers/megatron/megatron_policy.py

Lines changed: 75 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
from skyrl_train.distributed.megatron.megatron_utils import (
1515
make_batch_generator,
16+
preprocess_packed_seqs,
17+
postprocess_packed_seqs,
1618
remove_left_padding,
1719
recover_left_padding,
1820
)
@@ -34,6 +36,7 @@ def __init__(
3436
self.actor_module = actor_module
3537
self.actor_optimizer = actor_optimizer
3638
self.policy_loss_fn = policy_loss_fn
39+
self.use_sample_packing = self.cfg.trainer.use_sample_packing
3740

3841
config = get_model_config(self.actor_module[0])
3942
# This is set to None by default: https://github.com/NVIDIA/Megatron-LM/blob/07b22a05136a3cb08ece05f7de38cf6aeeb165fb/megatron/core/model_parallel_config.py#L95
@@ -86,6 +89,7 @@ def collection_func(logits, data):
8689
vocab_end_index=(tp_rank + 1) * logits.shape[-1],
8790
tp_group=tp_grp,
8891
inference_only=True,
92+
cp_group=None, # we handle cp gathering in `postprocess_packed_seqs`
8993
chunk_size=None,
9094
)
9195
return 0.0, {"log_probs": token_logprobs}
@@ -96,27 +100,48 @@ def forward_step(batch_iter, model):
96100
attention_mask = batch["attention_mask"].to(bool)
97101
position_ids = batch["position_ids"]
98102

99-
new_sequences, new_attention_mask, new_position_ids = remove_left_padding(
100-
sequences,
101-
attention_mask,
102-
position_ids,
103-
self.tf_config.sequence_parallel,
104-
pre_process=mpu.is_pipeline_first_stage(ignore_virtual=True),
105-
)
103+
if self.use_sample_packing:
104+
new_sequences, packed_seq_params = preprocess_packed_seqs(
105+
sequences,
106+
attention_mask,
107+
pre_process=mpu.is_pipeline_first_stage(ignore_virtual=True),
108+
)
109+
new_attention_mask = None
110+
new_position_ids = None
111+
else:
112+
new_sequences, new_attention_mask, new_position_ids = remove_left_padding(
113+
sequences,
114+
attention_mask,
115+
position_ids,
116+
self.tf_config.sequence_parallel,
117+
pre_process=mpu.is_pipeline_first_stage(ignore_virtual=True),
118+
)
119+
packed_seq_params = None
106120

107121
outputs = model(
108122
new_sequences,
109123
new_position_ids,
110124
new_attention_mask,
125+
packed_seq_params=packed_seq_params,
111126
)
112127

113-
outputs = recover_left_padding(
114-
outputs,
115-
new_attention_mask,
116-
attention_mask,
117-
seq_len,
118-
post_process=mpu.is_pipeline_last_stage(ignore_virtual=True),
119-
)
128+
if self.use_sample_packing:
129+
outputs = postprocess_packed_seqs(
130+
outputs,
131+
packed_seq_params,
132+
attention_mask,
133+
micro_batch_size,
134+
seq_len,
135+
post_process=mpu.is_pipeline_last_stage(ignore_virtual=True),
136+
)
137+
else:
138+
outputs = recover_left_padding(
139+
outputs,
140+
new_attention_mask,
141+
attention_mask,
142+
seq_len,
143+
post_process=mpu.is_pipeline_last_stage(ignore_virtual=True),
144+
)
120145

121146
return outputs, partial(collection_func, data=batch)
122147

@@ -192,6 +217,7 @@ def loss_func(logits, data):
192217
vocab_end_index=(tp_rank + 1) * logits.shape[-1],
193218
tp_group=tp_grp,
194219
inference_only=False,
220+
cp_group=None, # we handle cp gathering in `postprocess_packed_seqs`
195221
chunk_size=None,
196222
)
197223

@@ -240,27 +266,48 @@ def forward_step(batch_iter, model):
240266
attention_mask = batch["attention_mask"].to(bool)
241267
position_ids = batch["position_ids"]
242268

243-
new_sequences, new_attention_mask, new_position_ids = remove_left_padding(
244-
sequences,
245-
attention_mask,
246-
position_ids,
247-
self.tf_config.sequence_parallel,
248-
pre_process=mpu.is_pipeline_first_stage(ignore_virtual=True),
249-
)
269+
if self.use_sample_packing:
270+
new_sequences, packed_seq_params = preprocess_packed_seqs(
271+
sequences,
272+
attention_mask,
273+
pre_process=mpu.is_pipeline_first_stage(ignore_virtual=True),
274+
)
275+
new_attention_mask = None
276+
new_position_ids = None
277+
else:
278+
new_sequences, new_attention_mask, new_position_ids = remove_left_padding(
279+
sequences,
280+
attention_mask,
281+
position_ids,
282+
self.tf_config.sequence_parallel,
283+
pre_process=mpu.is_pipeline_first_stage(ignore_virtual=True),
284+
)
285+
packed_seq_params = None
250286

251287
outputs = model(
252288
new_sequences,
253289
new_position_ids,
254290
new_attention_mask,
291+
packed_seq_params=packed_seq_params,
255292
)
256293

257-
outputs = recover_left_padding(
258-
outputs,
259-
new_attention_mask,
260-
attention_mask,
261-
seq_len,
262-
post_process=mpu.is_pipeline_last_stage(ignore_virtual=True),
263-
)
294+
if self.use_sample_packing:
295+
outputs = postprocess_packed_seqs(
296+
outputs,
297+
packed_seq_params,
298+
attention_mask,
299+
micro_batch_size,
300+
seq_len,
301+
post_process=mpu.is_pipeline_last_stage(ignore_virtual=True),
302+
)
303+
else:
304+
outputs = recover_left_padding(
305+
outputs,
306+
new_attention_mask,
307+
attention_mask,
308+
seq_len,
309+
post_process=mpu.is_pipeline_last_stage(ignore_virtual=True),
310+
)
264311

265312
return outputs, partial(loss_func, data=batch)
266313

0 commit comments

Comments
 (0)