Commit 25861fb

[sglang] Upgrade sglang to 0.4.6.post1 & misc fixes (volcengine#1385)
### Checklist Before Starting

- [x] Search for similar PR(s).

### What does this PR do?

- [x] Upgrade the required SGLang version to 0.4.6.post1, which supports Qwen3.
- [x] Fix: `flush_cache` was never awaited.
- [x] Remove an unused env var.
- [x] Fix: add the rank number to the port to avoid SGLang picking the same port when `random.seed` is set.
- [x] Feat: disable the SGLang memory imbalance check by default (sgl-project/sglang#5426).
- [x] Update `setup.py` so that older pip versions can still resolve the dependencies.
- [x] Fix: `tools_kwargs` length mismatch with the batch (volcengine#1380).

> Add one-line overview of what this PR aims to achieve or accomplish.

### High-Level Design

> Demonstrate the high-level design if this PR is complex.

### Specific Changes

> List the specific changes.

### API

> Demonstrate how the API changes if any.

### Usage Example

> Provide usage example(s) for easier usage.

```python
# Add code snippet or script demonstrating how to use this
```

### Test

> For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### Additional Info.

- **Issue Number**: Fixes issue # or discussion # if any.
- **Training**: [Note which backend this PR will affect: FSDP, Megatron, both, or none]
- **Inference**: [Note which backend this PR will affect: vLLM, SGLang, both, or none]

### Checklist Before Submitting

- [ ] Read the [Contribute Guide](https://github.com/volcengine/verl?tab=readme-ov-file#contribution-guide).
- [ ] Apply [pre-commit checks](https://github.com/volcengine/verl?tab=readme-ov-file#code-linting-and-formatting).
- [ ] Add `[BREAKING]` to the PR title if it breaks any API.
- [ ] Update the documentation about your changes in the [docs](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add CI test(s) if necessary.
1 parent 1d14d57 commit 25861fb

File tree

9 files changed: +26 −27 lines changed

.github/workflows/e2e_ppo_trainer.yml

Lines changed: 4 additions & 4 deletions

```diff
@@ -180,7 +180,7 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.5.post3
+      image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -209,7 +209,7 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.5.post3
+      image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -238,7 +238,7 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.5.post3
+      image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -267,7 +267,7 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.5.post3
+      image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
       options: --gpus all --shm-size=50g # Visual dataloader requires large memory
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
```

.github/workflows/sgl.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -41,7 +41,7 @@ jobs:
       HF_HUB_ENABLE_HF_TRANSFER: 1
       SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True"
     container:
-      image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.5.post3
+      image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
```

docker/Dockerfile.sglang

Lines changed: 4 additions & 4 deletions

```diff
@@ -36,8 +36,8 @@ RUN pip config set global.index-url "${PIP_INDEX}" && \
     pip config set global.extra-index-url "${PIP_INDEX}" && \
     python -m pip install --upgrade pip

-# Install sglang-0.4.5.post3 and torch-memory-saver
-RUN pip install "sglang[all]==0.4.5.post3" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
+# Install sglang-0.4.6.post1 and torch-memory-saver
+RUN pip install "sglang[all]==0.4.6.post1" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir

 # Install torch-2.6.0
 RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
@@ -47,8 +47,8 @@ RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.

 # Install flash_attn-2.7.4.post1
 RUN pip uninstall -y transformer-engine flash-attn && \
-    wget -v https://ghfast.top/https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
-    pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+    wget -v https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
+    pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl

 # Fix cv2
 RUN pip uninstall -y pynvml nvidia-ml-py && \
```

docs/start/install.rst

Lines changed: 1 addition & 1 deletion

```diff
@@ -42,7 +42,7 @@ For vLLM with Megatron or FSDP, please use ``whatcanyousee/verl:ngc-cu124-vllm0.

 For latest vLLM with FSDP, please refer to ``hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0``.

-For SGLang with FSDP, please use ``ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.5.post3`` which is provided by SGLang RL Group.
+For SGLang with FSDP, please use ``ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1`` which is provided by SGLang RL Group.

 See files under ``docker/`` for NGC-based image or if you want to build your own.
```

docs/workers/sglang_worker.rst

Lines changed: 5 additions & 5 deletions

```diff
@@ -10,18 +10,18 @@ Introduction
 ------------
 `SGLang <https://github.com/sgl-project/sglang>`_ is an open-source state-of-the-art inference service engine, fully adopted by xAI to support all inference needs of Grok during research and serving processes.

-Currently, verl fully supports using SGLang as the inference engine during the rollout phase. As a rollout engine, SGLang provides the same feature coverage as vLLM., including memory saving and multi-node rollout features. After installing verl and SGLang, simply add ``actor_rollout_ref.rollout.name=sglang`` at startup to seamlessly switch between the two inference frameworks.
+Currently, verl fully supports using SGLang as the inference engine during the rollout phase. As a rollout engine, SGLang provides the same feature coverage as vLLM., including memory saving and multi-node rollout features. After installing verl and SGLang, simply add ``actor_rollout_ref.rollout.name=sglang`` at startup script to seamlessly switch between the two inference frameworks.

 In addition, the SGLang team is actively working on supporting features such as Multi-Turn Agentic RL, VLM RLHF, Server-Based RLHF, and Partial Rollout. You can track the related development progress in the `Tracking Roadmap <https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/issues/74>`_.

 Installation
 ------------
-First, follow the requirements outlined in `Install SGLang as rollout backend <https://verl.readthedocs.io/en/latest/start/install.html#install-sglang-as-rollout-backend>`_ for installation, and ensure that the version requirements are met. Generally, using the latest `SGLang <https://github.com/sgl-project/sglang>`_ from the main branch will allow stable training startup without needing to target a specific version.
+Please always follow the following command to install SGLang with verl.

 .. code-block:: bash
-
-   # Currently 0.4.5, subject to updates at any time, please refer to the latest version
-   pip install "sglang[all]>=0.4.5" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
+   pip install --upgrade pip
+   # Currently 0.4.6.post1, subject to updates at any time, please refer to the latest version specified in `setup.py`
+   pip install -e ".[sglang]"

 Using SGLang as the Inference Backend for PPO Training on a Single Machine
 -------------------------------------------------------------------------
```
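With this change the exact SGLang version is pinned by verl's `setup.py` extras rather than installed ad hoc. Below is a minimal sketch (not part of the commit) for checking that an environment ended up on the pinned release after running `pip install -e ".[sglang]"`, assuming the `0.4.6.post1` pin introduced here:

```python
# Sketch: verify the installed SGLang matches the version verl pins.
from importlib.metadata import PackageNotFoundError, version

EXPECTED = "0.4.6.post1"  # the pin from this commit's setup.py

try:
    installed = version("sglang")
except PackageNotFoundError:
    raise SystemExit("sglang not installed; run `pip install -e '.[sglang]'` in the verl repo")

if installed != EXPECTED:
    print(f"warning: sglang {installed} installed, but verl pins {EXPECTED}")
else:
    print(f"sglang {installed} matches the pin")
```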

recipe/dapo/src/main_dapo.py

Lines changed: 0 additions & 4 deletions

```diff
@@ -25,7 +25,6 @@

 def get_custom_reward_fn(config):
     import importlib.util
-    import os

     reward_fn_config = config.get("custom_reward_function") or {}
     file_path = reward_fn_config.get("path")
@@ -58,9 +57,6 @@ def main(config):


 def run_ppo(config) -> None:
-    # TODO(linjunrong.ocss884): this ENV is left for resolving SGLang conflict with ray devices
-    # isolation, will solve in the future
-    os.environ["ENSURE_CUDA_VISIBLE_DEVICES"] = os.environ.get("CUDA_VISIBLE_DEVICES", "")
     if not ray.is_initialized():
         # this is for local ray cluster
         ray.init(
```

setup.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -36,8 +36,8 @@
     "pybind11",
     "pylatexenc",
     "ray[default]>=2.10",
-    "tensordict<=0.6.2",
     "torchdata",
+    "tensordict<=0.6.2",
     "transformers",
     "wandb",
 ]
@@ -50,7 +50,7 @@
 VLLM_REQUIRES = ["tensordict<=0.6.2", "vllm<=0.8.3"]
 SGLANG_REQUIRES = [
     "tensordict<=0.6.2",
-    "sglang[all]==0.4.5.post3",
+    "sglang[srt,openai]==0.4.6.post1",
     "torch-memory-saver>=0.0.5",
 ]
```
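The second hunk narrows the SGLang extras from `all` to `srt,openai` and moves to the `0.4.6.post1` pin; per the PR description, the reordering above helps older pip releases resolve the dependency set. A small sketch (not from the repo) showing how the new requirement string parses, using the `packaging` library that pip builds on:

```python
# Sketch: parse the new pin with packaging to inspect its name, extras,
# and the exact version it admits.
from packaging.requirements import Requirement

req = Requirement('sglang[srt,openai]==0.4.6.post1')
print(req.name)                               # sglang
print(sorted(req.extras))                     # ['openai', 'srt']
print(req.specifier.contains("0.4.6.post1"))  # True
print(req.specifier.contains("0.4.5.post3"))  # False
```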

verl/trainer/main_ppo.py

Lines changed: 0 additions & 3 deletions

```diff
@@ -65,9 +65,6 @@ def main(config):


 def run_ppo(config) -> None:
-    # TODO(linjunrong.ocss884): this ENV is left for resolving SGLang conflict with ray devices
-    # isolation, will solve in the future
-    os.environ["ENSURE_CUDA_VISIBLE_DEVICES"] = os.environ.get("CUDA_VISIBLE_DEVICES", "")
     if not ray.is_initialized():
         # this is for local ray cluster
         ray.init(
```

verl/workers/rollout/sglang_rollout/sglang_rollout.py

Lines changed: 9 additions & 3 deletions

```diff
@@ -112,6 +112,7 @@ def __init__(
         """
         super().__init__()
         self.config = config
+        os.environ.setdefault("SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK", "true")

         assert not (not config.enforce_eager and config.free_cache_engine), "disable CUDA graph (enforce_eager = False) if free cache engine"
```
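This hunk implements the "disable SGLang memory imbalance check by default" item (sgl-project/sglang#5426). Because `os.environ.setdefault` writes the key only when it is absent, an explicit value exported by the user still takes precedence over the new default. A minimal sketch of that behavior (not part of the commit):

```python
# Minimal sketch: os.environ.setdefault only fills in a missing key,
# so a user's explicit setting of the variable is preserved.
import os

KEY = "SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK"

os.environ[KEY] = "false"          # user exported an explicit override
os.environ.setdefault(KEY, "true")
print(os.environ[KEY])             # "false" -- the override wins

del os.environ[KEY]                # variable absent this time
os.environ.setdefault(KEY, "true")
print(os.environ[KEY])             # "true" -- the default applies
```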

```diff
@@ -128,7 +129,6 @@ def __init__(
             tensor_model_parallel_size=tensor_parallel_size,
             num_tp_per_train_tp=num_tp_per_train_tp,
         )
-
         assert model_hf_config.max_position_embeddings >= config.prompt_length + config.response_length, "model context length should be greater than total sequence length"

         tp_size = tensor_parallel_size
@@ -144,6 +144,7 @@ def __init__(
         # device_mesh_device = init_device_mesh("cuda", **device_mesh_kwargs)

         # get tp_rank of this process in this tp group
+        rank = device_mesh_cpu.get_rank()
         tp_rank = device_mesh_cpu["tp"].get_local_rank()
         visible_devices = [None] * device_mesh_cpu.size(1)
         torch.distributed.all_gather_object(visible_devices, os.environ["CUDA_VISIBLE_DEVICES"], device_mesh_cpu.get_group("tp"))
```
```diff
@@ -178,7 +179,10 @@ def __init__(
             load_format=load_format,
             dist_init_addr=dist_init_addr,
             nnodes=nnodes,
-            # NOTE(Chenyang): if you want to debug the sglang engine
+            # NOTE(linjunrong): add rank to prevent SGLang generate same port inside PortArgs.init_new
+            # when random.seed is being set during training
+            port=30000 + rank,
+            # NOTE(Chenyang): if you want to debug the SGLang engine output
             # please set the following parameters
             # Otherwise, it will make the engine run too slow
             # log_level="INFO",
```
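This is the "add rank num to port" fix from the PR description. When training code calls `random.seed(...)`, every process on a node draws the same "random" numbers, so a port picked through the seeded RNG collides across ranks. A sketch of the failure mode and the rank-offset remedy (illustrative only; `pick_port` is a stand-in, not SGLang's actual `PortArgs.init_new`):

```python
# Illustrative sketch of the collision: a globally seeded RNG makes every
# rank's "random" port identical, so co-located engines fight over one socket.
import random

def pick_port(seed: int) -> int:
    random.seed(seed)                   # training code seeds the global RNG
    return random.randint(30000, 40000)

print(pick_port(42), pick_port(42))     # same value twice -> bind conflict

# Remedy used by the commit: offset a fixed base port by the process rank,
# which is unique per process and independent of the RNG.
def pick_port_with_rank(base: int, rank: int) -> int:
    return base + rank

print(pick_port_with_rank(30000, 0), pick_port_with_rank(30000, 1))  # 30000 30001
```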
```diff
@@ -320,6 +324,8 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:
         batch_size = batch_size * self.sampling_params["n"]
         if "multi_modal_inputs" in non_tensor_batch.keys():
             non_tensor_batch["multi_modal_inputs"] = np.repeat(non_tensor_batch["multi_modal_inputs"], self.sampling_params["n"], axis=0)
+        if "tools_kwargs" in non_tensor_batch.keys():
+            non_tensor_batch["tools_kwargs"] = np.repeat(non_tensor_batch["tools_kwargs"], self.sampling_params["n"], axis=0)
         seq = torch.cat([idx, response], dim=-1)

         response_length = response.size(1)
```
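This mirrors the `multi_modal_inputs` branch just above it and fixes volcengine#1380: when each prompt is sampled `n` times, every per-prompt entry in the non-tensor batch must also be repeated `n` times, or its length no longer matches the expanded batch. A standalone sketch with made-up data:

```python
# Minimal sketch (illustrative data): np.repeat keeps per-prompt kwargs
# aligned with a batch that was expanded n-fold by multi-sample rollout.
import numpy as np

n = 3  # sampling_params["n"]: samples drawn per prompt
tools_kwargs = np.array([{"tool": "calc"}, {"tool": "search"}], dtype=object)

expanded = np.repeat(tools_kwargs, n, axis=0)
print(len(expanded))   # 6 == 2 prompts * n, matching batch_size * n
print(expanded[:n])    # the first prompt's kwargs, once per sample
```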
```diff
@@ -350,6 +356,6 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:

         # free cache engine
         if self.config.free_cache_engine and self.inference_engine._engine is not None and self.inference_engine._engine.tokenizer_manager is not None:
-            self.inference_engine._engine.tokenizer_manager.flush_cache()
+            self.inference_engine._engine.flush_cache()

         return DataProto(batch=batch, non_tensor_batch=non_tensor_batch)
```
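This is the "`flush_cache` was never awaited" fix. Assuming (per the PR description) that `TokenizerManager.flush_cache` is a coroutine function, calling it without `await` merely created a coroutine object and the cache was never flushed; routing the call through the engine avoids the dangling coroutine. A generic sketch of the pitfall (not SGLang code):

```python
# Generic asyncio pitfall: calling an async method without awaiting it
# returns a coroutine object and performs no work.
import asyncio

class TokenizerManagerLike:
    def __init__(self) -> None:
        self.flushed = False

    async def flush_cache(self) -> None:  # must be awaited to run
        self.flushed = True

tm = TokenizerManagerLike()
coro = tm.flush_cache()        # old call pattern: coroutine created, never run
print(tm.flushed)              # False -- nothing happened
coro.close()                   # silence the "never awaited" warning

asyncio.run(tm.flush_cache())  # a sync wrapper must actually drive the coroutine
print(tm.flushed)              # True
```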
