Skip to content

Commit cf205db

Browse files
committed
Try to make CI more stable.
Signed-off-by: Jonas Yang <joyang@nvidia.com>
1 parent add6fe5 commit cf205db

4 files changed

Lines changed: 23 additions & 2 deletions

File tree

.github/workflows/e2e_ppo_trainer_megatron_sglang.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,12 +131,16 @@ jobs:
131131
run: |
132132
ray stop --force
133133
OPTIM_MEMORY_EFFICIENT=True ENGINE=sglang SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/run_ppo_trainer_megatron.sh
134+
with:
135+
max_attempts: 3
134136
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
135137
run: |
136138
ray stop --force
137139
export VLLM_USE_V1=1
138140
ray start --head
139141
ENGINE=sglang MODE=async RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/special_e2e/run_ppo_trainer_megatron.sh
142+
with:
143+
max_attempts: 3
140144
- name: Profiling GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek)
141145
run: |
142146
ray stop --force
@@ -147,6 +151,8 @@ jobs:
147151
else
148152
echo "[SUCCESS] profile success"
149153
fi
154+
with:
155+
max_attempts: 3
150156
- name: clean up
151157
run: |
152158
rm -rf checkpoints

.github/workflows/e2e_ppo_trainer_megatron_vllm.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,11 +153,15 @@ jobs:
153153
ray stop --force
154154
ALL_OFFLOAD=True SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=4 LORA_RANK=8 COMMON_VPP=null COMMON_CP=1 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False USE_DIST_CKPT=False \
155155
bash tests/special_e2e/run_ppo_trainer_megatron.sh
156+
with:
157+
max_attempts: 3
156158
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use Megatron-Bridge LoRA e2e to pre-load and save (Deepseek)
157159
run: |
158160
ray stop --force
159161
RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 SAVE_FREQ=1 COMMON_PP=4 LORA_RANK=8 COMMON_VPP=null COMMON_CP=1 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False USE_DIST_CKPT=False \
160162
bash tests/special_e2e/run_ppo_trainer_megatron.sh
163+
with:
164+
max_attempts: 3
161165
- name: clean up
162166
run: |
163167
rm -rf checkpoints
@@ -189,11 +193,15 @@ jobs:
189193
run: |
190194
ray stop --force
191195
ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
196+
with:
197+
max_attempts: 3
192198
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with FP8 rollout
193199
run: |
194200
ray stop --force
195201
export VLLM_USE_V1=1
196202
ROLLOUT_QUANTIZATION=fp8 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
203+
with:
204+
max_attempts: 3
197205
- name: clean up
198206
run: |
199207
rm -rf checkpoints

.github/workflows/gpu_unit_tests.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ jobs:
8888
mode: "create"
8989
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
9090
mlp-image: "${{ env.IMAGE }}"
91+
max_attempts: 3
9192

9293
gpu_unit_tests:
9394
if: github.repository_owner == 'volcengine'
@@ -105,6 +106,8 @@ jobs:
105106
fetch-depth: 0
106107
- name: Install the current repository
107108
run: |
109+
pip3 list | grep cupy || true
110+
pip3 list | grep cupy | xargs pip3 uninstall -y || true
108111
pip3 install hf_transfer
109112
pip3 install -r requirements-test.txt
110113
pip3 install --no-deps -e .
@@ -114,6 +117,8 @@ jobs:
114117
- name: Run all GPU unit tests
115118
run: |
116119
pytest -s -x --ignore-glob="*test_special_*.py" --ignore-glob='*on_cpu.py' --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob='tests/special*' --ignore-glob="tests/experimental" --ignore-glob="tests/workers/reward_model" tests/
120+
with:
121+
max_attempts: 3
117122
- name: Testing LinearCrossEntropyTP Correctness, Computation Time and Memory Consumption
118123
run: |
119124
LOW_MEMORY=True torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/utils/test_special_linear_cross_entropy_tp.py

.github/workflows/sgl.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ jobs:
113113
fetch-depth: 0
114114
- name: Install the current repository
115115
run: |
116-
pip3 install hf_transfer fastmcp pytest-asyncio
116+
pip3 install hf_transfer fastmcp pytest-asyncio pytest-retry
117117
pip3 install -r requirements-test.txt
118118
pip3 install --no-deps -e .
119119
- name: Prepare gsm8k dataset
@@ -122,7 +122,9 @@ jobs:
122122
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
123123
- name: Test the latest SGLang Rollout async with agent loop
124124
run: |
125-
ROLLOUT_NAME=sglang pytest -svvv tests/experimental/agent_loop
125+
ROLLOUT_NAME=sglang pytest -svvv tests/experimental/agent_loop --retries 3 --retry-delay 5
126+
with:
127+
max_attempts: 3
126128

127129
cleanup:
128130
runs-on: ubuntu-latest

0 commit comments

Comments (0)