Skip to content

Commit 6a8d7ca

Browse files
committed
Revert "Try to make CI more stable."
This reverts commit ff77ac6.
1 parent 9b57c40 commit 6a8d7ca

File tree

4 files changed

+2
-23
lines changed

4 files changed

+2
-23
lines changed

.github/workflows/e2e_ppo_trainer_megatron_sglang.yml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,16 +131,12 @@ jobs:
131131
run: |
132132
ray stop --force
133133
OPTIM_MEMORY_EFFICIENT=True ENGINE=sglang SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/run_ppo_trainer_megatron.sh
134-
with:
135-
max_attempts: 3
136134
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
137135
run: |
138136
ray stop --force
139137
export VLLM_USE_V1=1
140138
ray start --head
141139
ENGINE=sglang MODE=async RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/special_e2e/run_ppo_trainer_megatron.sh
142-
with:
143-
max_attempts: 3
144140
- name: Profiling GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek)
145141
run: |
146142
ray stop --force
@@ -151,8 +147,6 @@ jobs:
151147
else
152148
echo "[SUCCESS] profile success"
153149
fi
154-
with:
155-
max_attempts: 3
156150
- name: clean up
157151
run: |
158152
rm -rf checkpoints

.github/workflows/e2e_ppo_trainer_megatron_vllm.yml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -153,15 +153,11 @@ jobs:
153153
ray stop --force
154154
ALL_OFFLOAD=True SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=4 LORA_RANK=8 COMMON_VPP=null COMMON_CP=1 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False USE_DIST_CKPT=False \
155155
bash tests/special_e2e/run_ppo_trainer_megatron.sh
156-
with:
157-
max_attempts: 3
158156
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use Megatron-Bridge LoRA e2e to pre-load and save (Deepseek)
159157
run: |
160158
ray stop --force
161159
RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 SAVE_FREQ=1 COMMON_PP=4 LORA_RANK=8 COMMON_VPP=null COMMON_CP=1 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False USE_DIST_CKPT=False \
162160
bash tests/special_e2e/run_ppo_trainer_megatron.sh
163-
with:
164-
max_attempts: 3
165161
- name: clean up
166162
run: |
167163
rm -rf checkpoints
@@ -193,15 +189,11 @@ jobs:
193189
run: |
194190
ray stop --force
195191
ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
196-
with:
197-
max_attempts: 3
198192
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with FP8 rollout
199193
run: |
200194
ray stop --force
201195
export VLLM_USE_V1=1
202196
ROLLOUT_QUANTIZATION=fp8 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
203-
with:
204-
max_attempts: 3
205197
- name: clean up
206198
run: |
207199
rm -rf checkpoints

.github/workflows/gpu_unit_tests.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ jobs:
8888
mode: "create"
8989
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
9090
mlp-image: "${{ env.IMAGE }}"
91-
max_attempts: 3
9291

9392
gpu_unit_tests:
9493
if: github.repository_owner == 'volcengine'
@@ -106,8 +105,6 @@ jobs:
106105
fetch-depth: 0
107106
- name: Install the current repository
108107
run: |
109-
pip3 list | grep cupy || true
110-
pip3 list | grep cupy | xargs pip3 uninstall -y || true
111108
pip3 install hf_transfer
112109
pip3 install -r requirements-test.txt
113110
pip3 install --no-deps -e .
@@ -117,8 +114,6 @@ jobs:
117114
- name: Run all GPU unit tests
118115
run: |
119116
pytest -s -x --ignore-glob="*test_special_*.py" --ignore-glob='*on_cpu.py' --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob='tests/special*' --ignore-glob="tests/experimental" --ignore-glob="tests/workers/reward_model" tests/
120-
with:
121-
max_attempts: 3
122117
- name: Testing LinearCrossEntropyTP Correctness, Computation Time and Memory Consumption
123118
run: |
124119
LOW_MEMORY=True torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/utils/test_special_linear_cross_entropy_tp.py

.github/workflows/sgl.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ jobs:
113113
fetch-depth: 0
114114
- name: Install the current repository
115115
run: |
116-
pip3 install hf_transfer fastmcp pytest-asyncio pytest-retry
116+
pip3 install hf_transfer fastmcp pytest-asyncio
117117
pip3 install -r requirements-test.txt
118118
pip3 install --no-deps -e .
119119
- name: Prepare gsm8k dataset
@@ -122,9 +122,7 @@ jobs:
122122
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
123123
- name: Test the latest SGLang Rollout async with agent loop
124124
run: |
125-
ROLLOUT_NAME=sglang pytest -svvv tests/experimental/agent_loop --retries 3 --retry-delay 5
126-
with:
127-
max_attempts: 3
125+
ROLLOUT_NAME=sglang pytest -svvv tests/experimental/agent_loop
128126
129127
cleanup:
130128
runs-on: ubuntu-latest

0 commit comments

Comments (0)