Skip to content

refactor: use ctypes binding #1797

refactor: use ctypes binding

refactor: use ctypes binding #1797

Workflow file for this run

name: Atom Test

# Triggers:
#   - pushes to main
#   - pull requests targeting main (including label changes and the
#     draft -> ready transition), unless only ignored paths changed
#   - manual dispatch
on:
  push:
    branches:
      - main
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - ready_for_review
      - labeled
    branches:
      - main
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  workflow_dispatch:
# Runs are grouped per workflow and ref. A newer run cancels the one in
# progress for the same group, except when the ref is the main branch.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Workflow-wide environment shared by every job.
env:
  # ATOM repository and branch used by the benchmark job's checkout step.
  ATOM_BRANCH: "main"
  ATOM_REPOSITORY_URL: "ROCm/ATOM"
  # Image the generated Dockerfile builds on.
  BASE_IMAGE: "rocm/atom-dev:latest"
  # Clone URL of the pull request's head repository; falls back to the
  # upstream URL when the event carries no pull request.
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/Aiter.git' }}
  # Commit to build: the pull request head, else the pushed head commit,
  # else github.sha. The last fallback keeps the value non-empty for events
  # whose payload has neither field (e.g. a manual dispatch), so the
  # Dockerfile's `git checkout` always receives a revision.
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}
jobs:
  # Gate job: runs the repository's check_signal.sh script. Skipped for
  # pull requests that are drafts or lack the 'ci:atom' / 'ci:all' label.
  check-signal:
    if: >-
      github.event_name != 'pull_request' ||
      (github.event.pull_request.draft == false &&
      (contains(github.event.pull_request.labels.*.name, 'ci:atom') ||
      contains(github.event.pull_request.labels.*.name, 'ci:all')))
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Download and check signal artifact
        run: ./.github/scripts/check_signal.sh
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_SHA: ${{ github.sha }}

  # Builds an image with the commit under test installed on top of the ATOM
  # base image, launches the ATOM server for each matrix model and checks the
  # gsm8k flexible-extract score against the per-model threshold.
  atom_benchmark:
    if: >-
      github.event_name != 'pull_request' ||
      (github.event.pull_request.draft == false &&
      (contains(github.event.pull_request.labels.*.name, 'ci:atom') ||
      contains(github.event.pull_request.labels.*.name, 'ci:all')))
    needs: [check-signal]
    name: ATOM Benchmark
    strategy:
      fail-fast: false
      matrix:
        include:
          # run_on_pr: true  = run on every event
          # run_on_pr: false = skip on pull_request events (still runs on
          #                    every other event that triggers this workflow)
          - model_name: "DeepSeek-R1-0528"
            label: MI325
            model_path: "deepseek-ai/DeepSeek-R1-0528"
            extraArgs: "--kv_cache_dtype fp8 -tp 8"
            env_vars: ""
            accuracy_test_threshold: "0.94"
            runner: aiter-8gpu-runner
            run_on_pr: true
          - model_name: "gpt-oss-120b"
            label: MI355
            model_path: "openai/gpt-oss-120b"
            extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
            env_vars: |
              ATOM_GPT_OSS_MODEL=1
            accuracy_test_threshold: "0.38"
            runner: linux-aiter-mi355-1
            run_on_pr: true
    runs-on: ${{ matrix.runner }}
    steps:
      # Every step repeats the run_on_pr guard because the matrix context is
      # consulted per step here rather than in the job-level `if`.
      - name: Checkout code
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        uses: actions/checkout@v4
        with:
          repository: ${{ env.ATOM_REPOSITORY_URL }}
          # actions/checkout selects the revision through `ref`; it has no
          # `branch` input, so the previous key never reached the action.
          ref: ${{ env.ATOM_BRANCH }}

      - name: Download the ATOM base image
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          docker pull ${{ env.BASE_IMAGE }}

      # The heredoc delimiter is unquoted, so the shell turns each `\\` into a
      # single `\` line continuation inside the generated Dockerfile.
      # NOTE(review): GPU_ARCHS is fixed to gfx950 for every matrix entry,
      # including the one labelled MI325 - confirm that is intended.
      - name: Generate Dockerfile
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          cat <<EOF > Dockerfile.mod
          FROM ${{ env.BASE_IMAGE }}
          RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
          RUN pip uninstall -y amd-aiter
          RUN pip install --upgrade "pybind11>=3.0.1"
          RUN pip show pybind11
          RUN rm -rf /app/aiter-test
          RUN git clone ${{ env.GITHUB_REPO_URL }} /app/aiter-test && \\
              cd /app/aiter-test && \\
              git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
              git submodule sync && git submodule update --init --recursive && \\
              MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
          RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
          EOF

      - name: Build the ATOM test image
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          docker build --network=host \
            --no-cache \
            -t rocm/aiter-ci:atom-test \
            -f Dockerfile.mod .

      - name: Start CI container
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          echo "Clean up containers..."
          # The filter must match the name given to `docker run` below;
          # otherwise a container left behind by an earlier run survives and
          # the new `--name atom_aiter_test` collides with it.
          docker ps -aq -f name=atom_aiter_test | xargs -r docker stop | xargs -r docker rm
          # Use the device flags published by the runner when present,
          # otherwise expose /dev/dri.
          if [ -f "/etc/podinfo/gha-render-devices" ]; then
            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
          else
            DEVICE_FLAG="--device /dev/dri"
          fi
          # Model-specific variables from the matrix, passed via --env-file.
          cat > /tmp/env_file.txt << 'EOF'
          ${{ matrix.env_vars }}
          EOF
          echo "Starting container: rocm/aiter-ci:atom-test"
          echo "Model-specific environment variables for ${{ matrix.model_name }}:"
          cat /tmp/env_file.txt
          docker run -dt --device=/dev/kfd $DEVICE_FLAG \
            --ipc=host --group-add video \
            --shm-size=16G \
            --privileged \
            --cap-add=SYS_PTRACE \
            --env-file /tmp/env_file.txt \
            --security-opt seccomp=unconfined \
            --ulimit memlock=-1 \
            --ulimit stack=67108864 \
            -e ATOM_DISABLE_MMAP=true \
            -e HF_TOKEN="${HF_TOKEN:-${{ secrets.HF_TOKEN_TEST }}}" \
            -v "${{ github.workspace }}:/workspace" \
            -w /workspace \
            --name atom_aiter_test \
            rocm/aiter-ci:atom-test
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Check shm size
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          df -h # for testing
          df -h /dev/shm
          docker exec atom_aiter_test df -h /dev/shm

      - name: Check version
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          docker exec atom_aiter_test bash -lc "pip show amd-aiter atom && pip list"

      # NOTE(review): the /run test runs on the runner while the download runs
      # inside the container, and the `docker run` above mounts only the
      # workspace - confirm both refer to the same /run directory.
      - name: Download Models
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.model_name == 'DeepSeek-R1-0528'
        run: |
          if [ -d "/run" ]; then
            echo "/run directory found, downloading model to /run/${{ matrix.model_path }}"
            if ! docker exec atom_aiter_test bash -lc "hf download ${{ matrix.model_path }} --local-dir /run/${{ matrix.model_path }}"; then
              echo "Model download failed for '${{ matrix.model_path }}'. Aborting."
              exit 1
            fi
          else
            echo "/run directory not found, skipping model download"
          fi

      - name: Run ATOM accuracy test
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        timeout-minutes: 60
        run: |
          set -euo pipefail
          echo ""
          echo "========== Launching ATOM server =========="
          # Prefer a local copy under /run; otherwise pass the model id as is.
          if [ -d "/run/${{ matrix.model_path }}" ]; then
            model_path="/run/${{ matrix.model_path }}"
          else
            model_path="${{ matrix.model_path }}"
          fi
          docker exec atom_aiter_test bash -lc "
            .github/scripts/atom_test.sh launch $model_path ${{ matrix.extraArgs }}
          "
          echo ""
          echo "========== Running accuracy test =========="
          # Output is kept for the summary and artifact-upload steps.
          docker exec atom_aiter_test bash -lc "
            .github/scripts/atom_test.sh accuracy $model_path
          " 2>&1 | tee atom_accuracy_output.txt

      - name: Check accuracy test results
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && success()
        run: |
          # Newest results file wins.
          result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1)
          if [ -z "$result_file" ] || [ ! -f "$result_file" ]; then
            echo "ERROR: No results JSON file found in accuracy_test_results/"
            exit 2
          else
            echo "RESULT_FILE: $result_file"
          fi
          # `numbers` drops anything that is not a number and `-e` turns the
          # resulting empty output into a non-zero exit. Without this, a
          # missing metric printed `null`, which awk compared as a string and
          # reported as passing.
          if ! flexible_extract_value=$(jq -er '.results.gsm8k["exact_match,flexible-extract"] | numbers' "$result_file"); then
            echo "ERROR: gsm8k 'exact_match,flexible-extract' is missing or not numeric in $result_file"
            exit 2
          fi
          echo "Flexible extract value: $flexible_extract_value"
          echo "Accuracy test threshold: ${{ matrix.accuracy_test_threshold }}"
          result=$(awk -v val="$flexible_extract_value" -v threshold="${{ matrix.accuracy_test_threshold }}" 'BEGIN {print (val < threshold) ? 1 : 0}')
          if [ "$result" -eq 1 ]; then
            echo "Accuracy test failed: Flexible extract value $flexible_extract_value is less than the threshold ${{ matrix.accuracy_test_threshold }}."
            exit 1
          else
            echo "Accuracy test passed: Flexible extract value $flexible_extract_value is greater than or equal to the threshold ${{ matrix.accuracy_test_threshold }}."
            exit 0
          fi

      # Copies the results table (from the |Tasks|Version| header line up to
      # the next blank line) out of the captured output into the job summary.
      - name: Collect Test Summary
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && success()
        run: |
          echo "Accuracy Test Summary for ${{ matrix.model_name }}:" >> $GITHUB_STEP_SUMMARY
          awk '/\|Tasks\|Version\|/,/^$/ { if (NF > 0) print }' atom_accuracy_output.txt >> $GITHUB_STEP_SUMMARY

      - name: Upload output
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.model_name }}_atom_accuracy_output.txt
          path: atom_accuracy_output.txt

      # Runs even after failures so the named container does not linger.
      - name: Clean Up
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && always()
        run: |
          docker stop atom_aiter_test || true
          docker rm atom_aiter_test || true