CK mha bwd: add sink attention score gradient support #1788
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Atom Test: builds an image from the ATOM base image with the Aiter commit
# under test installed, launches the ATOM server for each matrix model, and
# gates on the GSM8K "exact_match,flexible-extract" accuracy score.
name: Atom Test

on:
  push:
    branches: [main]
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review, labeled]
    branches: [main]
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  workflow_dispatch:

# One active run per workflow/ref; superseded runs are cancelled everywhere
# except on main.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  ATOM_BRANCH: "main"
  ATOM_REPOSITORY_URL: "ROCm/ATOM"
  BASE_IMAGE: "rocm/atom-dev:latest"
  # Repository and commit that get cloned into the test image. For pull
  # requests this is the PR head (which may live in a fork).
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/Aiter.git' }}
  # github.sha is the last fallback so that workflow_dispatch runs (which have
  # neither a pull_request nor a head_commit payload) still check out a
  # concrete commit instead of running `git checkout` with an empty argument.
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}

jobs:
  # Gate job: on pull requests it only runs for non-draft PRs labelled
  # `ci:atom` or `ci:all`; on every other event it always runs.
  check-signal:
    if: >-
      github.event_name != 'pull_request' ||
      (github.event.pull_request.draft == false &&
      (contains(github.event.pull_request.labels.*.name, 'ci:atom') ||
      contains(github.event.pull_request.labels.*.name, 'ci:all')))
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Download and check signal artifact
        run: ./.github/scripts/check_signal.sh
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_SHA: ${{ github.sha }}

  atom_benchmark:
    # Same label/draft gate as check-signal.
    if: >-
      github.event_name != 'pull_request' ||
      (github.event.pull_request.draft == false &&
      (contains(github.event.pull_request.labels.*.name, 'ci:atom') ||
      contains(github.event.pull_request.labels.*.name, 'ci:all')))
    needs: [check-signal]
    name: ATOM Benchmark
    strategy:
      fail-fast: false
      matrix:
        include:
          # run_on_pr: true  = steps run on every event
          # run_on_pr: false = steps are skipped on pull_request events only
          - model_name: "DeepSeek-R1-0528"
            label: MI325
            model_path: "deepseek-ai/DeepSeek-R1-0528"
            extraArgs: "--kv_cache_dtype fp8 -tp 8"
            env_vars: ""
            accuracy_test_threshold: "0.94"
            runner: aiter-8gpu-runner
            run_on_pr: true
          - model_name: "gpt-oss-120b"
            label: MI355
            model_path: "openai/gpt-oss-120b"
            extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
            env_vars: |
              ATOM_GPT_OSS_MODEL=1
            accuracy_test_threshold: "0.38"
            runner: linux-aiter-mi355-1
            run_on_pr: true
    runs-on: ${{ matrix.runner }}
    steps:
      # Checks out the ATOM repository (not this one) into the workspace.
      - name: Checkout code
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        uses: actions/checkout@v4
        with:
          repository: ${{ env.ATOM_REPOSITORY_URL }}
          # actions/checkout selects the branch/tag/SHA through `ref`; it has
          # no `branch` input, so the previous key was ignored.
          ref: ${{ env.ATOM_BRANCH }}

      - name: Download the ATOM base image
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          docker pull ${{ env.BASE_IMAGE }}

      # Writes Dockerfile.mod: replaces the amd-aiter package shipped in the
      # base image with a development install of the commit under test.
      # The heredoc delimiter is unquoted, so each `\\` below is written to
      # the Dockerfile as a single `\` line continuation.
      # NOTE(review): GPU_ARCHS is fixed to gfx950 for every matrix entry,
      # including the one labelled MI325 - confirm this is intended.
      - name: Generate Dockerfile
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          cat <<EOF > Dockerfile.mod
          FROM ${{ env.BASE_IMAGE }}
          RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
          RUN pip uninstall -y amd-aiter
          RUN pip install --upgrade "pybind11>=3.0.1"
          RUN pip show pybind11
          RUN rm -rf /app/aiter-test
          RUN git clone ${{ env.GITHUB_REPO_URL }} /app/aiter-test && \\
              cd /app/aiter-test && \\
              git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
              git submodule sync && git submodule update --init --recursive && \\
              MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
          RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
          EOF

      - name: Build the ATOM test image
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          docker build --network=host \
            --no-cache \
            -t rocm/aiter-ci:atom-test \
            -f Dockerfile.mod .

      - name: Start CI container
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          echo "Clean up containers..."
          # The filter must match the name passed to `docker run --name`
          # below; otherwise a leftover container makes `docker run` fail
          # with a name conflict.
          docker ps -aq -f name=atom_aiter_test | xargs -r docker stop | xargs -r docker rm

          # Use the device flags published for this runner when the file
          # exists, otherwise expose /dev/dri.
          if [ -f "/etc/podinfo/gha-render-devices" ]; then
            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
          else
            DEVICE_FLAG="--device /dev/dri"
          fi

          # Per-model environment variables from the matrix entry.
          cat > /tmp/env_file.txt << 'EOF'
          ${{ matrix.env_vars }}
          EOF

          echo "Starting container: rocm/aiter-ci:atom-test"
          echo "Model-specific environment variables for ${{ matrix.model_name }}:"
          cat /tmp/env_file.txt

          # HF_TOKEN from the runner environment wins; the repository secret
          # (provided through the step env, not pasted into the script text)
          # is the fallback.
          docker run -dt --device=/dev/kfd $DEVICE_FLAG \
            --ipc=host --group-add video \
            --shm-size=16G \
            --privileged \
            --cap-add=SYS_PTRACE \
            --env-file /tmp/env_file.txt \
            --security-opt seccomp=unconfined \
            --ulimit memlock=-1 \
            --ulimit stack=67108864 \
            -e ATOM_DISABLE_MMAP=true \
            -e HF_TOKEN="${HF_TOKEN:-$HF_TOKEN_TEST}" \
            -v "${{ github.workspace }}:/workspace" \
            -w /workspace \
            --name atom_aiter_test \
            rocm/aiter-ci:atom-test
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
          HF_TOKEN_TEST: ${{ secrets.HF_TOKEN_TEST }}

      - name: Check shm size
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          df -h # for testing
          df -h /dev/shm
          docker exec atom_aiter_test df -h /dev/shm

      - name: Check version
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          docker exec atom_aiter_test bash -lc "pip show amd-aiter atom && pip list"

      # Only the DeepSeek-R1-0528 entry pre-downloads its weights.
      # NOTE(review): the /run existence check runs on the runner host while
      # the download runs inside the container, and /run is not bind-mounted
      # by the `docker run` above - confirm the two paths are actually shared.
      - name: Download Models
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.model_name == 'DeepSeek-R1-0528'
        run: |
          if [ -d "/run" ]; then
            echo "/run directory found, downloading model to /run/${{ matrix.model_path }}"
            if ! docker exec atom_aiter_test bash -lc "hf download ${{ matrix.model_path }} --local-dir /run/${{ matrix.model_path }}"; then
              echo "Model download failed for '${{ matrix.model_path }}'. Aborting."
              exit 1
            fi
          else
            echo "/run directory not found, skipping model download"
          fi

      # Launches the server, then runs the accuracy script; its combined
      # stdout/stderr is captured in atom_accuracy_output.txt for later steps.
      - name: Run ATOM accuracy test
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        timeout-minutes: 60
        run: |
          set -euo pipefail
          echo ""
          echo "========== Launching ATOM server =========="
          # Prefer a local copy under /run when present, else pass the model id.
          if [ -d "/run/${{ matrix.model_path }}" ]; then
            model_path="/run/${{ matrix.model_path }}"
          else
            model_path="${{ matrix.model_path }}"
          fi
          docker exec atom_aiter_test bash -lc "
          .github/scripts/atom_test.sh launch $model_path ${{ matrix.extraArgs }}
          "
          echo ""
          echo "========== Running accuracy test =========="
          docker exec atom_aiter_test bash -lc "
          .github/scripts/atom_test.sh accuracy $model_path
          " 2>&1 | tee atom_accuracy_output.txt

      # Exit codes: 2 = results file or metric missing/invalid,
      #             1 = score below threshold, 0 = pass.
      - name: Check accuracy test results
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && success()
        run: |
          # Newest results file wins.
          result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1)
          if [ -z "$result_file" ] || [ ! -f "$result_file" ]; then
            echo "ERROR: No results JSON file found in accuracy_test_results/"
            exit 2
          fi
          echo "RESULT_FILE: $result_file"

          # `numbers` drops anything that is not a JSON number and `-e` turns
          # "no output" into a non-zero exit. Without this, a missing metric
          # prints "null", which awk compares to the threshold as a string
          # and reports as a pass.
          if ! flexible_extract_value=$(jq -er '.results.gsm8k["exact_match,flexible-extract"] | numbers' "$result_file"); then
            echo "ERROR: exact_match,flexible-extract for gsm8k is missing or not a number in $result_file"
            exit 2
          fi
          echo "Flexible extract value: $flexible_extract_value"
          echo "Accuracy test threshold: ${{ matrix.accuracy_test_threshold }}"

          # `+ 0` forces a numeric comparison in awk.
          result=$(awk -v val="$flexible_extract_value" -v threshold="${{ matrix.accuracy_test_threshold }}" 'BEGIN {print ((val + 0 < threshold + 0) ? 1 : 0)}')
          if [ "$result" -eq 1 ]; then
            echo "Accuracy test failed: Flexible extract value $flexible_extract_value is less than the threshold ${{ matrix.accuracy_test_threshold }}."
            exit 1
          else
            echo "Accuracy test passed: Flexible extract value $flexible_extract_value is greater than or equal to the threshold ${{ matrix.accuracy_test_threshold }}."
            exit 0
          fi

      # Copies the results table (from the "|Tasks|Version|" header line up to
      # the next blank line) from the captured output into the job summary.
      - name: Collect Test Summary
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && success()
        run: |
          echo "Accuracy Test Summary for ${{ matrix.model_name }}:" >> $GITHUB_STEP_SUMMARY
          awk '/\|Tasks\|Version\|/,/^$/ { if (NF > 0) print }' atom_accuracy_output.txt >> $GITHUB_STEP_SUMMARY

      - name: Upload output
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.model_name }}_atom_accuracy_output.txt
          path: atom_accuracy_output.txt

      # Runs even after failures so the named container never outlives the job.
      - name: Clean Up
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && always()
        run: |
          docker stop atom_aiter_test || true
          docker rm atom_aiter_test || true