Integrate MaxText CI with Codecov

shralex · shralex · commit 9c766f50aefe · 2025-12-22T03:37:35.000Z
Integrates Codecov using a two-flag scheme (regular, scheduled) and carryforward logic to accurately track coverage across tiered test suites.

Adds codecov.yml to enable carryforward for tests skipped in PRs (scheduled_only).
Updates test workflows to generate coverage reports via pytest-cov and upload results with conditional flags.
Sets Project coverage to track the full scheduled baseline and Patch coverage to evaluate new code against regular PR tests.
diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
@@ -0,0 +1,64 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# MaxText Codecov Configuration
+# 
+# We use a two-flag scheme ('regular' and 'scheduled') to handle our tiered test suite.
+# 'carryforward' is enabled because Pull Requests only run a subset of tests (excluding 'scheduled_only').
+# Without it, PRs would show a significant coverage drop as they would 'overwrite' the full-suite results.
+#
+# Scheme:
+# - 'regular': Updated by every PR/Schedule. Used to evaluate 'patch' (new code) coverage.
+# - 'scheduled': Updated ONLY by scheduled full runs. Used to anchor 'project' (total health) coverage.
+# During PRs, the 'scheduled' flag is carried forward from the last full run on 'main' to keep the score stable.
+
+# Exclude non-source code, deprecated and experimental folders from coverage tracking
+ignore:
+  - "src/MaxText/assets"
+  - "src/MaxText/configs"
+  - "src/MaxText/examples"
+  - "src/MaxText/experimental"
+  - "src/MaxText/inference"
+  - "src/MaxText/inference_mlperf"
+  - "src/MaxText/scratch_code"
+  - "src/MaxText/test_assets"
+
+
+flags:
+  # Updated on every PR and during every scheduled run (contains a subset of tests).
+  regular:
+    carryforward: true
+  # Updated ONLY during scheduled runs (contains all tests).
+  scheduled:
+    carryforward: true
+
+coverage:
+  status:
+    # Project score remains stable at the 'Full Suite' level.
+    # It carries forward the last 'scheduled' results during PRs.
+    project:
+      default:
+        target: auto
+        threshold: 5% # fail on 5+ percent degradation
+        flags:
+          - scheduled
+
+    # Patch score provides feedback on the code changed in a PR.
+    patch:
+      default:
+        target: auto
+        threshold: 5% # fail on 5+ percent degradation
+        flags:
+          - regular
+
diff --git a/.github/workflows/run_pathways_tests.yml b/.github/workflows/run_pathways_tests.yml
@@ -79,6 +79,7 @@ jobs:
           source .venv/bin/activate
           maxtext_wheel=$(ls maxtext-*-py3-none-any.whl 2>/dev/null)
           uv pip install ${maxtext_wheel}[tpu] --resolution=lowest
+          uv pip install pytest-cov
           uv pip install -r src/install_maxtext_extra_deps/extra_deps_from_github.txt
           python3 --version
           python3 -m pip freeze
@@ -96,8 +97,21 @@ jobs:
           export MAXTEXT_TEST_ASSETS_ROOT=$(pwd)/src/MaxText/test_assets
           export MAXTEXT_PKG_DIR=$(pwd)/src/MaxText
           # TODO(b/454659463): Enable test_default_hlo_match after volume mount is supported.
-          .venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" -k "not AotHloIdenticalTest and not CompileThenLoad" --durations=0
-    
+          .venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} \
+            -v \
+            -m "${FINAL_PYTEST_MARKER}" \
+            -k "not AotHloIdenticalTest and not CompileThenLoad" \
+            --durations=0 \
+            --cov=src/MaxText \
+            --cov-report=xml
+      - name: Upload results to Codecov
+        uses: codecov/codecov-action@v5
+        continue-on-error: true
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          # If scheduled, upload to BOTH flags. If PR, upload ONLY to regular.
+          flags: ${{ inputs.is_scheduled_run == 'true' && 'regular,scheduled' || 'regular' }}
+
     services:
       resource_manager:
         image: us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server:latest
diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml
@@ -88,6 +88,7 @@ jobs:
           uv pip install -r src/install_maxtext_extra_deps/extra_deps_from_github.txt
           python3 --version
           python3 -m pip freeze
+          uv pip install pytest-cov
       - name: Copy test assets files
         run : gcloud storage cp gs://maxtext-test-assets/* src/MaxText/test_assets
       - name: Run Tests
@@ -107,6 +108,25 @@ jobs:
           if [ "${{ inputs.device_type }}" != "cuda12" ]; then
             export LIBTPU_INIT_ARGS='--xla_tpu_scoped_vmem_limit_kib=65536'
           fi
+          if [ "${{ inputs.total_workers }}" -gt 1 ]; then
+            .venv/bin/python3 -m pip install --quiet pytest-split
+            SPLIT_ARGS="--splits ${{ inputs.total_workers }} --group ${{ inputs.worker_group }}"
+          else
+            SPLIT_ARGS=""
+          fi
           # TODO: Fix the skipped tests and remove the deselect flags
-          [ "${{ inputs.total_workers }}" -gt 1 ] && .venv/bin/python3 -m pip install --quiet pytest-split && SPLIT_ARGS="--splits ${{ inputs.total_workers }} --group ${{ inputs.worker_group }}" || SPLIT_ARGS=""
-          .venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" --durations=0 --deselect "tests/tokenizer_test.py::TokenizerTest::test_detokenize" $SPLIT_ARGS
+          .venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} \
+            -v \
+            -m "${FINAL_PYTEST_MARKER}" \
+            --durations=0 \
+            --deselect "tests/tokenizer_test.py::TokenizerTest::test_detokenize" \
+            --cov=src/MaxText \
+            --cov-report=xml \
+            $SPLIT_ARGS
+      - name: Upload results to Codecov
+        uses: codecov/codecov-action@v5
+        continue-on-error: true
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          # If scheduled, upload to BOTH flags. If PR, upload ONLY to regular.
+          flags: ${{ inputs.is_scheduled_run == 'true' && 'regular,scheduled' || 'regular' }}
diff --git a/tests/integration_tests/checkpoint_compatibility_test.py b/tests/integration_tests/checkpoint_compatibility_test.py
@@ -82,6 +82,7 @@ def run_checkpoint_compatibility(hardware, attention_type):
 
 @pytest.mark.integration_test
 @pytest.mark.tpu_only
+@pytest.mark.skip(reason="Flaky test b/470704234")
 def test_autoselected_attention():
   run_checkpoint_compatibility("tpu", "autoselected")