# Skip to content
#
# test: byoidc compatibility #2236
#
# test: byoidc compatibility
#
# test: byoidc compatibility #2236
#
# Workflow file for this run

# End-to-end (e2e) tests workflow for CodeFlare-SDK.
name: e2e

# Trigger: pull requests targeting the listed branches. Changes that only touch
# documentation-style files (see paths-ignore) do not start a run.
on:
  pull_request:
    branches:
      - main
      - "release-*"
      - ray-jobs-feature
    paths-ignore:
      - "docs/**"
      - "**.adoc"
      - "**.md"
      - "LICENSE"

# One active run per (PR head branch, workflow) pair; a newer push to the same
# branch cancels the run that is still in progress.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

# Workflow-wide versions, read by the "Deploy Kueue and KubeRay" step as
# ${KUEUE_VERSION} and ${KUBERAY_VERSION}.
env:
  KUEUE_VERSION: v0.13.4
  KUBERAY_VERSION: v1.4.2
jobs:
  # Single job: start a KinD cluster with NVIDIA GPU support, install Kueue and
  # KubeRay, run the SDK e2e tests under a restricted user, then collect logs.
  kubernetes:
    runs-on: gpu-t4-4-core
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Checked out into ./common; the local composite actions referenced below
      # (./common/github-actions/*) are loaded from this path.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: "project-codeflare/codeflare-common"
          ref: "main"
          path: "common"

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'  # caching pip dependencies

      - name: Setup NVidia GPU environment for KinD
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 1

      - name: Install NVidia GPU operator for KinD
        uses: ./common/github-actions/nvidia-gpu-operator

      # The `deploy` id is what the log-collection steps at the end of the job
      # test in their `if:` conditions.
      - name: Deploy Kueue and KubeRay
        id: deploy
        run: |
          # Install Kueue
          echo "Installing Kueue ${KUEUE_VERSION}..."
          kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager
          # Install KubeRay from opendatahub-io fork (has RHOAI features)
          echo "Installing KubeRay ${KUBERAY_VERSION} from opendatahub-io..."
          kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}"
          kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator
          # Create default Kueue resources for the tests
          echo "Creating Kueue resources..."
          kubectl apply -f - <<EOF
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: ResourceFlavor
          metadata:
            name: default-flavor
          ---
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: ClusterQueue
          metadata:
            name: cluster-queue
          spec:
            namespaceSelector: {}
            resourceGroups:
            - coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
              flavors:
              - name: default-flavor
                resources:
                - name: cpu
                  nominalQuota: 100
                - name: memory
                  nominalQuota: 100Gi
                - name: nvidia.com/gpu
                  nominalQuota: 10
          ---
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: LocalQueue
          metadata:
            name: local-queue
            namespace: default
            annotations:
              kueue.x-k8s.io/default-queue: "true"
          spec:
            clusterQueue: cluster-queue
          EOF

      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      # Each ClusterRole below is bound to sdk-user through a matching
      # ClusterRoleBinding. The last command switches the kubectl context to
      # sdk-user, so the following test step runs with these permissions only.
      - name: Configure RBAC for sdk user with limited permissions
        run: |
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs
          kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
          kubectl create clusterrole rayjob-status-reader --verb=get,list,patch,update --resource=rayjobs/status
          kubectl create clusterrolebinding sdk-user-rayjob-status-reader --clusterrole=rayjob-status-reader --user=sdk-user
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl config use-context sdk-user

      # Runs the GPU-marked KinD e2e tests. All pytest output is redirected to
      # pytest_output.log and printed by a later step.
      - name: Run e2e tests
        run: |
          # Strict mode first, so every command in this script is covered.
          set -euo pipefail
          # NOTE(review): TEMP_DIR is not set anywhere in this workflow file;
          # presumably an earlier composite action exports it - confirm.
          export CODEFLARE_TEST_OUTPUT_DIR="${{ env.TEMP_DIR }}"
          # Persist the directory for later steps, which read it from the env.
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> "$GITHUB_ENV"
          pip install poetry
          poetry install --with test,docs
          echo "Running e2e tests..."
          poetry run pytest -v -s ./tests/e2e/ -m 'kind and nvidia_gpu' > "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

      # Every step from here on runs even when the tests failed, provided the
      # deploy step itself succeeded.
      # The RBAC step left kubectl on the sdk-user context; switch back before
      # reading logs.
      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log"

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee "${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log"

      - name: Print Kueue controller logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Kueue controller logs"
          kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee "${CODEFLARE_TEST_OUTPUT_DIR}/kueue.log"

      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          # `with:` values are not processed by a shell, so the directory is
          # passed through the env context rather than as ${VAR}.
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log