# Skip to content
#
# test: byoidc compatibility #1290
#
# test: byoidc compatibility
#
# test: byoidc compatibility #1290

# Executes selected guided demo notebooks with papermill against a KinD
# cluster that has Kueue and KubeRay installed. Every job is gated on the
# pull request carrying the 'test-guided-notebooks' label.
name: Guided notebooks tests

on:
  pull_request:
    branches: [main]
    types: [labeled, synchronize]

# One active run per head branch and workflow; a newer run cancels the older.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

# Release tags consumed by the "Deploy Kueue and KubeRay" step of every job.
# NOTE(review): TEMP_DIR, used by the log steps below, is not defined in this
# file; presumably the KinD setup action exports it - confirm.
env:
  KUEUE_VERSION: v0.13.4
  KUBERAY_VERSION: v1.4.2

jobs:
  # Runs demo-notebooks/guided-demos/0_basic_ray.ipynb.
  verify-0_basic_ray:
    if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }}
    runs-on: ubuntu-latest-4core

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Checked out into ./common so its local actions can be referenced below.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip' # caching pip dependencies

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind

      # The step id is referenced by the log steps, which only run when this
      # deployment succeeded.
      - name: Deploy Kueue and KubeRay
        id: deploy
        run: |
          # Install Kueue
          echo "Installing Kueue ${KUEUE_VERSION}..."
          kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager
          # Install KubeRay from opendatahub-io fork (has RHOAI features)
          echo "Installing KubeRay ${KUBERAY_VERSION} from opendatahub-io..."
          kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}"
          kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator
          # Create default Kueue resources for the tests
          # (quoted heredoc delimiter: the manifest is passed to kubectl verbatim)
          echo "Creating Kueue resources..."
          kubectl apply -f - <<'EOF'
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: ResourceFlavor
          metadata:
            name: default-flavor
          ---
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: ClusterQueue
          metadata:
            name: cluster-queue
          spec:
            namespaceSelector: {}
            resourceGroups:
            - coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
              flavors:
              - name: default-flavor
                resources:
                - name: cpu
                  nominalQuota: 100
                - name: memory
                  nominalQuota: 100Gi
                - name: nvidia.com/gpu
                  nominalQuota: 10
          ---
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: LocalQueue
          metadata:
            name: local-queue
            namespace: default
            annotations:
              kueue.x-k8s.io/default-queue: "true"
          spec:
            clusterQueue: cluster-queue
          EOF

      - name: Setup Guided notebooks execution
        run: |
          echo "Installing papermill and dependencies..."
          pip install poetry papermill ipython ipykernel
          # Disable virtualenv creation due to problems using packages from a virtualenv in papermill
          poetry config virtualenvs.create false
          echo "Installing SDK..."
          poetry install --with test,docs

      # The notebook is patched in place before execution; see inline comments.
      - name: Run 0_basic_ray.ipynb
        run: |
          set -euo pipefail
          # Remove login/logout cells, as KinD doesn't support authentication using token
          jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
          jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
          # Set explicit namespace as the SDK needs it (currently) to resolve local queues
          sed -i "s/head_memory_limits=8,/head_memory_limits=8, namespace='default',/" 0_basic_ray.ipynb
          # Disable dashboard check as KinD doesn't have HTTPRoute/Route configured
          sed -i "s/cluster.wait_ready()/cluster.wait_ready(dashboard_check=False)/" 0_basic_ray.ipynb
          # Run notebook
          poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600
        working-directory: demo-notebooks/guided-demos

      # Log collection runs even when the notebook step failed (always()),
      # as long as the operators were deployed.
      - name: Print Kueue operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Kueue operator logs"
          KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
          kubectl logs -n kueue-system --tail -1 "${KUEUE_CONTROLLER_POD}" | tee "${TEMP_DIR}/kueue.log"

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee "${TEMP_DIR}/kuberay.log"

      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          # `with:` inputs are not shell-expanded, so read TEMP_DIR through the
          # env context (same as the upload step below).
          output-directory: ${{ env.TEMP_DIR }}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs-0_basic_ray
          retention-days: 10
          path: |
            ${{ env.TEMP_DIR }}/**/*.log

  # Runs demo-notebooks/guided-demos/4_rayjob_existing_cluster.ipynb.
  verify-4_rayjob_existing_cluster:
    if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }}
    runs-on: ubuntu-latest-4core

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Checked out into ./common so its local actions can be referenced below.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip' # caching pip dependencies

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind

      # The step id is referenced by the log steps, which only run when this
      # deployment succeeded.
      - name: Deploy Kueue and KubeRay
        id: deploy
        run: |
          # Install Kueue
          echo "Installing Kueue ${KUEUE_VERSION}..."
          kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager
          # Install KubeRay from opendatahub-io fork (has RHOAI features)
          echo "Installing KubeRay ${KUBERAY_VERSION} from opendatahub-io..."
          kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}"
          kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator
          # Create default Kueue resources for the tests
          # (quoted heredoc delimiter: the manifest is passed to kubectl verbatim)
          echo "Creating Kueue resources..."
          kubectl apply -f - <<'EOF'
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: ResourceFlavor
          metadata:
            name: default-flavor
          ---
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: ClusterQueue
          metadata:
            name: cluster-queue
          spec:
            namespaceSelector: {}
            resourceGroups:
            - coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
              flavors:
              - name: default-flavor
                resources:
                - name: cpu
                  nominalQuota: 100
                - name: memory
                  nominalQuota: 100Gi
                - name: nvidia.com/gpu
                  nominalQuota: 10
          ---
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: LocalQueue
          metadata:
            name: local-queue
            namespace: default
            annotations:
              kueue.x-k8s.io/default-queue: "true"
          spec:
            clusterQueue: cluster-queue
          EOF

      - name: Setup Guided notebooks execution
        run: |
          echo "Installing papermill and dependencies..."
          pip install poetry papermill ipython ipykernel
          # Disable virtualenv creation due to problems using packages from a virtualenv in papermill
          poetry config virtualenvs.create false
          echo "Installing SDK..."
          poetry install --with test,docs

      # The notebook is patched in place before execution; see inline comments.
      - name: Run 4_rayjob_existing_cluster.ipynb
        run: |
          set -euo pipefail
          # Remove oc login cell, as KinD doesn't support oc login
          jq -r 'del(.cells[] | select(.source[] | contains("oc login")))' 4_rayjob_existing_cluster.ipynb > 4_rayjob_existing_cluster.ipynb.tmp && mv 4_rayjob_existing_cluster.ipynb.tmp 4_rayjob_existing_cluster.ipynb
          # Remove GPU requests (KinD doesn't have GPUs)
          sed -i "s/head_extended_resource_requests={'nvidia.com\/gpu':1},/head_extended_resource_requests={'nvidia.com\/gpu':0},/" 4_rayjob_existing_cluster.ipynb
          sed -i "s/worker_extended_resource_requests={'nvidia.com\/gpu':1},/worker_extended_resource_requests={'nvidia.com\/gpu':0},/" 4_rayjob_existing_cluster.ipynb
          # Set explicit namespace for RayJob (notebook stores JSON with escaped quotes)
          sed -i 's/namespace=\\"your-namespace\\"/namespace=\\"default\\"/' 4_rayjob_existing_cluster.ipynb
          # Add namespace to ClusterConfiguration
          sed -i "s/head_memory_limits=8,/head_memory_limits=8, namespace='default',/" 4_rayjob_existing_cluster.ipynb
          # Run notebook
          poetry run papermill 4_rayjob_existing_cluster.ipynb 4_rayjob_existing_cluster_out.ipynb --log-output --execution-timeout 600
        working-directory: demo-notebooks/guided-demos

      # Log collection runs even when the notebook step failed (always()),
      # as long as the operators were deployed.
      - name: Print Kueue operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Kueue operator logs"
          KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
          kubectl logs -n kueue-system --tail -1 "${KUEUE_CONTROLLER_POD}" | tee "${TEMP_DIR}/kueue.log"

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee "${TEMP_DIR}/kuberay.log"

      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          # `with:` inputs are not shell-expanded, so read TEMP_DIR through the
          # env context (same as the upload step below).
          output-directory: ${{ env.TEMP_DIR }}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs-4_rayjob_existing_cluster
          retention-days: 10
          path: |
            ${{ env.TEMP_DIR }}/**/*.log

  # Runs demo-notebooks/guided-demos/5_submit_rayjob_cr.ipynb.
  verify-5_submit_rayjob_cr:
    if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }}
    runs-on: ubuntu-latest-4core

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Checked out into ./common so its local actions can be referenced below.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip' # caching pip dependencies

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind

      # The step id is referenced by the log steps, which only run when this
      # deployment succeeded.
      - name: Deploy Kueue and KubeRay
        id: deploy
        run: |
          # Install Kueue
          echo "Installing Kueue ${KUEUE_VERSION}..."
          kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager
          # Install KubeRay from opendatahub-io fork (has RHOAI features)
          echo "Installing KubeRay ${KUBERAY_VERSION} from opendatahub-io..."
          kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}"
          kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator
          # Create default Kueue resources for the tests
          # (quoted heredoc delimiter: the manifest is passed to kubectl verbatim)
          echo "Creating Kueue resources..."
          kubectl apply -f - <<'EOF'
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: ResourceFlavor
          metadata:
            name: default-flavor
          ---
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: ClusterQueue
          metadata:
            name: cluster-queue
          spec:
            namespaceSelector: {}
            resourceGroups:
            - coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
              flavors:
              - name: default-flavor
                resources:
                - name: cpu
                  nominalQuota: 100
                - name: memory
                  nominalQuota: 100Gi
                - name: nvidia.com/gpu
                  nominalQuota: 10
          ---
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: LocalQueue
          metadata:
            name: local-queue
            namespace: default
            annotations:
              kueue.x-k8s.io/default-queue: "true"
          spec:
            clusterQueue: cluster-queue
          EOF

      - name: Setup Guided notebooks execution
        run: |
          echo "Installing papermill and dependencies..."
          pip install poetry papermill ipython ipykernel
          # Disable virtualenv creation due to problems using packages from a virtualenv in papermill
          poetry config virtualenvs.create false
          echo "Installing SDK..."
          poetry install --with test,docs

      # The notebook is patched in place before execution; see inline comments.
      - name: Run 5_submit_rayjob_cr.ipynb
        run: |
          set -euo pipefail
          # Remove oc login cell, as KinD doesn't support oc login
          jq -r 'del(.cells[] | select(.source[] | contains("oc login")))' 5_submit_rayjob_cr.ipynb > 5_submit_rayjob_cr.ipynb.tmp && mv 5_submit_rayjob_cr.ipynb.tmp 5_submit_rayjob_cr.ipynb
          # Set explicit namespace (notebook stores JSON with escaped quotes)
          sed -i 's/namespace=\\"your-namespace\\"/namespace=\\"default\\"/' 5_submit_rayjob_cr.ipynb
          # Run notebook
          poetry run papermill 5_submit_rayjob_cr.ipynb 5_submit_rayjob_cr_out.ipynb --log-output --execution-timeout 600
        working-directory: demo-notebooks/guided-demos

      # Log collection runs even when the notebook step failed (always()),
      # as long as the operators were deployed.
      - name: Print Kueue operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Kueue operator logs"
          KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
          kubectl logs -n kueue-system --tail -1 "${KUEUE_CONTROLLER_POD}" | tee "${TEMP_DIR}/kueue.log"

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee "${TEMP_DIR}/kuberay.log"

      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          # `with:` inputs are not shell-expanded, so read TEMP_DIR through the
          # env context (same as the upload step below).
          output-directory: ${{ env.TEMP_DIR }}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs-5_submit_rayjob_cr
          retention-days: 10
          path: |
            ${{ env.TEMP_DIR }}/**/*.log