Skip to content
49 changes: 49 additions & 0 deletions .github/ct.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
---
Comment thread
ombhojane marked this conversation as resolved.
Outdated
# Chart testing configuration for kubeflow-trainer

# Directories containing charts to test
chart-dirs:
- charts

# Charts to exclude from testing (if any)
excluded-charts: []

# Target branch for comparison (used for detecting changes)
target-branch: main

# Remote repository for comparison
remote: origin

# Version increment checking
# Set to false for initial setup, enable later for strict versioning
check-version-increment: false

# Schema validation settings
validate-chart-schema: true
validate-maintainers: true
validate-yaml: true

# Helm configuration - increase timeout for slow installations
helm-extra-args: --timeout 600s --wait --debug

# Helm extra set args for testing
# Note: jobset.install=false because we install JobSet manually
helm-extra-set-args: --set=jobset.install=false --set=replicaCount=1 --set=image.pullPolicy=IfNotPresent --set=resources.limits.cpu=100m --set=resources.limits.memory=128Mi

# Debug settings
debug: true

# Skip cleanup after testing for debugging (will be cleaned up in workflow)
skip-clean-up: true

# Test upgrades (install chart, then upgrade)
upgrade: false

# Print config for debugging
print-config: true

# Skip dependency build since we handle JobSet manually
helm-dependency-extra-args: --skip-refresh

# Test configuration with increased verbosity
test-extra-args: --logs --debug
195 changes: 195 additions & 0 deletions .github/workflows/test-helm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
---
name: Unit and Integration Test - Helm

on:
pull_request:

jobs:
generate:
name: Generate
runs-on: ubuntu-latest

steps:
- name: Checkout Code
uses: actions/checkout@v4
with:
fetch-depth: 0
Comment thread
ombhojane marked this conversation as resolved.
Outdated

- name: Set up Helm
uses: azure/setup-helm@v4
with:
version: 'latest'

- name: Run make generate
run: |
if make help 2>/dev/null | grep -q "generate"; then
make generate
else
echo "No generate target found, skipping..."
fi

- name: Run make helm-unittest
run: |
if make help 2>/dev/null | grep -q "helm-unittest"; then
make helm-unittest
else
echo "No helm-unittest target found, skipping..."
fi
Comment thread
ombhojane marked this conversation as resolved.
Outdated

test:
name: Test
runs-on: ubuntu-latest
needs: generate

steps:
- name: Checkout Code
uses: actions/checkout@v4
with:
fetch-depth: 0
Comment thread
ombhojane marked this conversation as resolved.
Outdated

- name: Set up Helm
uses: azure/setup-helm@v4
with:
version: 'latest'

- name: Set up chart-testing
uses: helm/chart-testing-action@v2.7.0

- name: Set up Kind
uses: helm/kind-action@v1.10.0
with:
cluster_name: chart-testing
wait: 300s

- name: Run chart-testing (lint)
run: |
ct lint \
--config .github/ct.yaml \
--all \
--debug

- name: Install JobSet dependency manually
Comment thread
ombhojane marked this conversation as resolved.
Outdated
run: |
JOBSET_VERSION="v0.8.1"
echo "Installing JobSet $JOBSET_VERSION..."
helm install jobset oci://registry.k8s.io/jobset/charts/jobset \
--version $JOBSET_VERSION \
--create-namespace \
--namespace=jobset-system \
--wait \
--timeout=300s

echo "JobSet installed successfully"
kubectl get pods -n jobset-system

# Wait for JobSet to be ready
kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=jobset --timeout=300s -n jobset-system

- name: Create test values file for chart testing
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we leverage the e2e-setup-cluster.sh script to deploy Helm Chart ?

I think, the steps are similar, and we should just deploy it with Helm Charts instead of Kustomize manifests.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with @andreyvelich this should ideally not be duplicated.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

resolved!

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

leveraged e2e-setup-cluster-helm.sh

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ombhojane As I can see, you still build and load images as part of your e2e-setup-cluster-helm.sh script.
I suggest that you just add condition here, to whether you want to deploy Kubeflow Trainer with Kustomize or Helm Charts:

echo "Deploy Kubeflow Trainer control plane"
E2E_MANIFESTS_DIR="artifacts/e2e/manifests"
mkdir -p "${E2E_MANIFESTS_DIR}"
cat <<EOF >"${E2E_MANIFESTS_DIR}/kustomization.yaml"
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../../manifests/overlays/manager
images:
- name: "${CONTROLLER_MANAGER_CI_IMAGE_NAME}"
newTag: "${CONTROLLER_MANAGER_CI_IMAGE_TAG}"
EOF

In that case, you don't need to maintain separate script just for Helm deployment.

run: |
mkdir -p charts/kubeflow-trainer/ci
cat > charts/kubeflow-trainer/ci/ci-values.yaml << 'EOF'
# CI-specific values for chart testing
replicaCount: 1
image:
pullPolicy: IfNotPresent
jobset:
install: false # We install JobSet manually
resources:
limits:
cpu: 100m
memory: 128Mi
requests:
cpu: 50m
memory: 64Mi
# Minimal configuration for testing
controller:
replicas: 1
Comment thread
ombhojane marked this conversation as resolved.
Outdated
EOF

- name: Run chart-testing (install) with better error handling
run: |
set -e
echo "Starting chart installation tests..."

# Run with more verbose output and longer timeout
ct install \
--config .github/ct.yaml \
--all \
--debug \
--helm-extra-args "--timeout=600s --debug" \
--skip-clean-up || {
echo "Chart testing failed, checking cluster state..."

echo "=== Cluster Nodes ==="
kubectl get nodes -o wide || true

echo "=== All Namespaces ==="
kubectl get namespaces || true

echo "=== All Pods ==="
kubectl get pods --all-namespaces -o wide || true

echo "=== Events ==="
kubectl get events --all-namespaces --sort-by='.lastTimestamp' || true

echo "=== Helm Releases ==="
helm list --all-namespaces || true

# Still exit with error
exit 1
}

- name: Verify Helm installation
if: success()
run: |
echo "=== Cluster Status ==="
kubectl get nodes -o wide

echo "=== Namespaces ==="
kubectl get namespaces

echo "=== All Pods ==="
kubectl get pods --all-namespaces -o wide

echo "=== Training Operator Pods ==="
kubectl get pods --all-namespaces \
-l app.kubernetes.io/name=kubeflow-trainer || true

echo "=== JobSet Pods ==="
kubectl get pods -n jobset-system || true

echo "=== Services ==="
kubectl get svc --all-namespaces

echo "=== CRDs ==="
kubectl get crd | grep -E "(training|jobset)" || \
echo "No training/jobset CRDs found"

echo "=== Helm Releases ==="
helm list --all-namespaces

- name: Run basic functionality tests
Comment thread
ombhojane marked this conversation as resolved.
Outdated
if: success()
run: |
echo "=== Chart Testing Completed Successfully ==="
echo "The chart has been installed, tested, and validated by chart-testing tool."
echo "All components are running properly as shown in the verification step above."
echo "Chart validation successful!"

- name: Cleanup
if: always()
run: |
echo "Cleaning up resources..."

# Clean up any helm releases first
helm list --all-namespaces -q | xargs -I {} helm uninstall {} --ignore-not-found || true

# Clean up namespaces
kubectl delete namespace kubeflow-system \
--ignore-not-found=true --timeout=60s || true
kubectl delete namespace jobset-system \
--ignore-not-found=true --timeout=60s || true

# Clean up any remaining test namespaces
kubectl get namespaces -o name | grep -E "(kubeflow-trainer|test-)" | xargs kubectl delete --timeout=60s || true
2 changes: 1 addition & 1 deletion charts/kubeflow-trainer/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ type: application

dependencies:
- name: jobset
repository: oci://registry.k8s.io/jobset/charts/jobset
repository: oci://registry.k8s.io/jobset/charts
Comment thread
ombhojane marked this conversation as resolved.
Outdated
version: v0.8.2
condition: jobset.install

Expand Down
2 changes: 1 addition & 1 deletion cmd/initializers/dataset/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.11-alpine
FROM python:3.11-slim-bookworm
Comment thread
ombhojane marked this conversation as resolved.

WORKDIR /app

Expand Down
2 changes: 1 addition & 1 deletion cmd/initializers/model/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.11-alpine
FROM python:3.11-slim-bookworm

WORKDIR /app

Expand Down
Loading