[tmpnet] Enable deployment to kube (#3615)

marun · maru-ava · StephenButtolph · web-flow · commit 93a22bd04a73 · 2025-05-28T18:25:59.000Z
Signed-off-by: maru <maru.newby@avalabs.org>
Co-authored-by: Maru Newby <maru.newby@avalabs.org>
Co-authored-by: Stephen Buttolph <stephen@avalabs.org>
diff --git a/.github/actions/run-monitored-tmpnet-cmd/action.yml b/.github/actions/run-monitored-tmpnet-cmd/action.yml
@@ -8,6 +8,9 @@ inputs:
   run_env:
     description: 'a string containing env vars for the command e.g. "MY_VAR1=foo MY_VAR2=bar"'
     default: ''
+  runtime:
+    description: 'the tmpnet runtime being used'
+    default: 'process'
   filter_by_owner:
     default: ''
   artifact_prefix:
@@ -67,8 +70,10 @@ runs:
       # --impure ensures the env vars are accessible to the command
       run: ${{ inputs.run_env }} ${{ github.action_path }}/nix-develop.sh --impure --command bash -x ${{ inputs.run }}
       env:
+        # Always collect metrics locally even when nodes are running in kube to enable collection from the test workload
         TMPNET_START_METRICS_COLLECTOR: ${{ inputs.prometheus_username != '' }}
-        TMPNET_START_LOGS_COLLECTOR: ${{ inputs.loki_username != '' }}
+        # Skip local log collection when nodes are running in kube since collection will occur in-cluster.
+        TMPNET_START_LOGS_COLLECTOR: ${{ inputs.loki_username != '' && inputs.runtime == 'process' }}
         TMPNET_CHECK_METRICS_COLLECTED: ${{ inputs.prometheus_username != '' }}
         TMPNET_CHECK_LOGS_COLLECTED: ${{ inputs.loki_username != '' }}
         LOKI_USERNAME: ${{ inputs.loki_username }}
@@ -86,7 +91,7 @@ runs:
     # easy way to compose custom actions for use by other repos
     # without running into versioning issues.
     - name: Upload tmpnet data
-      if: always()
+      if: always() && (inputs.runtime == 'process')
       uses: actions/upload-artifact@v4
       with:
         name: ${{ inputs.artifact_prefix }}-tmpnet-data
@@ -95,3 +100,14 @@ runs:
           ~/.tmpnet/prometheus/prometheus.log
           ~/.tmpnet/promtail/promtail.log
         if-no-files-found: error
+    - name: Export kind logs
+      if: always() && (inputs.runtime == 'kube')
+      shell: bash
+      run: kind export logs /tmp/kind-logs
+    - name: Upload kind logs
+      if: always() && (inputs.runtime == 'kube')
+      uses: actions/upload-artifact@v4
+      with:
+        name: ${{ inputs.artifact_prefix }}-kind-logs
+        path: /tmp/kind-logs
+        if-no-files-found: error
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -76,6 +76,21 @@ jobs:
           prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }}
           loki_username: ${{ secrets.LOKI_ID || '' }}
           loki_password: ${{ secrets.LOKI_PASSWORD || '' }}
+  e2e_kube:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/setup-go-for-project
+      - uses: ./.github/actions/run-monitored-tmpnet-cmd
+        with:
+          run: ./scripts/run_task.sh test-e2e-kube-ci
+          runtime: kube
+          artifact_prefix: e2e-kube
+          filter_by_owner: avalanchego-e2e
+          prometheus_username: ${{ secrets.PROMETHEUS_ID || '' }}
+          prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }}
+          loki_username: ${{ secrets.LOKI_ID || '' }}
+          loki_password: ${{ secrets.LOKI_PASSWORD || '' }}
   e2e_existing_network:
     runs-on: ubuntu-latest
     steps:
diff --git a/Taskfile.yml b/Taskfile.yml
@@ -47,6 +47,10 @@ tasks:
     desc: Builds xsvm plugin
     cmd: ./scripts/build_xsvm.sh
 
+  build-xsvm-image:
+    desc: Builds xsvm image
+    cmd: ./scripts/build_xsvm_image.sh
+
   check-clean-branch:
     desc: Checks that the git working tree is clean
     cmd: .github/workflows/check-clean-branch.sh
@@ -177,6 +181,18 @@ tasks:
       - task: build-xsvm
       - cmd: bash -x ./scripts/tests.e2e.existing.sh {{.CLI_ARGS}}
 
+  test-e2e-kube:
+    desc: Runs e2e tests against a network deployed to kube
+    cmds:
+      - cmd: bash -x ./scripts/tests.e2e.kube.sh {{.CLI_ARGS}}
+
+  test-e2e-kube-ci:
+    desc: Runs e2e tests against a network deployed to kube [serially]
+    env:
+      E2E_SERIAL: 1
+    cmds:
+      - task: test-e2e-kube
+
   # To use a different fuzz time, run `task test-fuzz FUZZTIME=[value in seconds]`.
   # A value of `-1` will run until it encounters a failing output.
 
diff --git a/scripts/build_xsvm_image.sh b/scripts/build_xsvm_image.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# e.g.,
+# ./scripts/build_xsvm_image.sh                                              # Build local single-arch image
+# AVALANCHEGO_IMAGE=localhost:5001/avalanchego ./scripts/build_xsvm_image.sh # Build and push image to private registry
+
+if ! [[ "$0" =~ scripts/build_xsvm_image.sh ]]; then
+  echo "must be run from repository root"
+  exit 255
+fi
+
+source ./scripts/image_tag.sh
+
+AVALANCHEGO_IMAGE="${AVALANCHEGO_IMAGE:-avalanchego}"
+XSVM_IMAGE="${XSVM_IMAGE:-avalanchego-xsvm}"
+
+# Build the avalanchego base image
+SKIP_BUILD_RACE=1 DOCKER_IMAGE="${AVALANCHEGO_IMAGE}" bash -x ./scripts/build_image.sh
+
+DOCKER_CMD=("docker" "buildx" "build")
+if [[ "${XSVM_IMAGE}" == *"/"* ]]; then
+  # Push to a registry when the image name includes a slash which indicates the
+  # use of a registry e.g.
+  #
+  #  - dockerhub: [repo]/[image name]:[tag]
+  #  - private registry: [private registry hostname]/[image name]:[tag]
+  DOCKER_CMD+=("--push")
+fi
+
+GO_VERSION="$(go list -m -f '{{.GoVersion}}')"
+
+"${DOCKER_CMD[@]}" --build-arg GO_VERSION="${GO_VERSION}" --build-arg AVALANCHEGO_NODE_IMAGE="${AVALANCHEGO_IMAGE}:${image_tag}" \
+  -t "${XSVM_IMAGE}" -f ./vms/example/xsvm/Dockerfile .
diff --git a/scripts/tests.e2e.kube.sh b/scripts/tests.e2e.kube.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Run e2e tests against nodes deployed to a kind cluster.
+
+# TODO(marun) Support testing against a remote cluster
+
+if ! [[ "$0" =~ scripts/tests.e2e.kube.sh ]]; then
+  echo "must be run from repository root"
+  exit 255
+fi
+
+# This script will use kubeconfig arguments if supplied
+./scripts/start_kind_cluster.sh "$@"
+
+# Use an image that will be pushed to the local registry that the kind cluster is configured to use.
+AVALANCHEGO_IMAGE="localhost:5001/avalanchego"
+XSVM_IMAGE="${AVALANCHEGO_IMAGE}-xsvm"
+if [[ -n "${SKIP_BUILD_IMAGE:-}" ]]; then
+  echo "Skipping build of xsvm image due to SKIP_BUILD_IMAGE=${SKIP_BUILD_IMAGE}"
+else
+  XSVM_IMAGE="${XSVM_IMAGE}" AVALANCHEGO_IMAGE="${AVALANCHEGO_IMAGE}" bash -x ./scripts/build_xsvm_image.sh
+fi
+
+bash -x ./scripts/tests.e2e.sh --runtime=kube --kube-image="${XSVM_IMAGE}" "$@"
diff --git a/scripts/tests.e2e.sh b/scripts/tests.e2e.sh
@@ -20,10 +20,14 @@ fi
 # the instructions to build non-portable BLST.
 source ./scripts/constants.sh
 
-# Ensure an absolute path to avoid dependency on the working directory
-# of script execution.
-AVALANCHEGO_PATH="$(realpath "${AVALANCHEGO_PATH:-./build/avalanchego}")"
-E2E_ARGS="--avalanchego-path=${AVALANCHEGO_PATH}"
+E2E_ARGS=("${@}")
+
+# Default to a local avalanchego binary only when the caller has neither
+# selected the kube runtime nor supplied --avalanchego-path explicitly, so
+# that a second --avalanchego-path is never appended after the caller's own.
+if [[ ! "${E2E_ARGS[*]}" =~ "--runtime=kube" && ! "${E2E_ARGS[*]}" =~ "--avalanchego-path" ]]; then
+  # Ensure an absolute path to avoid dependency on the working directory of script execution.
+  AVALANCHEGO_PATH="$(realpath "${AVALANCHEGO_PATH:-./build/avalanchego}")"
+  E2E_ARGS+=("--avalanchego-path=${AVALANCHEGO_PATH}")
+fi
 
 #################################
 # Determine ginkgo args
@@ -55,4 +59,4 @@ fi
 
 #################################
 # shellcheck disable=SC2086
-./bin/ginkgo ${GINKGO_ARGS} -v ./tests/e2e -- "${E2E_ARGS[@]}" "${@}"
+./bin/ginkgo ${GINKGO_ARGS} -v ./tests/e2e -- "${E2E_ARGS[@]}"
diff --git a/tests/e2e/faultinjection/duplicate_node_id.go b/tests/e2e/faultinjection/duplicate_node_id.go
@@ -25,6 +25,11 @@ var _ = ginkgo.Describe("Duplicate node handling", func() {
 	ginkgo.It("should ensure that a given Node ID (i.e. staking keypair) can be used at most once on a network", func() {
 		network := e2e.GetEnv(tc).GetNetwork()
 
+		if network.DefaultRuntimeConfig.Kube != nil {
+			// Enabling this test for kube requires supporting a flexible name mapping
+			ginkgo.Skip("This test is not supported on kube to avoid having to deviate from composing the statefulset name with the network uuid + nodeid")
+		}
+
 		tc.By("creating new node")
 		node1 := e2e.AddEphemeralNode(tc, network, tmpnet.NewEphemeralNode(tmpnet.FlagsMap{}))
 		e2e.WaitForHealthy(tc, node1)
diff --git a/tests/fixture/bootstrapmonitor/e2e/e2e_test.go b/tests/fixture/bootstrapmonitor/e2e/e2e_test.go
@@ -263,9 +263,10 @@ func buildImage(tc tests.TestContext, imageName string, forceNewHash bool, scrip
 	require.NoError(err, "Image build failed: %s", output)
 }
 
-func newNodeStatefulSet(name string, flags map[string]string) *appsv1.StatefulSet {
+func newNodeStatefulSet(name string, flags tmpnet.FlagsMap) *appsv1.StatefulSet {
 	statefulSet := tmpnet.NewNodeStatefulSet(
 		name,
+		true, // generateName
 		latestAvalanchegoImage,
 		nodeContainerName,
 		volumeName,
diff --git a/tests/fixture/e2e/flags.go b/tests/fixture/e2e/flags.go
@@ -96,11 +96,13 @@ func (v *FlagVars) StartLogsCollector() bool {
 }
 
 func (v *FlagVars) CheckMetricsCollected() bool {
-	return v.checkMetricsCollected
+	// TODO(marun) Enable this check for kube in a subsequent PR
+	return v.startNetworkVars.ProcessRuntimeConfigured() && v.checkMetricsCollected
 }
 
 func (v *FlagVars) CheckLogsCollected() bool {
-	return v.checkLogsCollected
+	// TODO(marun) Enable this check for kube in a subsequent PR
+	return v.startNetworkVars.ProcessRuntimeConfigured() && v.checkLogsCollected
 }
 
 func (v *FlagVars) NetworkDir() string {
diff --git a/tests/fixture/e2e/ginkgo_test_context.go b/tests/fixture/e2e/ginkgo_test_context.go
@@ -46,7 +46,7 @@ func newGinkgoLogger(cfg zapcore.Encoder) logging.Logger {
 	return logging.NewLogger(
 		"",
 		logging.NewWrappedCore(
-			logging.Verbo,
+			logging.Info,
 			&ginkgoWriteCloser{},
 			cfg,
 		),
diff --git a/tests/fixture/tmpnet/flags/kube_runtime.go b/tests/fixture/tmpnet/flags/kube_runtime.go
@@ -0,0 +1,86 @@
+// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
+// See the file LICENSE for licensing terms.
+
+package flags
+
+import (
+	"errors"
+	"flag"
+	"fmt"
+
+	"github.com/spf13/pflag"
+
+	"github.com/ava-labs/avalanchego/tests/fixture/tmpnet"
+)
+
+const (
+	kubeRuntime     = "kube"
+	kubeFlagsPrefix = kubeRuntime + "-"
+	kubeDocPrefix   = "[kube runtime] "
+)
+
+var (
+	errKubeNamespaceRequired     = errors.New("--kube-namespace is required")
+	errKubeImageRequired         = errors.New("--kube-image is required")
+	errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB)
+)
+
+type kubeRuntimeVars struct {
+	namespace    string
+	image        string
+	volumeSizeGB uint
+	config       *KubeconfigVars
+}
+
+func (v *kubeRuntimeVars) registerWithFlag() {
+	v.config = newKubeconfigFlagVars(kubeDocPrefix)
+	v.register(flag.StringVar, flag.UintVar)
+}
+
+func (v *kubeRuntimeVars) registerWithFlagSet(flagSet *pflag.FlagSet) {
+	v.config = newKubeconfigFlagSetVars(flagSet, kubeDocPrefix)
+	v.register(flagSet.StringVar, flagSet.UintVar)
+}
+
+// register binds the kube runtime flags (kube-namespace, kube-image and
+// kube-volume-size) to the fields of v. The registration functions are
+// supplied by the caller so the same flag definitions can be used with
+// either the stdlib flag package or a pflag FlagSet.
+func (v *kubeRuntimeVars) register(stringVar varFunc[string], uintVar varFunc[uint]) {
+	stringVar(
+		&v.namespace,
+		"kube-namespace",
+		tmpnet.DefaultTmpnetNamespace,
+		kubeDocPrefix+"The namespace in the target cluster to create nodes in",
+	)
+	stringVar(
+		&v.image,
+		"kube-image",
+		"avaplatform/avalanchego:latest",
+		kubeDocPrefix+"The name of the docker image to use for creating nodes",
+	)
+	// The default is the minimum size; getKubeRuntimeConfig rejects smaller values.
+	uintVar(
+		&v.volumeSizeGB,
+		"kube-volume-size",
+		tmpnet.MinimumVolumeSizeGB,
+		kubeDocPrefix+fmt.Sprintf(
+			"The size in gigabytes of the PersistentVolumeClaim to create for the data directory of each node. Value must be >= %d.",
+			tmpnet.MinimumVolumeSizeGB,
+		),
+	)
+}
+
+func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, error) {
+	if len(v.namespace) == 0 {
+		return nil, errKubeNamespaceRequired
+	}
+	if len(v.image) == 0 {
+		return nil, errKubeImageRequired
+	}
+	if v.volumeSizeGB < tmpnet.MinimumVolumeSizeGB {
+		return nil, errKubeMinVolumeSizeRequired
+	}
+	return &tmpnet.KubeRuntimeConfig{
+		ConfigPath:    v.config.Path,
+		ConfigContext: v.config.Context,
+		Namespace:     v.namespace,
+		Image:         v.image,
+		VolumeSizeGB:  v.volumeSizeGB,
+	}, nil
+}
diff --git a/tests/fixture/tmpnet/flags/runtime.go b/tests/fixture/tmpnet/flags/runtime.go
@@ -14,17 +14,20 @@ import (
 
 var validRuntimes = []string{
 	processRuntime,
+	kubeRuntime,
 }
 
 type RuntimeConfigVars struct {
 	runtime            string
 	processRuntimeVars processRuntimeVars
+	kubeRuntimeVars    kubeRuntimeVars
 }
 
 // NewRuntimeConfigFlagVars registers runtime config flag variables for stdlib flag
 func NewRuntimeConfigFlagVars() *RuntimeConfigVars {
 	v := &RuntimeConfigVars{}
 	v.processRuntimeVars.registerWithFlag()
+	v.kubeRuntimeVars.registerWithFlag()
 	v.register(flag.StringVar)
 	return v
 }
@@ -33,6 +36,7 @@ func NewRuntimeConfigFlagVars() *RuntimeConfigVars {
 func NewRuntimeConfigFlagSetVars(flagSet *pflag.FlagSet) *RuntimeConfigVars {
 	v := &RuntimeConfigVars{}
 	v.processRuntimeVars.registerWithFlagSet(flagSet)
+	v.kubeRuntimeVars.registerWithFlagSet(flagSet)
 	v.register(flagSet.StringVar)
 	return v
 }
@@ -60,6 +64,14 @@ func (v *RuntimeConfigVars) GetNodeRuntimeConfig() (*tmpnet.NodeRuntimeConfig, e
 		return &tmpnet.NodeRuntimeConfig{
 			Process: processRuntimeConfig,
 		}, nil
+	case kubeRuntime:
+		kubeRuntimeConfig, err := v.kubeRuntimeVars.getKubeRuntimeConfig()
+		if err != nil {
+			return nil, err
+		}
+		return &tmpnet.NodeRuntimeConfig{
+			Kube: kubeRuntimeConfig,
+		}, nil
 	default:
 		return nil, fmt.Errorf("--runtime expected one of %v, got: %s", validRuntimes, v.runtime)
 	}
diff --git a/tests/fixture/tmpnet/flags/start_network.go b/tests/fixture/tmpnet/flags/start_network.go
@@ -59,6 +59,10 @@ func (v *StartNetworkVars) register(stringVar varFunc[string], intVar varFunc[in
 	)
 }
 
+func (v *StartNetworkVars) ProcessRuntimeConfigured() bool {
+	return v.runtimeVars.runtime == processRuntime
+}
+
 func (v *StartNetworkVars) GetNodeCount() (int, error) {
 	if v.nodeCount < 1 {
 		return 0, fmt.Errorf("--node-count must be greater than 0 but got %d", v.nodeCount)
diff --git a/tests/fixture/tmpnet/kube.go b/tests/fixture/tmpnet/kube.go
diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go
diff --git a/tests/fixture/tmpnet/node.go b/tests/fixture/tmpnet/node.go
diff --git a/tests/fixture/tmpnet/start_kind_cluster.go b/tests/fixture/tmpnet/start_kind_cluster.go
diff --git a/tests/fixture/tmpnet/tmpnetctl/main.go b/tests/fixture/tmpnet/tmpnetctl/main.go
diff --git a/vms/example/xsvm/Dockerfile b/vms/example/xsvm/Dockerfile

Original file line number	Diff line number	Diff line change
`@@ -96,11 +96,13 @@ func (v *FlagVars) StartLogsCollector() bool {`
`96`	`96`	`}`
`97`	`97`
`98`	`98`	`func (v *FlagVars) CheckMetricsCollected() bool {`
`99`		`- return v.checkMetricsCollected`
	`99`	`+ // TODO(marun) Enable this check for kube in a subsequent PR`
	`100`	`+ return v.startNetworkVars.ProcessRuntimeConfigured() && v.checkMetricsCollected`
`100`	`101`	`}`
`101`	`102`
`102`	`103`	`func (v *FlagVars) CheckLogsCollected() bool {`
`103`		`- return v.checkLogsCollected`
	`104`	`+ // TODO(marun) Enable this check for kube in a subsequent PR`
	`105`	`+ return v.startNetworkVars.ProcessRuntimeConfigured() && v.checkLogsCollected`
`104`	`106`	`}`
`105`	`107`
`106`	`108`	`func (v *FlagVars) NetworkDir() string {`
Original file line number	Diff line number	Diff line change
`@@ -59,6 +59,10 @@ func (v *StartNetworkVars) register(stringVar varFunc[string], intVar varFunc[in`
`59`	`59`	`)`
`60`	`60`	`}`
`61`	`61`
	`62`	`+func (v *StartNetworkVars) ProcessRuntimeConfigured() bool {`
	`63`	`+ return v.runtimeVars.runtime == processRuntime`
	`64`	`+}`
	`65`	`+`
`62`	`66`	`func (v *StartNetworkVars) GetNodeCount() (int, error) {`
`63`	`67`	`if v.nodeCount < 1 {`
`64`	`68`	`return 0, fmt.Errorf("--node-count must be greater than 0 but got %d", v.nodeCount)`