-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlocal_setup.sh
More file actions
executable file
·435 lines (379 loc) · 18.4 KB
/
local_setup.sh
File metadata and controls
executable file
·435 lines (379 loc) · 18.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
#!/usr/bin/env bash
# =============================================================================
# Local workstation setup: deploys the full MLOps stack (K3s + KFP + MLflow)
# directly on a GPU workstation without any cloud dependencies.
#
# This is the on-premises equivalent of the AWS flow (deploy.sh -> step2.sh ->
# remote_setup.sh). Run this script directly on your GPU workstation.
#
# Prerequisites (install these first -- see training/03-ml-workstation-setup/):
# - NVIDIA GPU drivers (nvidia-smi must work)
# - Docker with NVIDIA Container Toolkit (docker run --gpus all must work)
# - sudo access (needed for K3s install and containerd image import)
# - python3, curl, git
#
# Usage:
# ./local_setup.sh
#
# The script is idempotent -- safe to re-run after reboots or partial failures.
# =============================================================================
set -euo pipefail
# --- Configuration -----------------------------------------------------------
# All variables can be overridden via environment. Defaults match the AWS flow
# so that pipeline.py, Dockerfile, train_wrapper.py work unchanged.
# Namespace that hosts both Kubeflow Pipelines and MLflow.
KFP_NAMESPACE="${KFP_NAMESPACE:-kubeflow}"
# Standalone KFP manifest version to deploy (Step 6).
KFP_VERSION="${KFP_VERSION:-2.4.1}"
# Pinned MinIO / Argo executor images patched into the stock KFP manifests.
KFP_MINIO_IMAGE="${KFP_MINIO_IMAGE:-minio/minio:RELEASE.2019-08-14T20-37-41Z}"
KFP_ARGOEXEC_IMAGE="${KFP_ARGOEXEC_IMAGE:-quay.io/argoproj/argoexec:v3.4.17}"
# Fully-qualified tag used when building and importing the training image.
YOLO_IMAGE="${YOLO_IMAGE:-docker.io/library/yolov5-mlops:v1}"
# In-cluster MLflow endpoint baked into the compiled pipeline.
MLFLOW_TRACKING_URI="${MLFLOW_TRACKING_URI:-http://mlflow-service.kubeflow.svc.cluster.local:5000}"
# Artifact root passed to pipeline.py at compile time.
PIPELINE_ROOT="${PIPELINE_ROOT:-minio://mlpipeline/v2/artifacts}"
# Region value advertised to S3 clients talking to MinIO.
MINIO_REGION="${MINIO_REGION:-us-east-1}"
# Default working dir is the directory containing this script.
LOCAL_WORKDIR="${LOCAL_WORKDIR:-$(cd "$(dirname "$0")" && pwd)}"
LOCAL_VENV="${LOCAL_VENV:-${LOCAL_WORKDIR}/.venv}"
LOCAL_LOG_DIR="${LOCAL_LOG_DIR:-${LOCAL_WORKDIR}/logs}"
LOCAL_GENERATED_DIR="${LOCAL_GENERATED_DIR:-${LOCAL_WORKDIR}/generated}"
# Port-forward bind address: 0.0.0.0 so remote laptops can reach dashboards
# via SSH tunnel. Change to 127.0.0.1 if only local access is needed.
BIND_ADDRESS="${BIND_ADDRESS:-0.0.0.0}"
KFP_UI_PORT="${KFP_UI_PORT:-3000}"
MLFLOW_PORT="${MLFLOW_PORT:-5000}"
# K3s writes its kubeconfig here; export it so every kubectl call below
# targets the local K3s cluster.
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
# Suppress interactive prompts from apt-based tooling.
export DEBIAN_FRONTEND=noninteractive
# --- Logging and helpers -----------------------------------------------------
log() {
  # Emit a timestamped status line; all script output funnels through this.
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[local %s] %s\n' "${stamp}" "$*"
}
die() {
  # Log an error message, then abort the whole script with status 1.
  log "ERROR: $*"
  exit 1
}
require_cmd() {
  # Abort unless the named command is resolvable on PATH.
  if ! command -v "$1" >/dev/null 2>&1; then
    die "Required command not found: $1"
  fi
}
wait_for_resource() {
  # Block until the given workload (deployment/daemonset/statefulset) finishes
  # rolling out, or fail when the timeout expires.
  local resource_kind="$1"
  local resource_name="$2"
  local ns="$3"
  local deadline="$4"
  kubectl -n "${ns}" rollout status "${resource_kind}/${resource_name}" --timeout="${deadline}"
}
wait_for_allocatable_gpu() {
  # Poll (up to ~180s) until the first node advertises a non-zero allocatable
  # nvidia.com/gpu count, then print that count on stdout; return 1 on timeout.
  #
  # Callers capture stdout (GPU_COUNT="$(wait_for_allocatable_gpu || true)"),
  # so nothing except the final count may be written to stdout.
  local waited=0 value=""
  while [ "${waited}" -lt 180 ]; do
    value="$(kubectl get nodes -o jsonpath='{.items[0].status.allocatable.nvidia\.com/gpu}' 2>/dev/null || true)"
    if [ -n "${value}" ] && [ "${value}" != "0" ]; then
      printf '%s\n' "${value}"
      return 0
    fi
    # Log only once, on the first failed probe. BUG FIX: route the progress
    # message to stderr -- the original wrote it to stdout, where it was
    # swallowed into the command-substitution capture and corrupted GPU_COUNT.
    [ "${waited}" -gt 0 ] || log "Waiting for Kubernetes to advertise allocatable NVIDIA GPUs" >&2
    sleep 5
    waited=$((waited + 5))
  done
  return 1
}
node_condition_status() {
  # Print the status value (True/False/Unknown) of the named condition on the
  # first node; prints nothing when kubectl fails or the condition is absent.
  local wanted_type="$1"
  local query="{.items[0].status.conditions[?(@.type=='${wanted_type}')].status}"
  kubectl get nodes -o jsonpath="${query}" 2>/dev/null || true
}
ensure_no_disk_pressure() {
  # Guard against a node stuck with DiskPressure=True (which blocks pod
  # scheduling). Waits up to ~60s for the kubelet to clear the condition on
  # its own, then restarts k3s once as a last resort; dies if it persists.
  local status="" waited=0
  status="$(node_condition_status "DiskPressure")"
  # Fast path: no disk pressure reported, nothing to do.
  [ "${status}" = "True" ] || return 0
  log "Node reports DiskPressure=True; waiting for kubelet to refresh"
  while [ "${waited}" -lt 60 ]; do
    sleep 5
    waited=$((waited + 5))
    status="$(node_condition_status "DiskPressure")"
    [ "${status}" = "True" ] || { log "DiskPressure cleared"; return 0; }
  done
  # Last resort: bounce k3s so the kubelet re-evaluates node conditions.
  log "DiskPressure still stuck; restarting k3s"
  sudo systemctl restart k3s
  kubectl wait --for=condition=Ready node --all --timeout=300s
  status="$(node_condition_status "DiskPressure")"
  [ "${status}" != "True" ] || die "DiskPressure persists after k3s restart"
}
secret_field() {
  # Print the base64-decoded value of one field of a secret in the KFP
  # namespace. Prints nothing (and still succeeds) when the secret or field
  # is missing, so callers can probe alternative field names.
  local secret_name="$1"
  local field_name="$2"
  local encoded
  encoded="$(kubectl -n "${KFP_NAMESPACE}" get secret "${secret_name}" \
    -o "jsonpath={.data.${field_name}}" 2>/dev/null || true)"
  printf '%s' "${encoded}" | base64 --decode 2>/dev/null || true
}
find_minio_secret() {
  # Print the name of the secret that holds MinIO credentials. Probe the
  # well-known names first; fall back to the first secret in the namespace
  # whose name contains "minio".
  local probe
  for probe in mlpipeline-minio-artifact minio-artifact minio; do
    kubectl -n "${KFP_NAMESPACE}" get secret "${probe}" >/dev/null 2>&1 || continue
    printf '%s\n' "${probe}"
    return 0
  done
  kubectl -n "${KFP_NAMESPACE}" get secret \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \
    | grep 'minio' | head -n 1
}
restart_port_forward() {
  # (Re)start a background `kubectl port-forward` to a service, recording its
  # PID so a later invocation (e.g. after a reboot) can replace it.
  local pf_name="$1" svc="$2" listen_port="$3" target_port="$4"
  local pid_file="${LOCAL_LOG_DIR}/${pf_name}.pid"
  local log_file="${LOCAL_LOG_DIR}/${pf_name}.log"
  local old_pid
  # Terminate a previous forwarder if its PID file points at a live process.
  if [ -f "${pid_file}" ]; then
    old_pid="$(cat "${pid_file}")"
    if kill -0 "${old_pid}" >/dev/null 2>&1; then
      kill "${old_pid}" >/dev/null 2>&1 || true
      sleep 2
    fi
  fi
  # nohup + background so the forwarder outlives this script's shell.
  nohup kubectl -n "${KFP_NAMESPACE}" port-forward \
    --address "${BIND_ADDRESS}" "svc/${svc}" \
    "${listen_port}:${target_port}" >"${log_file}" 2>&1 &
  echo $! > "${pid_file}"
}
run_gpu_smoke_test() {
  # Apply the GPU smoke-test pod manifest and poll its phase for up to five
  # minutes. On success, print the pod logs and delete the pod (return 0);
  # on failure or timeout, dump diagnostics and return 1.
  local elapsed=0
  local phase=""
  # Remove any leftover pod from a previous run before re-applying.
  kubectl -n kube-system delete -f k8s/gpu-smoke-test.yaml --ignore-not-found >/dev/null 2>&1 || true
  kubectl apply -f k8s/gpu-smoke-test.yaml
  while [ "${elapsed}" -lt 300 ]; do
    phase="$(kubectl -n kube-system get pod gpu-smoke-test -o jsonpath='{.status.phase}' 2>/dev/null || true)"
    case "${phase}" in
      Succeeded)
        kubectl -n kube-system logs pod/gpu-smoke-test
        kubectl -n kube-system delete -f k8s/gpu-smoke-test.yaml >/dev/null
        return 0
        ;;
      Failed)
        log "GPU smoke test pod failed:"
        kubectl -n kube-system describe pod gpu-smoke-test || true
        kubectl -n kube-system logs pod/gpu-smoke-test --all-containers=true || true
        return 1
        ;;
    esac
    sleep 5
    elapsed=$((elapsed + 5))
  done
  log "GPU smoke test pod timed out:"
  kubectl -n kube-system describe pod gpu-smoke-test || true
  return 1
}
cleanup_stale_image_artifacts() {
  # Best-effort removal of leftovers from a previous (possibly interrupted)
  # image build/import cycle so re-runs start clean. Every command tolerates
  # failure (|| true): the artifacts may simply not exist.
  log "Cleaning up stale image-import artifacts"
  # Kill any image import still streaming into containerd from an aborted run.
  sudo pkill -f 'k3s ctr --namespace k8s.io images import' >/dev/null 2>&1 || true
  sudo rm -f /tmp/yolov5-mlops.tar
  # Drop both the fully-qualified and short tags from Docker and containerd.
  sudo docker image rm "${YOLO_IMAGE}" "yolov5-mlops:v1" >/dev/null 2>&1 || true
  sudo k3s ctr --namespace k8s.io images rm "${YOLO_IMAGE}" >/dev/null 2>&1 || true
  sudo k3s ctr --namespace k8s.io images rm "docker.io/library/yolov5-mlops:v1" >/dev/null 2>&1 || true
  # Reclaim disk: Docker build cache and containerd content store.
  sudo docker builder prune -af >/dev/null 2>&1 || true
  sudo k3s ctr --namespace k8s.io content prune >/dev/null 2>&1 || true
}
install_or_recover_k3s() {
  # Install K3s (or re-enable an existing install) and verify the systemd
  # service becomes active. Makes up to two attempts; returns 1 if the
  # service is still inactive afterwards.
  local attempt=1
  while [ "${attempt}" -le 2 ]; do
    if command -v k3s >/dev/null 2>&1; then
      # Binary already on the machine: just make sure the unit is running.
      log "K3s binary already present; ensuring the service is enabled"
      sudo systemctl daemon-reload || true
      sudo systemctl enable --now k3s || true
    else
      log "Installing K3s with NVIDIA as the default runtime (attempt ${attempt})"
      # Official pipe-to-shell installer. Flags: world-readable kubeconfig,
      # no traefik ingress, NVIDIA as the default container runtime.
      # A non-zero installer exit is tolerated -- the service state below is
      # the real success criterion.
      curl -sfL https://get.k3s.io | \
        INSTALL_K3S_EXEC="server --write-kubeconfig-mode 644 --disable traefik --default-runtime nvidia" sh -s - || \
        log "K3s installer returned non-zero; checking service status"
      sudo systemctl daemon-reload || true
      sudo systemctl enable --now k3s || true
    fi
    if sudo systemctl is-active --quiet k3s; then
      return 0
    fi
    # Surface the last journal lines to aid debugging before retrying.
    log "k3s service not active; journal output:"
    sudo journalctl -u k3s --no-pager -n 50 || true
    attempt=$((attempt + 1))
    sleep 10
  done
  return 1
}
# =============================================================================
# Main setup sequence
# =============================================================================
cd "${LOCAL_WORKDIR}"
mkdir -p "${LOCAL_LOG_DIR}" "${LOCAL_GENERATED_DIR}"
log "================================================================="
log " MLOps 101 -- Local Workstation Setup"
log " Working directory: ${LOCAL_WORKDIR}"
log "================================================================="
# --- Step 1: Verify prerequisites -------------------------------------------
log "Checking prerequisites"
require_cmd curl
require_cmd python3
require_cmd sudo
require_cmd docker
require_cmd git
log "Verifying NVIDIA GPU drivers"
if ! nvidia-smi >/dev/null 2>&1; then
  die "nvidia-smi not found or failed. Install NVIDIA drivers first.
See: training/03-ml-workstation-setup/system-setup-guide.md"
fi
nvidia-smi
log "Verifying Docker GPU pass-through"
# A throwaway CUDA container proves the NVIDIA Container Toolkit works
# end-to-end under Docker before we commit to the K3s install.
if ! sudo docker run --rm --gpus all nvidia/cuda:12.3.2-base-ubuntu22.04 nvidia-smi >/dev/null 2>&1; then
  die "Docker GPU pass-through failed. Install NVIDIA Container Toolkit first.
See: training/03-ml-workstation-setup/system-setup-guide.md"
fi
log "Docker GPU pass-through OK"
# Configure the NVIDIA runtime for containerd (required by K3s).
# Docker uses its own runtime config, but K3s uses containerd directly.
if command -v nvidia-ctk >/dev/null 2>&1; then
  log "Configuring NVIDIA runtime for containerd (used by K3s)"
  sudo nvidia-ctk runtime configure --runtime=containerd 2>/dev/null || true
fi
# --- Step 2: Install K3s with NVIDIA runtime --------------------------------
install_or_recover_k3s || die "K3s service failed to become active"
log "Validating K3s service state"
sudo systemctl is-active --quiet k3s || die "K3s service is not active"
# Non-fatal: warn if containerd did not adopt NVIDIA as its default runtime.
sudo grep -q 'default_runtime_name = "nvidia"' \
  /var/lib/rancher/k3s/agent/etc/containerd/config.toml 2>/dev/null || \
  log "WARNING: NVIDIA runtime may not be the default in containerd config"
log "Waiting for the K3s node to become Ready"
kubectl wait --for=condition=Ready node --all --timeout=300s
kubectl get nodes -o wide
# --- Step 3: Deploy NVIDIA device plugin ------------------------------------
log "Deploying NVIDIA device plugin"
kubectl apply -f k8s/nvidia-device-plugin.yaml
wait_for_resource daemonset nvidia-device-plugin-daemonset kube-system 300s
# wait_for_allocatable_gpu prints the count on stdout; || true keeps set -e
# from aborting here so we can emit a clearer error message below.
GPU_COUNT="$(wait_for_allocatable_gpu || true)"
[ -n "${GPU_COUNT}" ] && [ "${GPU_COUNT}" != "0" ] || \
  die "No allocatable NVIDIA GPU detected in Kubernetes"
log "Kubernetes sees ${GPU_COUNT} GPU(s)"
# --- Step 4: Clean up stale artifacts and run GPU smoke test ----------------
cleanup_stale_image_artifacts
ensure_no_disk_pressure
log "Running GPU smoke test pod"
run_gpu_smoke_test || die "GPU smoke test did not complete successfully"
# --- Step 5: Build the training image and import into K3s -------------------
log "Building the YOLOv5 training image"
sudo docker build -t "${YOLO_IMAGE}" -t "yolov5-mlops:v1" .
sudo docker image inspect "${YOLO_IMAGE}" >/dev/null
log "Importing the training image into K3s containerd"
# K3s pulls from containerd, not the Docker daemon, so the freshly built
# image must be streamed into containerd's k8s.io namespace.
sudo sh -c "docker save '${YOLO_IMAGE}' | k3s ctr --namespace k8s.io images import -"
sudo k3s ctr --namespace k8s.io images ls | grep -F 'yolov5-mlops' >/dev/null
# Reclaim disk: the Docker-side copy is redundant after the import.
sudo docker image rm "${YOLO_IMAGE}" "yolov5-mlops:v1" >/dev/null 2>&1 || true
sudo docker builder prune -af >/dev/null 2>&1 || true
ensure_no_disk_pressure
# --- Step 6: Deploy Kubeflow Pipelines (standalone) -------------------------
log "Deploying standalone Kubeflow Pipelines ${KFP_VERSION}"
# Cluster-scoped resources (incl. CRDs) first; wait for the Application CRD
# to be established before applying the namespaced manifests.
kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/cluster-scoped-resources?ref=${KFP_VERSION}"
kubectl wait --for=condition=established --timeout=180s crd/applications.app.k8s.io
kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/env/platform-agnostic?ref=${KFP_VERSION}"
log "Patching KFP MinIO image to ${KFP_MINIO_IMAGE}"
kubectl -n "${KFP_NAMESPACE}" set image deployment/minio minio="${KFP_MINIO_IMAGE}" >/dev/null
log "Patching workflow controller executor image to ${KFP_ARGOEXEC_IMAGE}"
# NOTE(review): this assumes args[3] of the controller container is the
# executor-image flag in this KFP version -- re-verify when bumping KFP_VERSION.
kubectl -n "${KFP_NAMESPACE}" patch deployment workflow-controller \
  --type='json' \
  -p="[{\"op\":\"replace\",\"path\":\"/spec/template/spec/containers/0/args/3\",\"value\":\"${KFP_ARGOEXEC_IMAGE}\"}]" >/dev/null
log "Waiting for Kubeflow Pipelines components"
# Wait on every deployment, statefulset, and job in the KFP namespace.
for deployment_name in $(kubectl -n "${KFP_NAMESPACE}" get deployment -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do
  wait_for_resource deployment "${deployment_name}" "${KFP_NAMESPACE}" 600s
done
for statefulset_name in $(kubectl -n "${KFP_NAMESPACE}" get statefulset -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do
  wait_for_resource statefulset "${statefulset_name}" "${KFP_NAMESPACE}" 600s
done
for job_name in $(kubectl -n "${KFP_NAMESPACE}" get job -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do
  kubectl -n "${KFP_NAMESPACE}" wait --for=condition=complete "job/${job_name}" --timeout=600s
done
kubectl -n "${KFP_NAMESPACE}" get pods -o wide
# Sanity check that the core services exist before moving on.
kubectl -n "${KFP_NAMESPACE}" get svc minio-service ml-pipeline-ui ml-pipeline mysql >/dev/null
# --- Step 7: Resolve MinIO credentials and create MLflow bucket -------------
log "Resolving MinIO credentials from the Kubeflow install"
MINIO_SECRET_NAME="$(find_minio_secret)"
[ -n "${MINIO_SECRET_NAME}" ] || die "Unable to locate MinIO credentials secret in ${KFP_NAMESPACE}"
# The secret may expose accesskey/secretkey or rootUser/rootPassword
# depending on the MinIO flavor -- probe both field names.
MINIO_ACCESS_KEY="$(secret_field "${MINIO_SECRET_NAME}" accesskey)"
[ -n "${MINIO_ACCESS_KEY}" ] || MINIO_ACCESS_KEY="$(secret_field "${MINIO_SECRET_NAME}" rootUser)"
MINIO_SECRET_KEY="$(secret_field "${MINIO_SECRET_NAME}" secretkey)"
[ -n "${MINIO_SECRET_KEY}" ] || MINIO_SECRET_KEY="$(secret_field "${MINIO_SECRET_NAME}" rootPassword)"
[ -n "${MINIO_ACCESS_KEY}" ] || die "Unable to resolve MinIO access key from ${MINIO_SECRET_NAME}"
[ -n "${MINIO_SECRET_KEY}" ] || die "Unable to resolve MinIO secret key from ${MINIO_SECRET_NAME}"
log "Creating the MLflow artifact bucket in MinIO"
# Run a short-lived in-cluster mc pod so the MinIO service DNS name resolves.
kubectl -n "${KFP_NAMESPACE}" delete pod minio-bootstrap --ignore-not-found >/dev/null 2>&1 || true
kubectl -n "${KFP_NAMESPACE}" run minio-bootstrap \
  --image=minio/mc \
  --restart=Never \
  --env="AWS_ACCESS_KEY_ID=${MINIO_ACCESS_KEY}" \
  --env="AWS_SECRET_ACCESS_KEY=${MINIO_SECRET_KEY}" \
  --command -- /bin/sh -lc \
  "mc alias set local http://minio-service.${KFP_NAMESPACE}.svc.cluster.local:9000 \"${MINIO_ACCESS_KEY}\" \"${MINIO_SECRET_KEY}\" && mc mb --ignore-existing local/mlflow && mc ls local"
kubectl -n "${KFP_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded pod/minio-bootstrap --timeout=300s
kubectl -n "${KFP_NAMESPACE}" logs pod/minio-bootstrap
kubectl -n "${KFP_NAMESPACE}" delete pod minio-bootstrap >/dev/null
log "Creating MLflow MinIO credentials secret"
# Delete-then-create so re-runs pick up rotated credentials.
kubectl -n "${KFP_NAMESPACE}" delete secret mlflow-minio-credentials --ignore-not-found >/dev/null
kubectl -n "${KFP_NAMESPACE}" create secret generic mlflow-minio-credentials \
  --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ACCESS_KEY}" \
  --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_SECRET_KEY}" \
  --from-literal=AWS_DEFAULT_REGION="${MINIO_REGION}" \
  --from-literal=MLFLOW_S3_ENDPOINT_URL="http://minio-service.${KFP_NAMESPACE}.svc.cluster.local:9000" \
  --from-literal=MLFLOW_S3_IGNORE_TLS="true" >/dev/null
# --- Step 8: Deploy MLflow --------------------------------------------------
log "Deploying MLflow"
kubectl apply -f mlflow.yaml
wait_for_resource deployment mlflow "${KFP_NAMESPACE}" 600s
kubectl -n "${KFP_NAMESPACE}" get pvc mlflow-data
kubectl -n "${KFP_NAMESPACE}" get svc mlflow-service >/dev/null
# --- Step 9: Create Python venv and compile pipeline ------------------------
log "Preparing Python environment for pipeline compilation"
if [ ! -d "${LOCAL_VENV}" ]; then
  python3 -m venv "${LOCAL_VENV}"
fi
# shellcheck disable=SC1090
source "${LOCAL_VENV}/bin/activate"
python -m pip install --upgrade pip >/dev/null
python -m pip install -r requirements-remote.txt >/dev/null
log "Compiling the pipeline"
python pipeline.py \
  --output "${LOCAL_GENERATED_DIR}/pipeline.yaml" \
  --image "${YOLO_IMAGE}" \
  --mlflow-uri "${MLFLOW_TRACKING_URI}" \
  --pipeline-root "${PIPELINE_ROOT}"
log "Patching imagePullPolicy into the compiled pipeline"
# The training image exists only in the node's containerd store (imported in
# Step 5, never pushed to a registry), so every executor container must use
# imagePullPolicy=Never; rewrite the compiled spec in place.
python3 -c "
import yaml, sys
path = '${LOCAL_GENERATED_DIR}/pipeline.yaml'
with open(path) as f:
    docs = list(yaml.safe_load_all(f))
for doc in docs:
    if doc is None:
        continue
    for ex in doc.get('deploymentSpec', {}).get('executors', {}).values():
        c = ex.get('container', {})
        if c:
            c['imagePullPolicy'] = 'Never'
with open(path, 'w') as f:
    yaml.dump_all(docs, f, default_flow_style=False, sort_keys=False)
print('Patched imagePullPolicy=Never into pipeline.yaml')
"
# --- Step 10: Start port-forwards -------------------------------------------
log "Starting port-forwards for dashboard access"
restart_port_forward kfp-port-forward ml-pipeline-ui "${KFP_UI_PORT}" 80
restart_port_forward mlflow-port-forward mlflow-service "${MLFLOW_PORT}" 5000
log "Checking Kubeflow Pipelines API health"
# Probe the KFP API through the fresh port-forward, up to 20 x 5s.
for attempt in $(seq 1 20); do
  if python submit_run.py \
    --host "http://127.0.0.1:${KFP_UI_PORT}" \
    --health-check-only; then
    break
  fi
  [ "${attempt}" -lt 20 ] || die "Kubeflow Pipelines API did not become healthy"
  sleep 5
done
log "Checking MLflow HTTP health"
curl --retry 12 --retry-delay 5 --retry-connrefused -sf \
  "http://127.0.0.1:${MLFLOW_PORT}/" >/dev/null
log "================================================================="
log " Local setup finished successfully!"
log "================================================================="
log ""
log "Dashboards (on this machine):"
log " Kubeflow Pipelines: http://localhost:${KFP_UI_PORT}"
log " MLflow: http://localhost:${MLFLOW_PORT}"
log ""
log "From a remote laptop, SSH tunnel with:"
log " ssh -L 8080:127.0.0.1:${KFP_UI_PORT} -L 5000:127.0.0.1:${MLFLOW_PORT} $(whoami)@<this-machine-ip>"
log " Then open http://localhost:8080 (KFP) and http://localhost:5000 (MLflow)"
log ""
log "Submit a training run:"
log " source .venv/bin/activate"
log " python submit_run.py --host http://127.0.0.1:${KFP_UI_PORT} --pipeline-file generated/pipeline.yaml"
log ""
log "Useful commands:"
log " kubectl get pods -A"
log " kubectl -n ${KFP_NAMESPACE} get pods"
log " kubectl -n ${KFP_NAMESPACE} logs deploy/mlflow"
log " sudo k3s ctr --namespace k8s.io images ls | grep yolov5"
log ""
log "After a reboot, re-run this script to restore port-forwards."
log "K3s and all deployments survive reboots automatically."