diff --git a/Makefile b/Makefile
index d729bd08..41387839 100644
--- a/Makefile
+++ b/Makefile
@@ -285,6 +285,14 @@ envtest: $(ENVTEST) ## Download envtest-setup locally if necessary.
 $(ENVTEST): $(LOCALBIN)
 	test -s $(LOCALBIN)/setup-envtest || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest
 
+.PHONY: install-prometheus
+install-prometheus: ## Apply the Prometheus instance, ServiceMonitor and RBAC from config/prometheus to the cluster.
+	kubectl apply --server-side -k config/prometheus
+
+.PHONY: uninstall-prometheus
+uninstall-prometheus: ## Delete the resources applied by install-prometheus.
+	kubectl delete -k config/prometheus
+
 ##@Release
 
 .PHONY: artifacts
@@ -300,7 +308,7 @@ HELMIFY ?= $(LOCALBIN)/helmify
 .PHONY: helmify
 helmify: $(HELMIFY) ## Download helmify locally if necessary.
 $(HELMIFY): $(LOCALBIN)
-	test -s $(LOCALBIN)/helmify || GOBIN=$(LOCALBIN) go install github.com/arttor/helmify/cmd/helmify@v0.4.17
+	test -s $(LOCALBIN)/helmify || GOBIN=$(LOCALBIN) go install github.com/arttor/helmify/cmd/helmify@v0.4.18
 
 .PHONY: helm
 helm: manifests kustomize helmify
diff --git a/chart/templates/prometheus/prometheus.yaml b/chart/templates/prometheus/prometheus.yaml
new file mode 100644
index 00000000..605915e2
--- /dev/null
+++ b/chart/templates/prometheus/prometheus.yaml
@@ -0,0 +1,22 @@
+{{- if .Values.prometheus.enable }}
+{{- if not (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/Prometheus") }}
+{{- fail "The cluster does not support the required API resource `monitoring.coreos.com/v1/Prometheus`." }}
+{{- end }}
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+metadata:
+  name: {{ include "chart.fullname" . }}-prometheus
+spec:
+  serviceAccountName: {{ include "chart.fullname" . }}-prometheus
+  # Associated ServiceMonitor selector
+  serviceMonitorSelector:
+    # Must match the labels set on the ServiceMonitor rendered from
+    # templates/prometheus/service-monitor.yaml in this chart.
+    matchLabels:
+      control-plane: controller-manager
+      {{- include "chart.selectorLabels" . | nindent 6 }}
+  resources:
+    requests:
+      memory: 400Mi
+  enableAdminAPI: false
+{{- end }}
diff --git a/chart/templates/prometheus/service-monitor.yaml b/chart/templates/prometheus/service-monitor.yaml
new file mode 100644
index 00000000..b7ed92c2
--- /dev/null
+++ b/chart/templates/prometheus/service-monitor.yaml
@@ -0,0 +1,27 @@
+{{- if .Values.prometheus.enable }}
+{{- if not (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
+{{- fail "The cluster does not support the required API resource `monitoring.coreos.com/v1/ServiceMonitor`." }}
+{{- end }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "chart.fullname" . }}-controller-manager-metrics-monitor
+  labels:
+    app.kubernetes.io/component: metrics
+    app.kubernetes.io/created-by: llmaz
+    app.kubernetes.io/part-of: llmaz
+    control-plane: controller-manager
+    {{- include "chart.selectorLabels" . | nindent 4 }}
+spec:
+  endpoints:
+    - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+      path: /metrics
+      port: https
+      scheme: https
+      tlsConfig:
+        insecureSkipVerify: true
+  selector:
+    matchLabels:
+      control-plane: controller-manager
+      {{- include "chart.selectorLabels" . | nindent 6 }}
+{{- end }}
diff --git a/chart/templates/prometheus/serviceaccount.yaml b/chart/templates/prometheus/serviceaccount.yaml
new file mode 100644
index 00000000..1d200445
--- /dev/null
+++ b/chart/templates/prometheus/serviceaccount.yaml
@@ -0,0 +1,42 @@
+{{- if .Values.prometheus.enable }}
+{{- if not (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
+{{- fail "The cluster does not support the required API resource `monitoring.coreos.com/v1/ServiceMonitor`." }}
+{{- end }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "chart.fullname" . }}-prometheus
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "chart.fullname" . }}-prometheus
+rules:
+  - apiGroups: [""]
+    resources:
+      - nodes
+      - nodes/metrics
+      - services
+      - endpoints
+      - pods
+    verbs: ["get", "list", "watch"]
+  - apiGroups: [""]
+    resources:
+      - configmaps
+    verbs: ["get"]
+  - nonResourceURLs: ["/metrics"]
+    verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "chart.fullname" . }}-prometheus
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "chart.fullname" . }}-prometheus
+subjects:
+  - kind: ServiceAccount
+    name: {{ include "chart.fullname" . }}-prometheus
+    namespace: {{ .Release.Namespace }}
+{{- end }}
diff --git a/chart/values.global.yaml b/chart/values.global.yaml
index 0d84d9a3..2d780fe6 100644
--- a/chart/values.global.yaml
+++ b/chart/values.global.yaml
@@ -28,3 +28,7 @@ leaderWorkerSet:
   image:
     repository: registry.k8s.io/lws/lws
     tag: v0.5.0
+
+prometheus:
+  # -- Whether to enable Prometheus metrics exporting.
+  enable: false
diff --git a/config/default/manager_metrics_patch.yaml b/config/default/manager_metrics_patch.yaml
index f46769c0..d0a842bc 100644
--- a/config/default/manager_metrics_patch.yaml
+++ b/config/default/manager_metrics_patch.yaml
@@ -1,4 +1,3 @@
-# This patch exposes 8443 port used by metrics service
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -8,8 +7,8 @@ spec:
   template:
     spec:
       containers:
-      - name: manager
-        ports:
-        - containerPort: 8443
-          name: metrics
-          protocol: TCP
+        - name: manager
+          ports:
+            - containerPort: 8443
+              name: metrics
+              protocol: TCP
diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml
index ed137168..a2c8d48d 100644
--- a/config/prometheus/kustomization.yaml
+++ b/config/prometheus/kustomization.yaml
@@ -1,2 +1,7 @@
+namespace: llmaz-system
+namePrefix: llmaz-
+
 resources:
 - monitor.yaml
+- prometheus.yaml
+- serviceaccount.yaml
diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml
index 52e90517..e6ed6b64 100644
--- a/config/prometheus/monitor.yaml
+++ b/config/prometheus/monitor.yaml
@@ -5,11 +5,9 @@ metadata:
   labels:
     control-plane: controller-manager
     app.kubernetes.io/name: servicemonitor
-    app.kubernetes.io/instance: controller-manager-metrics-monitor
     app.kubernetes.io/component: metrics
     app.kubernetes.io/created-by: llmaz
     app.kubernetes.io/part-of: llmaz
-    app.kubernetes.io/managed-by: kustomize
   name: controller-manager-metrics-monitor
   namespace: system
 spec:
@@ -22,4 +20,4 @@ spec:
         insecureSkipVerify: true
   selector:
     matchLabels:
-      control-plane: controller-manager
+      app.kubernetes.io/name: service
diff --git a/config/prometheus/prometheus.yaml b/config/prometheus/prometheus.yaml
new file mode 100644
index 00000000..53d2501e
--- /dev/null
+++ b/config/prometheus/prometheus.yaml
@@ -0,0 +1,16 @@
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+metadata:
+  name: prometheus
+  namespace: system
+spec:
+  serviceAccountName: llmaz-prometheus
+  # Associated ServiceMonitor selector
+  serviceMonitorSelector:
+    # Need to match the label in ServiceMonitor
+    matchLabels:
+      control-plane: controller-manager
+  resources:
+    requests:
+      memory: 400Mi
+  enableAdminAPI: false
diff --git a/config/prometheus/serviceaccount.yaml b/config/prometheus/serviceaccount.yaml
new file mode 100644
index 00000000..05f26a88
--- /dev/null
+++ b/config/prometheus/serviceaccount.yaml
@@ -0,0 +1,37 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: prometheus
+rules:
+  - apiGroups: [""]
+    resources:
+      - nodes
+      - nodes/metrics
+      - services
+      - endpoints
+      - pods
+    verbs: ["get", "list", "watch"]
+  - apiGroups: [""]
+    resources:
+      - configmaps
+    verbs: ["get"]
+  - nonResourceURLs: ["/metrics"]
+    verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus
+subjects:
+  - kind: ServiceAccount
+    name: prometheus
+    namespace: llmaz-system
diff --git a/docs/prometheus-operator/README.md b/docs/prometheus-operator/README.md
new file mode 100644
index 00000000..2d9dfad9
--- /dev/null
+++ b/docs/prometheus-operator/README.md
@@ -0,0 +1,76 @@
+# Install Prometheus Operator Guide
+
+Currently, llmaz has already integrated metrics. This document provides deployment steps explaining how to install and configure Prometheus Operator in a Kubernetes cluster.
+
+### Install the prometheus operator
+
+Please follow the [documentation](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/getting-started/installation.md) to install the Prometheus Operator.
+
+```bash
+# Installing the prometheus operator
+root@VM-0-5-ubuntu:/home/ubuntu# kubectl get pods
+NAME                                   READY   STATUS    RESTARTS   AGE
+prometheus-operator-55b5c96cf8-jl2nx   1/1     Running   0          12s
+```
+Ensure that the Prometheus Operator Pod is running successfully.
+
+### Install the ServiceMonitor CR for llmaz
+
+To enable monitoring for the llmaz system, you need to install the ServiceMonitor custom resource (CR).
+You can either enable Prometheus in the Helm chart [values](./../../chart/values.global.yaml) or run `make install-prometheus` from the Makefile.
+
+- Using Helm Chart: set `prometheus.enable` to `true` in `values.global.yaml`
+```yaml
+prometheus:
+  # -- Whether to enable Prometheus metrics exporting.
+  enable: true
+```
+- Using Makefile Command: `make install-prometheus`
+```bash
+root@VM-0-5-ubuntu:/home/ubuntu/llmaz# make install-prometheus
+kubectl apply --server-side -k config/prometheus
+serviceaccount/llmaz-prometheus serverside-applied
+clusterrole.rbac.authorization.k8s.io/llmaz-prometheus serverside-applied
+clusterrolebinding.rbac.authorization.k8s.io/llmaz-prometheus serverside-applied
+prometheus.monitoring.coreos.com/llmaz-prometheus serverside-applied
+servicemonitor.monitoring.coreos.com/llmaz-controller-manager-metrics-monitor serverside-applied
+```
+
+### Check Related Resources
+
+Verify that the necessary resources have been created:
+
+- ServiceMonitor
+```bash
+root@VM-0-5-ubuntu:/home/ubuntu/llmaz# kubectl get ServiceMonitor -n llmaz-system
+NAME                                       AGE
+llmaz-controller-manager-metrics-monitor   59s
+```
+- Prometheus Pods
+```bash
+root@VM-0-5-ubuntu:/home/ubuntu/llmaz# kubectl get pods -n llmaz-system
+NAME                                        READY   STATUS    RESTARTS   AGE
+llmaz-controller-manager-7ff8f7d9bd-vztls   2/2     Running   0          28s
+prometheus-llmaz-prometheus-0               2/2     Running   0          27s
+```
+- Services
+```bash
+root@VM-0-5-ubuntu:/home/ubuntu/llmaz# kubectl get svc -n llmaz-system
+NAME                                       TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)    AGE
+llmaz-controller-manager-metrics-service   ClusterIP   10.96.79.226    <none>        8443/TCP   46s
+llmaz-webhook-service                      ClusterIP   10.96.249.226   <none>        443/TCP    46s
+prometheus-operated                        ClusterIP   None            <none>        9090/TCP   45s
+```
+
+### View metrics using the prometheus UI
+Use port forwarding to access the Prometheus UI from your local machine:
+
+```bash
+root@VM-0-5-ubuntu:/home/ubuntu# kubectl port-forward services/prometheus-operated 9090:9090 --address 0.0.0.0 -n llmaz-system
+Forwarding from 0.0.0.0:9090 -> 9090
+```
+
+If using kind, we can forward to a different local port instead, e.g. `kubectl port-forward services/prometheus-operated 39090:9090 --address 0.0.0.0 -n llmaz-system`
+This allows us to access Prometheus in a browser at `http://localhost:9090/query` (or `http://localhost:39090/query` when using the kind command above).
+
+![prometheus](prometheus.png?raw=true)
diff --git a/docs/prometheus-operator/prometheus.png b/docs/prometheus-operator/prometheus.png
new file mode 100644
index 00000000..06a9ac9d
Binary files /dev/null and b/docs/prometheus-operator/prometheus.png differ