Skip to content

Commit 9107af9

Browse files
authored
Add memory bandwidth exporter for AI workload. (#379)
* Add memory bandwidth exporter for AI workload. Signed-off-by: Yugar-1 <[email protected]>
1 parent bf10bdd commit 9107af9

File tree

22 files changed

+2518
-1
lines changed

22 files changed

+2518
-1
lines changed

.github/workflows/pr-go-unittests.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,9 @@ jobs:
9191
9292
- name: Run tests and generate coverage
9393
run: |
94+
if [ "${{ matrix.gopath }}" == "${MBE_DIR}" ]; then
95+
exit 0
96+
fi
9497
cd ${{ matrix.gopath }}
9598
go test -coverprofile=coverage.out $(go list ./... | grep -v /e2e)
96-
../.github/workflows/scripts/go-coverage.sh
99+
${{ github.workspace }}/.github/workflows/scripts/go-coverage.sh
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Copyright (C) 2024 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# golangci-lint run options.
run:
  timeout: 5m
  allow-parallel-runners: true

issues:
  # Keep golangci-lint's built-in default exclusions switched off.
  exclude-use-default: false
  exclude-rules:
    # errcheck is not enforced in test files.
    - path: _test.go
      linters:
        - errcheck

# Only the linters listed under `enable` run; everything else is disabled.
linters:
  disable-all: true
  enable:
    - depguard
    - misspell
    - revive
    - dupl
    - errcheck
    - exportloopref
    - goconst
    - gocyclo
    - gofmt
    - goimports
    - gosimple
    - govet
    - ineffassign
    - lll
    - nakedret
    - prealloc
    - staticcheck
    - typecheck
    - unconvert
    - unparam
    - unused

linters-settings:
  depguard:
    rules:
      # Forbid os/exec everywhere except in test files.
      no_exec_policy:
        files:
          - "!$test"
        deny:
          - pkg: "os/exec"
            desc: "Using os/exec to run sub processes is not allowed by policy"
  errcheck:
    exclude-functions:
      # Used in HTTP handlers, any error is handled by the server itself.
      - (net/http.ResponseWriter).Write
      # Never check for logger errors.
      - (github.com/go-kit/log.Logger).Log
  revive:
    rules:
      # https://github.com/mgechev/revive/blob/master/RULES_DESCRIPTIONS.md#unused-parameter
      - name: unused-parameter
        severity: warning
        disabled: true
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# ---- Build stage: compile the exporter with CGO disabled ----
FROM golang:1.22 AS builder
# Supplied by the image builder for multi-platform builds; may be unset.
ARG TARGETOS
ARG TARGETARCH

WORKDIR /workspace
COPY . /workspace/
RUN go mod download

# GOOS falls back to linux when TARGETOS is not provided.
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o memory-bandwidth-exporter cmd/main.go

# ---- Runtime stage ----
FROM ubuntu:22.04
USER root
WORKDIR /
COPY --from=builder /workspace/memory-bandwidth-exporter .

# The command runs through `bash -c` so that NODE_NAME and NAMESPACE_WHITELIST
# are expanded from the container environment at start-up.
# The expansions are quoted so values are not word-split or globbed, and `exec`
# replaces the shell with the exporter so it receives container signals directly.
ENTRYPOINT ["bash", "-c"]
CMD ["exec /memory-bandwidth-exporter --collector.node.name=\"${NODE_NAME}\" --collector.container.namespaceWhiteList=\"${NAMESPACE_WHITELIST}\""]
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Toolchain and image settings. Every `?=` value can be overridden from the
# environment or on the make command line.
GO_CMD := go
DEBUG ?= 0
DOCKER_REGISTRY ?= docker.io/opea
CONTAINER_TOOL ?= docker
VERSION ?= latest

# With DEBUG=0 (the default) `-s -w` is passed to the Go linker for all packages.
ifeq ($(DEBUG),0)
GOFLAGS = -ldflags="all=-s -w"
endif

# Image reference consumed by the docker.build, docker.push and change_img targets.
MBE_IMG_NAME = memory-bandwidth-exporter:$(VERSION)
MBE_IMG = $(DOCKER_REGISTRY)/$(MBE_IMG_NAME)
13+
14+
# Compile the exporter into bin/memory-bandwidth-exporter.
# Declared phony so a file or directory named `build` cannot mask the target.
.PHONY: build
build:
	@mkdir -p bin
	@echo "Building memory-bandwidth-exporter binary..."
	$(GO_CMD) build -o bin/memory-bandwidth-exporter $(GOFLAGS) cmd/main.go
18+
19+
# Build the container image tagged $(MBE_IMG) from the Dockerfile in this directory.
.PHONY: docker.build
docker.build:
	@echo "Building memory-bandwidth-exporter Docker image..."
	$(CONTAINER_TOOL) build -t ${MBE_IMG} -f Dockerfile .
22+
23+
# Push the image tagged $(MBE_IMG) to its registry.
.PHONY: docker.push
docker.push:
	@echo "Push memory-bandwidth-exporter Docker image..."
	$(CONTAINER_TOOL) push ${MBE_IMG}
26+
27+
# Remove the bin/ output directory.
# Declared phony so a file named `clean` cannot make this a no-op.
.PHONY: clean
clean:
	@echo "Cleaning up..."
	rm -rf bin
30+
31+
# Replace the MBE_IMG placeholder in the deployment manifest with the value of
# $(MBE_IMG). The manifest is edited in place.
# `|` is the sed delimiter because the image reference contains `/`; a backslash
# is not a valid delimiter for the POSIX `s` command.
.PHONY: change_img
change_img:
	sed -i "s|MBE_IMG|$(MBE_IMG)|g" config/manifests/memory-bandwidth-exporter.yaml
33+
34+
# Run the Go tests for every package under this directory.
# Declared phony so a `test` file or directory cannot make make report "up to date".
.PHONY: test
test:
	@echo "Running tests..."
	$(GO_CMD) test ./...
37+
38+
.PHONY: lint
lint: golangci-lint ## Run golangci-lint linter
	@echo "Running linters...${GOLANGCI_LINT}"
	$(GOLANGCI_LINT) run ./...
41+
42+
.PHONY: lint-fix
lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
	$(GOLANGCI_LINT) run --fix ./...
44+
45+
##@ Dependencies

## Location to install dependencies to
# $(CURDIR) is used instead of $(shell pwd) so no subprocess is forked each time
# this recursively-expanded variable is referenced.
LOCALBIN ?= $(CURDIR)/bin
$(LOCALBIN):
	mkdir -p $(LOCALBIN)

## Tool Binaries
# Kept recursive (`=`) because GOLANGCI_LINT_VERSION is assigned further down.
GOLANGCI_LINT = $(LOCALBIN)/golangci-lint-$(GOLANGCI_LINT_VERSION)

## Tool Versions
GOLANGCI_LINT_VERSION ?= v1.59.1

.PHONY: golangci-lint
golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary.
# $(LOCALBIN) is an order-only prerequisite: the directory must exist, but a change
# to its timestamp must not mark the installed tool as out of date.
$(GOLANGCI_LINT): | $(LOCALBIN)
	$(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint,${GOLANGCI_LINT_VERSION})
62+
63+
# go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist
# $1 - target path with name of binary (ideally with version)
# $2 - package url which can be installed
# $3 - specific version of package
#
# The package is installed into $(LOCALBIN) and then moved to $1. The source name
# for the move is $1 with a trailing "-$3" removed; $(patsubst) does this as a
# literal match, so dots in the version are not treated as regex wildcards.
# Nothing runs when $1 already exists.
define go-install-tool
@[ -f "$(1)" ] || { \
set -e; \
package=$(2)@$(3) ;\
echo "Downloading $${package}" ;\
GOBIN="$(LOCALBIN)" $(GO_CMD) install $${package} ;\
mv "$(patsubst %-$(3),%,$(1))" "$(1)" ;\
}
endef
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# memory bandwidth exporter
2+
3+
The pod/container-grained memory bandwidth exporter provides users with memory bandwidth metrics for their running containers. The metrics include llc_occupancy, mbm_local_bytes, mbm_total_bytes, CPU utilization and memory usage, and the metrics have been processed. In addition to container-level metrics, it also provides class-level and socket-level metrics. Users can configure the list of metrics to be collected. It serves as an exporter that can be connected to Prometheus-like observability tools, and it can also be used as a telemetry provider.
4+
5+
The memory bandwidth exporter makes use of state-of-the-art technologies such as NRI to build a resource-efficient and well-maintained solution. This solution provides memory bandwidth observability for OPEA microservices and lays the groundwork for better scaling and auto-scaling of OPEA. It can also be deployed separately in end-user environments, supporting any case where memory bandwidth metrics are required.
6+
7+
The memory bandwidth exporter currently only supports Intel platforms with RDT, and will fail on other platforms. We will add node feature discovery in the future.
8+
9+
## Setup
10+
11+
### Enable NRI in Containerd
12+
13+
```sh
14+
# download containerd binary, containerd version v1.7.0 or higher is required
15+
$ wget https://github.com/containerd/containerd/releases/download/v1.7.0/containerd-1.7.0-linux-amd64.tar.gz
16+
17+
# stop running containerd
18+
$ sudo systemctl stop containerd
19+
20+
# replace old containerd
21+
$ sudo tar Cxzvf /usr/local containerd-1.7.0-linux-amd64.tar.gz
22+
23+
# enable NRI in containerd
24+
# add an item in /etc/containerd/config.toml
25+
[plugins."io.containerd.nri.v1.nri"]
26+
disable = false
27+
disable_connections = false
28+
plugin_config_path = "/etc/containerd/certs.d"
29+
plugin_path = "/opt/nri/plugins"
30+
socket_path = "/var/run/nri/nri.sock"
31+
config_file = "/etc/nri/nri.conf"
32+
33+
# restart containerd
34+
$ sudo systemctl start containerd
35+
$ sudo systemctl status containerd
36+
37+
# test nri
38+
$ git clone https://github.com/containerd/nri
39+
$ cd nri
40+
$ make
41+
$ ./build/bin/logger -idx 00
42+
```
43+
44+
### Enable RDT
45+
46+
Mount resctrl to the directory `/sys/fs/resctrl`:
47+
48+
```sh
49+
$ sudo mount -t resctrl resctrl /sys/fs/resctrl
50+
```
51+
52+
### Setup memory bandwidth exporter
53+
54+
Before setup, you need to configure the runc hook:
55+
56+
```sh
57+
$ ./config/config.sh
58+
```
59+
60+
#### How to build the binary and setup?
61+
62+
```sh
63+
$ make build
64+
$ sudo ./bin/memory-bandwidth-exporter
65+
# e.g., sudo ./bin/memory-bandwidth-exporter --collector.node.name=<node_name> --collector.container.namespaceWhiteList="calico-apiserver,calico-system,kube-system,tigera-operator"
66+
67+
# get memory bandwidth metrics
68+
$ curl http://localhost:9100/metrics
69+
```
70+
71+
#### How to build the docker image and setup?
72+
73+
```sh
74+
$ make docker.build
75+
$ sudo docker run \
76+
-e NODE_NAME=<node_name> \
77+
-e NAMESPACE_WHITELIST="calico-apiserver,calico-system,kube-system,tigera-operator" \
78+
--mount type=bind,source=/etc/containers/oci/hooks.d/,target=/etc/containers/oci/hooks.d/ \
79+
--privileged \
80+
--cgroupns=host \
81+
--pid=host \
82+
--mount type=bind,source=/usr/,target=/usr/ \
83+
--mount type=bind,source=/sys/fs/resctrl/,target=/sys/fs/resctrl/ \
84+
--mount type=bind,source=/var/run/nri/,target=/var/run/nri/ \
85+
-d -p 9100:9100 \
86+
--name=memory-bandwidth-exporter \
87+
opea/memory-bandwidth-exporter:latest
88+
89+
# get memory bandwidth metrics
90+
$ curl http://localhost:9100/metrics
91+
```
92+
93+
#### How to deploy on the K8s cluster?
94+
95+
Build and push your image to the location specified by `MBE_IMG`, and apply manifest:
96+
97+
```sh
98+
$ make docker.build docker.push MBE_IMG=<some-registry>/opea/memory-bandwidth-exporter:<tag>
99+
$ make change_img MBE_IMG=<some-registry>/opea/memory-bandwidth-exporter:<tag>
100+
# If namespace system does not exist, create it.
101+
$ kubectl create ns system
102+
$ kubectl apply -f config/manifests/memory-bandwidth-exporter.yaml
103+
```
104+
105+
Check the installation result:
106+
107+
```sh
108+
kubectl get pods -n system
109+
NAME READY STATUS RESTARTS AGE
110+
memory-bandwidth-exporter-zxhdl 1/1 Running 0 3m
111+
```
112+
113+
Get the memory bandwidth metrics:
114+
115+
```sh
116+
$ curl http://<memory_bandwidth_exporter_container_ip>:9100/metrics
117+
```
118+
119+
#### How to delete binary?
120+
121+
```sh
122+
$ make clean
123+
```
124+
125+
## More flags about memory bandwidth exporter
126+
127+
There are some flags to help users better use memory bandwidth exporter:
128+
129+
```sh
130+
-h, --[no-]help Show context-sensitive help (also try --help-long and --help-man).
131+
--collector.node.name="" Give node name.
132+
--collector.container.namespaceWhiteList="" Filter out containers whose namespaces belong to the namespace whitelist, namespaces separated by commas, like "xx,xx,xx".
133+
--collector.container.monTimes=10 Scan the pids of containers created before the exporter starts to prevent the loss of pids.
134+
--collector.container.metrics="all" Enable container collector metrics.
135+
--collector.class.metrics="none" Enable class collector metrics.
136+
--collector.node.metrics="none" Enable node collector metrics.
137+
--web.telemetry-path="/metrics" Path under which to expose metrics.
138+
--[no-]web.disable-exporter-metrics Exclude metrics about the exporter itself (promhttp_*, process_*, go_*).
139+
--web.max-requests=40 Maximum number of parallel scrape requests. Use 0 to disable.
140+
--runtime.gomaxprocs=1 The target number of CPUs Go will run on (GOMAXPROCS) ($GOMAXPROCS)
141+
--[no-]web.systemd-socket Use systemd socket activation listeners instead of port listeners (Linux only).
142+
--web.listen-address=:9100 ... Addresses on which to expose metrics and web interface. Repeatable for multiple addresses.
143+
--web.config.file="" Path to configuration file that can enable TLS or authentication. See: https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md
144+
--collector.interval=3s memory bandwidth exporter collect metrics interval
145+
--NRIplugin.name="mb-nri-plugin" Plugin name to register to NRI
146+
--NRIplugin.idx="11" Plugin index to register to NRI
147+
--[no-]disableWatch Disable watching hook directories for new hooks
148+
--log.level=info Only log messages with the given severity or above. One of: [debug, info, warn, error]
149+
--log.format=logfmt Output format of log messages. One of: [logfmt, json]
150+
--[no-]version Show application version.
151+
```

0 commit comments

Comments
 (0)