Skip to content

Commit 54bb1bc

Browse files
authored
Support watcher pod rbac in kvcache controller (#1071)
* Support watcher pod rbac in kv cache controller
* Update hpkv key in redis
* Grant pods/exec role to controller-manager
* Add prometheus scrape configuration in controller and watcher

Signed-off-by: Jiaxin Shan <[email protected]>
1 parent c12ff66 commit 54bb1bc

File tree

11 files changed

+271
-126
lines changed

11 files changed

+271
-126
lines changed

cmd/kvcache-watcher/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ import (
5656
const KVCacheLabelKeyIdentifier = "kvcache.orchestration.aibrix.ai/name"
5757
const KVCacheLabelKeyRole = "kvcache.orchestration.aibrix.ai/role"
5858
const KVCacheLabelValueRoleCache = "cache"
59-
const HPKVRedisNodeMemberKey = "hpkv_nodes"
59+
const HPKVRedisNodeMemberKey = "hpkv_cluster_metadata"
6060
const InfiniStoreRedisNodeMemberKey = "kvcache_nodes"
6161

6262
const networkStatusAnnotation = "k8s.volcengine.com/network-status"

config/manager/manager.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ metadata:
6767
control-plane: controller-manager
6868
app.kubernetes.io/name: aibrix
6969
app.kubernetes.io/managed-by: kustomize
70+
annotations:
71+
prometheus.io/scrape: "true"
72+
prometheus.io/port: "8080"
73+
prometheus.io/path: "/metrics"
7074
name: controller-manager-metrics-service
7175
namespace: system
7276
spec:

config/rbac/controller-manager/role.yaml

Lines changed: 43 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,37 @@ rules:
1212
- create
1313
- patch
1414
- update
15+
- apiGroups:
16+
- ""
17+
resources:
18+
- secrets
19+
verbs:
20+
- get
21+
- list
22+
- update
23+
- watch
24+
- apiGroups:
25+
- admissionregistration.k8s.io
26+
resources:
27+
- mutatingwebhookconfigurations
28+
- validatingwebhookconfigurations
29+
verbs:
30+
- get
31+
- list
32+
- update
33+
- watch
34+
- apiGroups:
35+
- apiextensions.k8s.io
36+
resources:
37+
- customresourcedefinitions
38+
verbs:
39+
- get
40+
- list
1541
- apiGroups:
1642
- apps
1743
resources:
1844
- deployments
45+
- statefulsets
1946
verbs:
2047
- create
2148
- delete
@@ -28,30 +55,11 @@ rules:
2855
- apps
2956
resources:
3057
- deployments/status
58+
- statefulsets/status
3159
verbs:
3260
- get
3361
- patch
3462
- update
35-
- apiGroups:
36-
- apps
37-
resources:
38-
- statefulsets
39-
verbs:
40-
- create
41-
- delete
42-
- get
43-
- list
44-
- patch
45-
- update
46-
- watch
47-
- apiGroups:
48-
- apps
49-
resources:
50-
- statefulsets/status
51-
verbs:
52-
- get
53-
- patch
54-
- update
5563
- apiGroups:
5664
- autoscaling
5765
resources:
@@ -106,7 +114,9 @@ rules:
106114
- apiGroups:
107115
- ""
108116
resources:
117+
- pods/exec
109118
- pods/status
119+
- services
110120
verbs:
111121
- create
112122
- delete
@@ -118,14 +128,12 @@ rules:
118128
- apiGroups:
119129
- ""
120130
resources:
121-
- services
131+
- serviceaccounts
122132
verbs:
123133
- create
124134
- delete
125135
- get
126136
- list
127-
- patch
128-
- update
129137
- watch
130138
- apiGroups:
131139
- ""
@@ -159,17 +167,6 @@ rules:
159167
- gateway.networking.k8s.io
160168
resources:
161169
- httproutes
162-
verbs:
163-
- create
164-
- delete
165-
- get
166-
- list
167-
- patch
168-
- update
169-
- watch
170-
- apiGroups:
171-
- gateway.networking.k8s.io
172-
resources:
173170
- referencegrants
174171
verbs:
175172
- create
@@ -209,57 +206,7 @@ rules:
209206
- orchestration.aibrix.ai
210207
resources:
211208
- kvcaches
212-
verbs:
213-
- create
214-
- delete
215-
- get
216-
- list
217-
- patch
218-
- update
219-
- watch
220-
- apiGroups:
221-
- orchestration.aibrix.ai
222-
resources:
223-
- kvcaches/finalizers
224-
verbs:
225-
- update
226-
- apiGroups:
227-
- orchestration.aibrix.ai
228-
resources:
229-
- kvcaches/status
230-
verbs:
231-
- get
232-
- patch
233-
- update
234-
- apiGroups:
235-
- orchestration.aibrix.ai
236-
resources:
237209
- rayclusterfleets
238-
verbs:
239-
- create
240-
- delete
241-
- get
242-
- list
243-
- patch
244-
- update
245-
- watch
246-
- apiGroups:
247-
- orchestration.aibrix.ai
248-
resources:
249-
- rayclusterfleets/finalizers
250-
verbs:
251-
- update
252-
- apiGroups:
253-
- orchestration.aibrix.ai
254-
resources:
255-
- rayclusterfleets/status
256-
verbs:
257-
- get
258-
- patch
259-
- update
260-
- apiGroups:
261-
- orchestration.aibrix.ai
262-
resources:
263210
- rayclusterreplicasets
264211
verbs:
265212
- create
@@ -272,12 +219,16 @@ rules:
272219
- apiGroups:
273220
- orchestration.aibrix.ai
274221
resources:
222+
- kvcaches/finalizers
223+
- rayclusterfleets/finalizers
275224
- rayclusterreplicasets/finalizers
276225
verbs:
277226
- update
278227
- apiGroups:
279228
- orchestration.aibrix.ai
280229
resources:
230+
- kvcaches/status
231+
- rayclusterfleets/status
281232
- rayclusterreplicasets/status
282233
verbs:
283234
- get
@@ -310,28 +261,23 @@ rules:
310261
- patch
311262
- update
312263
- apiGroups:
313-
- ""
264+
- rbac.authorization.k8s.io
314265
resources:
315-
- secrets
266+
- rolebindings
316267
verbs:
268+
- create
269+
- delete
317270
- get
318271
- list
319-
- update
320272
- watch
321273
- apiGroups:
322-
- admissionregistration.k8s.io
274+
- rbac.authorization.k8s.io
323275
resources:
324-
- mutatingwebhookconfigurations
325-
- validatingwebhookconfigurations
276+
- roles
326277
verbs:
278+
- create
279+
- delete
327280
- get
328281
- list
329282
- update
330283
- watch
331-
- apiGroups:
332-
- apiextensions.k8s.io
333-
resources:
334-
- customresourcedefinitions
335-
verbs:
336-
- get
337-
- list

config/standalone/kv-cache-controller/patch.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ spec:
1212
- --leader-elect
1313
- --leader-election-id=aibrix-kv-cache-controller
1414
- --health-probe-bind-address=:8081
15-
- --metrics-bind-address=0
15+
- --metrics-bind-address=:8080
1616
- --controllers=kv-cache-controller
1717
- --disable-webhook

pkg/controller/kvcache/backends/common.go

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
orchestrationv1alpha1 "github.com/vllm-project/aibrix/api/orchestration/v1alpha1"
2323
"github.com/vllm-project/aibrix/pkg/constants"
2424
corev1 "k8s.io/api/core/v1"
25+
rbacv1 "k8s.io/api/rbac/v1"
2526
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2627
"k8s.io/apimachinery/pkg/util/intstr"
2728
)
@@ -113,3 +114,75 @@ func buildRedisService(kvCache *orchestrationv1alpha1.KVCache) *corev1.Service {
113114

114115
return svc
115116
}
117+
118+
// buildServiceAccount returns a ServiceAccount object named after the KVCache, placed in the KVCache's namespace and labeled with the KVCache identifier and the cache role; it only builds the object and does not create it in the cluster.
119+
func buildServiceAccount(kvCache *orchestrationv1alpha1.KVCache) *corev1.ServiceAccount {
120+
sa := &corev1.ServiceAccount{
121+
ObjectMeta: metav1.ObjectMeta{
122+
Name: kvCache.Name,
123+
Namespace: kvCache.Namespace,
124+
Labels: map[string]string{
125+
constants.KVCacheLabelKeyIdentifier: kvCache.Name,
126+
constants.KVCacheLabelKeyRole: constants.KVCacheLabelValueRoleCache,
127+
},
128+
},
129+
}
130+
131+
return sa
132+
}
133+
134+
// buildRole returns a Role object named after the KVCache, in the KVCache's namespace, that allows get/list/watch on pods and create on pods/exec in the core API group; it only builds the object and does not create it in the cluster.
135+
func buildRole(kvCache *orchestrationv1alpha1.KVCache) *rbacv1.Role {
136+
role := &rbacv1.Role{
137+
ObjectMeta: metav1.ObjectMeta{
138+
Name: kvCache.Name,
139+
Namespace: kvCache.Namespace,
140+
Labels: map[string]string{
141+
constants.KVCacheLabelKeyIdentifier: kvCache.Name,
142+
constants.KVCacheLabelKeyRole: constants.KVCacheLabelValueRoleCache,
143+
},
144+
},
145+
Rules: []rbacv1.PolicyRule{
146+
{
147+
APIGroups: []string{""},
148+
Resources: []string{"pods"},
149+
Verbs: []string{"get", "list", "watch"},
150+
},
151+
{
152+
APIGroups: []string{""},
153+
Resources: []string{"pods/exec"},
154+
Verbs: []string{"create"},
155+
},
156+
},
157+
}
158+
159+
return role
160+
}
161+
162+
// buildRoleBinding returns a RoleBinding object named after the KVCache that binds the Role named kvCache.Name to the ServiceAccount named kvCache.Name in the KVCache's namespace; it only builds the object and does not create it in the cluster.
163+
func buildRoleBinding(kvCache *orchestrationv1alpha1.KVCache) *rbacv1.RoleBinding {
164+
rb := &rbacv1.RoleBinding{
165+
ObjectMeta: metav1.ObjectMeta{
166+
Name: kvCache.Name,
167+
Namespace: kvCache.Namespace,
168+
Labels: map[string]string{
169+
constants.KVCacheLabelKeyIdentifier: kvCache.Name,
170+
constants.KVCacheLabelKeyRole: constants.KVCacheLabelValueRoleCache,
171+
},
172+
},
173+
Subjects: []rbacv1.Subject{
174+
{
175+
Kind: rbacv1.ServiceAccountKind,
176+
Name: kvCache.Name,
177+
Namespace: kvCache.Namespace,
178+
},
179+
},
180+
RoleRef: rbacv1.RoleRef{
181+
APIGroup: rbacv1.GroupName,
182+
Kind: "Role",
183+
Name: kvCache.Name,
184+
},
185+
}
186+
187+
return rb
188+
}

pkg/controller/kvcache/backends/distributed.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,18 @@ func (r *DistributedReconciler) Reconcile(ctx context.Context, kvCache *orchestr
6060
return reconcile.Result{}, err
6161
}
6262

63+
if err := r.reconcileWatcherPodServiceAccount(ctx, r.Backend.BuildWatcherPodServiceAccount(kvCache)); err != nil {
64+
return reconcile.Result{}, err
65+
}
66+
67+
if err := r.reconcileWatcherPodRole(ctx, r.Backend.BuildWatcherPodRole(kvCache)); err != nil {
68+
return reconcile.Result{}, err
69+
}
70+
71+
if err := r.reconcileWatcherPodRoleBinding(ctx, r.Backend.BuildWatcherPodRoleBinding(kvCache)); err != nil {
72+
return reconcile.Result{}, err
73+
}
74+
6375
// Handle the kvCache StatefulSet built by the configured backend
6476
if err := r.ReconcileStatefulsetObject(ctx, r.Backend.BuildCacheStatefulSet(kvCache)); err != nil {
6577
return ctrl.Result{}, err

pkg/controller/kvcache/backends/distributed_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222

2323
appsv1 "k8s.io/api/apps/v1"
2424
corev1 "k8s.io/api/core/v1"
25+
rbacv1 "k8s.io/api/rbac/v1"
2526

2627
"github.com/stretchr/testify/assert"
2728
"github.com/vllm-project/aibrix/api/orchestration/v1alpha1"
@@ -181,6 +182,9 @@ type mockBackend struct {
181182
watcher *corev1.Pod
182183
svc *corev1.Service
183184
sts *appsv1.StatefulSet
185+
sa *corev1.ServiceAccount
186+
role *rbacv1.Role
187+
rb *rbacv1.RoleBinding
184188
}
185189

186190
func (m mockBackend) Name() string {
@@ -199,6 +203,18 @@ func (m mockBackend) BuildMetadataService(*v1alpha1.KVCache) *corev1.Service {
199203
return m.svc
200204
}
201205

206+
// BuildWatcherPodServiceAccount returns the ServiceAccount stored on the mock (m.sa); the KVCache argument is ignored.
func (m mockBackend) BuildWatcherPodServiceAccount(*v1alpha1.KVCache) *corev1.ServiceAccount {
207+
return m.sa
208+
}
209+
210+
// BuildWatcherPodRole returns the Role stored on the mock (m.role); the KVCache argument is ignored.
func (m mockBackend) BuildWatcherPodRole(*v1alpha1.KVCache) *rbacv1.Role {
211+
return m.role
212+
}
213+
214+
// BuildWatcherPodRoleBinding returns the RoleBinding stored on the mock (m.rb); the KVCache argument is ignored.
func (m mockBackend) BuildWatcherPodRoleBinding(*v1alpha1.KVCache) *rbacv1.RoleBinding {
215+
return m.rb
216+
}
217+
202218
func (m mockBackend) BuildWatcherPod(*v1alpha1.KVCache) *corev1.Pod {
203219
return m.watcher
204220
}

0 commit comments

Comments
 (0)