-
Notifications
You must be signed in to change notification settings - Fork 950
feat: KEP-2437 - PodGroup Creation for Volcano Scheduler #2729
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
8867e71
0cd20fc
2e911ef
ed0425f
fca9f40
8ab5d71
bd60987
ec6c5a8
f182b65
ffa27f9
f8ea7dd
4fa3b6a
1a247da
73bb476
b92383a
fe6ffd0
1c33eba
7cbad55
cdd9309
8e68bba
79618cb
8814111
114f11b
c8fa0fd
f8d8912
c084ac8
f65d7a7
aa90695
83c0585
6f24588
d2fd159
e95a158
d582a81
fe19174
deafc5d
d8bae4a
cf578a2
a332de4
a4f09e6
71996b6
99e0462
583b1d6
e60a5a9
ef25760
28bcd41
e2b1d89
d037828
aaf47a2
9c1c5e0
8497e6b
67bbedb
04aa3e8
e044c4f
167a595
d52581e
a8be8ca
8756dc7
7ba807e
6075a0c
132bb16
7065606
e6c7646
2eb8629
4ede544
ef69e38
63393c5
2fad831
f0e5a4c
9c1eae1
e185049
497a0b6
4030ce5
c886583
451bf6e
49bd5b9
4175fef
1f05304
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -21,6 +21,7 @@ import ( | |||||||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||||||
| "k8s.io/apimachinery/pkg/util/intstr" | ||||||||||
| jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" | ||||||||||
| volcanov1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" | ||||||||||
| ) | ||||||||||
|
|
||||||||||
| const ( | ||||||||||
|
|
@@ -134,7 +135,8 @@ type PodGroupPolicySource struct { | |||||||||
| // Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling. | ||||||||||
| Coscheduling *CoschedulingPodGroupPolicySource `json:"coscheduling,omitempty"` | ||||||||||
|
|
||||||||||
| // TODO (andreyvelich): Add support for Volcano gang-scheduler. | ||||||||||
| // Volcano plugin for gang-scheduling. | ||||||||||
| Volcano *VolcanoPodGroupPolicySource `json:"volcano,omitempty"` | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // CoschedulingPodGroupPolicySource represents configuration for coscheduling plugin. | ||||||||||
|
|
@@ -147,6 +149,25 @@ type CoschedulingPodGroupPolicySource struct { | |||||||||
| ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // VolcanoPodGroupPolicySource represents configuration for the Volcano gang-scheduler. | ||||||||||
| type VolcanoPodGroupPolicySource struct { | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, would like to why other fields are not present, like queue, minMember, minResource?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As introduced here, we'll use annotation to inject trainer/docs/proposals/2437-volcano-scheduler/README.md Lines 183 to 184 in d239d6a
For trainer/docs/proposals/2437-volcano-scheduler/README.md Lines 179 to 180 in d239d6a
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Great, so users cannot set a custom minmember, it will always be equal to the number of replicas? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Doris-xm currently volcano uses 'scheduling.k8s.io/group-name' as group name annotation, not 'scheduling.volcano.sh/group-name', as can be seen in https://github.com/volcano-sh/volcano/blob/5007c6b010a44518ec2d946c6d126f2dfdadc980/pkg/controllers/podgroup/pg_controller_handler.go#L189 cc @Monokaix There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Autually also want to know why not put all these attrs in VolcanoPodGroupPolicySource?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As we discussed in the KEP, we want to make it consistent with other schedulers like Coscheduling or Kueue: #2672 (comment) |
||||||||||
| // Queue is the name of the Volcano queue to which the PodGroup will be submitted. | ||||||||||
| // Defaults to the “default” queue, which has the lowest weight. | ||||||||||
| // +kubebuilder:default=default | ||||||||||
| Queue *string `json:"queue,omitempty"` | ||||||||||
|
|
||||||||||
| // PriorityClassName is the name of the Kubernetes PriorityClass to use for the PodGroup. | ||||||||||
| // e.g. system-node-critical, system-cluster-critical. | ||||||||||
| // This field is optional. | ||||||||||
| PriorityClassName *string `json:"priorityClassName,omitempty"` | ||||||||||
|
Doris-xm marked this conversation as resolved.
Outdated
|
||||||||||
|
|
||||||||||
| // NetworkTopology defines the NetworkTopology config, this field works in conjunction with network topology feature and hyperNode CRD. | ||||||||||
| // +kubebuilder:validation:EmbeddedResource | ||||||||||
| // +kubebuilder:pruning:PreserveUnknownFields | ||||||||||
| // +optional | ||||||||||
| NetworkTopology *volcanov1beta1.NetworkTopologySpec `json:"networkTopology,omitempty"` | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // MLPolicy represents configuration for the model trining with ML-specific parameters. | ||||||||||
| // +kubebuilder:validation:XValidation:rule="!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))", message="numNodes should not be set if torch.elasticPolicy is configured" | ||||||||||
| // +kubebuilder:validation:XValidation:rule="!(has(self.torch) && has(self.mpi))", message="Only one of the policy can be configured" | ||||||||||
|
|
||||||||||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,56 @@ | ||||||||||||||||||||||
| /* | ||||||||||||||||||||||
| Copyright 2024 The Kubeflow Authors. | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||||||||||||||||||||||
| you may not use this file except in compliance with the License. | ||||||||||||||||||||||
| You may obtain a copy of the License at | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| Unless required by applicable law or agreed to in writing, software | ||||||||||||||||||||||
| distributed under the License is distributed on an "AS IS" BASIS, | ||||||||||||||||||||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||||||||||||||||||
| See the License for the specific language governing permissions and | ||||||||||||||||||||||
| limitations under the License. | ||||||||||||||||||||||
| */ | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| package volcano | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| import ( | ||||||||||||||||||||||
| "sigs.k8s.io/controller-runtime/pkg/client" | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| trainer "github.com/kubeflow/trainer/v2/pkg/apis/trainer/v1alpha1" | ||||||||||||||||||||||
| ) | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Doris-xm @Electronic-Waste @tenzen-y @astefanutti Can we share indexer across all gang-scheduling plugins ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We'll meet indexer conflict panic if we use the same index key. The root cause is that he framework initializes all registered plugins. Both the
trainer/pkg/runtime/framework/core/framework.go Lines 54 to 55 in c6997fc
Currently, I use different keys to avoid conflicts. trainer/pkg/runtime/framework/plugins/volcano/indexer.go Lines 26 to 27 in 8497e6b
Is there a better solution? @andreyvelich @Electronic-Waste @rudeigerc
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Doris-xm Please can you explore if we can create a single indexer that we can re-use across Volcano and Coscheduling plugin ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I will do that. But still I think the problem is the conflict index key.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've created a single indexer:
But I keep two different keys to avoid conflicts as I stated above. trainer/pkg/runtime/framework/plugins/indexer/indexer.go Lines 26 to 29 in 167a595
Still wonder if there's a better solution. |
||||||||||||||||||||||
| var ( | ||||||||||||||||||||||
| TrainingRuntimeContainerRuntimeClassKey = ".trainingRuntimeSpec.jobSetTemplateSpec.replicatedJobs.podTemplateSpec.runtimeClassName" | ||||||||||||||||||||||
| ClusterTrainingRuntimeContainerRuntimeClassKey = ".clusterTrainingRuntimeSpec.jobSetTemplateSpec.replicatedJobs.podTemplateSpec.runtimeClassName" | ||||||||||||||||||||||
| ) | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| func IndexTrainingRuntimeContainerRuntimeClass(obj client.Object) []string { | ||||||||||||||||||||||
| runtime, ok := obj.(*trainer.TrainingRuntime) | ||||||||||||||||||||||
| if !ok { | ||||||||||||||||||||||
| return nil | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| var runtimeClasses []string | ||||||||||||||||||||||
| for _, rJob := range runtime.Spec.Template.Spec.ReplicatedJobs { | ||||||||||||||||||||||
| if rJob.Template.Spec.Template.Spec.RuntimeClassName != nil { | ||||||||||||||||||||||
| runtimeClasses = append(runtimeClasses, *rJob.Template.Spec.Template.Spec.RuntimeClassName) | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| return runtimeClasses | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| func IndexClusterTrainingRuntimeContainerRuntimeClass(obj client.Object) []string { | ||||||||||||||||||||||
| clRuntime, ok := obj.(*trainer.ClusterTrainingRuntime) | ||||||||||||||||||||||
| if !ok { | ||||||||||||||||||||||
| return nil | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| var runtimeClasses []string | ||||||||||||||||||||||
| for _, rJob := range clRuntime.Spec.Template.Spec.ReplicatedJobs { | ||||||||||||||||||||||
| if rJob.Template.Spec.Template.Spec.RuntimeClassName != nil { | ||||||||||||||||||||||
| runtimeClasses = append(runtimeClasses, *rJob.Template.Spec.Template.Spec.RuntimeClassName) | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| return runtimeClasses | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
Uh oh!
There was an error while loading. Please reload this page.