Skip to content
This repository was archived by the owner on Apr 22, 2024. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,5 @@ replace (
k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.25.0
k8s.io/sample-cli-plugin => k8s.io/sample-cli-plugin v0.25.0
k8s.io/sample-controller => k8s.io/sample-controller v0.25.0
volcano.sh/apis => github.com/predibase/volcano-apis v0.0.0-20230208222101-1946093f2249
)
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/predibase/volcano-apis v0.0.0-20230208222101-1946093f2249 h1:IKdy8kADUHn8OPAjatWMxCZXfHWiEWk65QXPlJ4SsQs=
github.com/predibase/volcano-apis v0.0.0-20230208222101-1946093f2249/go.mod h1:xe38GChdXXam/g/FkQXIsR0vhqp4twoZdY2gaGkEP24=
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
Expand Down Expand Up @@ -831,5 +833,3 @@ sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo=
sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8=
stathat.com/c/consistent v1.0.0 h1:ezyc51EGcRPJUxfHGSgJjWzJdj3NiMU9pNfLNGiXV0c=
stathat.com/c/consistent v1.0.0/go.mod h1:QkzMWzcbB+yQBL2AttO6sgsQS/JSTapcDISJalmCDS0=
volcano.sh/apis v1.6.0-alpha.0.0.20230214095022-ad92502b1a57 h1:aQhXCHqcaOtkCtFn3XSniKWlIb1xxwJ8G7SQeHkZ6vM=
volcano.sh/apis v1.6.0-alpha.0.0.20230214095022-ad92502b1a57/go.mod h1:h+xbUpkjfRaHjktAi8h+7JNnNahjwhRSgpN9FUUwNXQ=
15 changes: 7 additions & 8 deletions installer/helm/chart/volcano/config/volcano-scheduler.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@ actions: "enqueue, allocate, backfill"
tiers:
- plugins:
- name: priority
- name: gang
enablePreemptable: false
# - name: gang
- name: conformance
- plugins:
- name: overcommit
- name: drf
enablePreemptable: false
- name: fit
# - name: proportion
# - name: overcommit
# - name: drf
- name: predicates
- name: proportion
- name: nodeorder
- name: binpack
# - name: nodeorder
# - name: binpack
10 changes: 10 additions & 0 deletions installer/helm/chart/volcano/templates/admission.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,15 @@ kind: ServiceAccount
metadata:
name: {{ .Release.Name }}-admission
namespace: {{ .Release.Namespace }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: {{ .Release.Name }}-admission
annotations:
"helm.sh/hook": pre-install,pre-upgrade
rules:
- apiGroups: [""]
resources: ["configmaps"]
Expand Down Expand Up @@ -49,6 +53,8 @@ kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: {{ .Release.Name }}-admission-role
annotations:
"helm.sh/hook": pre-install,pre-upgrade
subjects:
- kind: ServiceAccount
name: {{ .Release.Name }}-admission
Expand Down Expand Up @@ -139,6 +145,10 @@ metadata:
namespace: {{ .Release.Namespace }}
labels:
app: volcano-admission-init
annotations:
"helm.sh/hook": pre-install
"helm.sh/hook-weight": "0"
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
backoffLimit: 3
template:
Expand Down
2 changes: 1 addition & 1 deletion installer/helm/chart/volcano/templates/scheduler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ spec:
- --scheduler-conf=/volcano.scheduler/{{base .Values.basic.scheduler_config_file}}
- --enable-healthz=true
- --enable-metrics=true
- -v=3
- -v=5
- 2>&1
imagePullPolicy: {{ .Values.basic.image_pull_policy }}
volumeMounts:
Expand Down
8 changes: 4 additions & 4 deletions installer/helm/chart/volcano/values.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
basic:
controller_image_name: "volcanosh/vc-controller-manager"
scheduler_image_name: "volcanosh/vc-scheduler"
admission_image_name: "volcanosh/vc-webhook-manager"
image_tag_version: "sha-7e1b436"
controller_image_name: "public.ecr.aws/n9u9x7z1/volcano-controller-manager"
scheduler_image_name: "public.ecr.aws/n9u9x7z1/volcano-scheduler"
admission_image_name: "public.ecr.aws/n9u9x7z1/volcano-webhook-manager"
admission_secret_name: "volcano-admission-secret"
admission_config_file: "config/volcano-admission.conf"
scheduler_config_file: "config/volcano-scheduler.conf"
image_pull_secret: ""
image_pull_policy: "IfNotPresent"
image_tag_version: "latest"
admission_port: 8443
custom:
metrics_enable: false
Expand Down
7 changes: 5 additions & 2 deletions pkg/scheduler/actions/allocate/allocate.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,10 +177,13 @@ func (alloc *Action) Execute(ssn *framework.Session) {
break
}

predicateNodes, fitErrors := ph.PredicateNodes(task, allNodes, predicateFn, true)
predicateNodes, fitErrors := ph.PredicateNodes(task, allNodes, predicateFn, false)
if len(predicateNodes) == 0 {
klog.V(3).Infof("PredicateNodes for task %s/%s found: %v", task.Namespace, task.Name, fitErrors.Error())
job.NodesFitErrors[task.UID] = fitErrors
break
// Don't break the loop here. We need to perform this check for all tasks to ensure they have proper NodesFitErrors set (if applicable)
// so that the right pod condition is set for the cluster autoscaler.
continue
}

var candidateNodes []*api.NodeInfo
Expand Down
11 changes: 5 additions & 6 deletions pkg/scheduler/api/job_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,6 @@ func (ji *JobInfo) TaskSchedulingReason(tid TaskID) (reason string, msg string)
ctx = *taskInfo.LastTransaction
}

msg = ji.JobFitErrors
switch status := ctx.Status; status {
case Allocated, Pipelined:
// Pod is schedulable
Expand All @@ -662,13 +661,13 @@ func (ji *JobInfo) TaskSchedulingReason(tid TaskID) (reason string, msg string)
return PodReasonSchedulable, msg
case Pending:
if fe := ji.NodesFitErrors[tid]; fe != nil {
// Pod is not schedulable
return PodReasonUnschedulable, fe.Error()
// Pod is not schedulable on currently available nodes. We want to set 'Unschedulable' as the reason to trigger the cluster autoscaler.
return PodReasonUnschedulable, fmt.Sprintf("fiterr: %s", fe.Error())
}
// Pod is not scheduled yet, keep UNSCHEDULABLE as the reason to support cluster autoscaler
return PodReasonUnschedulable, msg
// Pod hasn't cleared the enqueue phase yet. Use the 'Ineligible' reason to bypass the cluster autoscaler.
return PodReasonIneligible, "pod is not yet eligible to schedule"
default:
return status.String(), msg
return status.String(), ji.JobFitErrors
}
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/scheduler/api/unschedule_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ const (

// These are reasons for a pod's transition to a condition.
const (
// PodReasonIneligible reason means that the pod is not currently eligible for scheduling on a node,
// for example due to queue constraints.
PodReasonIneligible = "Ineligible"
// PodReasonUnschedulable reason in PodScheduled PodCondition means that the scheduler
// can't schedule the pod right now, for example due to insufficient resources in the cluster.
// It can also mean that the scheduler skips scheduling the pod which left the pod `Undetermined`,
Expand Down
4 changes: 3 additions & 1 deletion pkg/scheduler/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ func podConditionHaveUpdate(status *v1.PodStatus, condition *v1.PodCondition) bo

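// UpdatePodCondition applies condition to the pod's status and, when podutil.UpdatePodCondition returns true, writes the pod status back via the kube client's UpdateStatus.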
func (su *defaultStatusUpdater) UpdatePodCondition(pod *v1.Pod, condition *v1.PodCondition) (*v1.Pod, error) {
klog.V(3).Infof("Updating pod condition for %s/%s to (%s==%s)", pod.Namespace, pod.Name, condition.Type, condition.Status)
klog.V(3).Infof("Updating pod condition for %s/%s to (%s==%s), reason %s, msg %s", pod.Namespace, pod.Name, condition.Type, condition.Status, condition.Reason, condition.Message)
if podutil.UpdatePodCondition(&pod.Status, condition) {
return su.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
}
Expand Down Expand Up @@ -869,6 +869,8 @@ func (sc *SchedulerCache) taskUnschedulable(task *schedulingapi.TaskInfo, reason
Message: message,
}

klog.V(4).Infof("task unscheduleable %s/%s, reason: %s message: %s", pod.Namespace, pod.Name, reason, message)

if podConditionHaveUpdate(&pod.Status, condition) {
pod = pod.DeepCopy()

Expand Down
5 changes: 3 additions & 2 deletions pkg/scheduler/framework/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,13 +265,14 @@ func jobStatus(ssn *Session, jobInfo *api.JobInfo) scheduling.PodGroupStatus {
}
}

// If there're enough allocated resource, it's running
if int32(allocated) >= jobInfo.PodGroup.Spec.MinMember {
if int32(len(jobInfo.TaskStatusIndex[api.Running])) >= jobInfo.PodGroup.Spec.MinMember {
status.Phase = scheduling.PodGroupRunning
// If all allocated tasks have succeeded, it's completed
if len(jobInfo.TaskStatusIndex[api.Succeeded]) == allocated {
status.Phase = scheduling.PodGroupCompleted
}
} else if int32(allocated) >= jobInfo.PodGroup.Spec.MinMember {
status.Phase = scheduling.PodGroupAllocated
} else if jobInfo.PodGroup.Status.Phase != scheduling.PodGroupInqueue {
status.Phase = scheduling.PodGroupPending
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/scheduler/plugins/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"volcano.sh/volcano/pkg/scheduler/plugins/conformance"
"volcano.sh/volcano/pkg/scheduler/plugins/drf"
"volcano.sh/volcano/pkg/scheduler/plugins/extender"
"volcano.sh/volcano/pkg/scheduler/plugins/fit"
"volcano.sh/volcano/pkg/scheduler/plugins/gang"
"volcano.sh/volcano/pkg/scheduler/plugins/nodeorder"
"volcano.sh/volcano/pkg/scheduler/plugins/numaaware"
Expand Down Expand Up @@ -55,6 +56,7 @@ func init() {
framework.RegisterPluginBuilder(cdp.PluginName, cdp.New)
framework.RegisterPluginBuilder(rescheduling.PluginName, rescheduling.New)
framework.RegisterPluginBuilder(usage.PluginName, usage.New)
framework.RegisterPluginBuilder(fit.PluginName, fit.New)

// Plugins for Queues
framework.RegisterPluginBuilder(proportion.PluginName, proportion.New)
Expand Down
Loading