Skip to content

Commit 57e24a2

Browse files
committed
Watch pod events to detect issues preventing workspace start
Add functionality to check pod events during reconciles, in order to detect unrecoverable states in workspace startup that aren't reflected in the pod's status. Signed-off-by: Angel Misevski <[email protected]>
1 parent bcc3595 commit 57e24a2

File tree

2 files changed

+53
-13
lines changed

2 files changed

+53
-13
lines changed

controllers/workspace/devworkspace_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ type DevWorkspaceReconciler struct {
6868
/////// Required permissions for controller
6969
// +kubebuilder:rbac:groups=apps;extensions,resources=deployments;replicasets,verbs=*
7070
// +kubebuilder:rbac:groups="",resources=pods;serviceaccounts;secrets;configmaps;persistentvolumeclaims,verbs=*
71-
// +kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch
71+
// +kubebuilder:rbac:groups="",resources=namespaces;events,verbs=get;list;watch
7272
// +kubebuilder:rbac:groups="batch",resources=jobs,verbs=get;create;list;watch;update;patch;delete
7373
// +kubebuilder:rbac:groups=admissionregistration.k8s.io,resources=mutatingwebhookconfigurations;validatingwebhookconfigurations,verbs=get;list;watch;create;update;patch;delete
7474
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles;rolebindings;clusterroles;clusterrolebindings,verbs=get;list;watch;create;update

controllers/workspace/provision/deployment.go

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ import (
1818
"fmt"
1919
"strings"
2020

21+
"k8s.io/apimachinery/pkg/fields"
22+
2123
"github.com/devfile/devworkspace-operator/apis/controller/v1alpha1"
2224
"github.com/devfile/devworkspace-operator/controllers/workspace/env"
2325
maputils "github.com/devfile/devworkspace-operator/internal/map"
@@ -41,13 +43,21 @@ import (
4143
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
4244
)
4345

44-
var ContainerFailureStateReasons = []string{
46+
var containerFailureStateReasons = []string{
4547
"CrashLoopBackOff",
4648
"ImagePullBackOff",
4749
"CreateContainerError",
4850
"RunContainerError",
4951
}
5052

53+
// unrecoverablePodEventReasons lists the event reasons that checkPodEvents
// compares against an event's Reason field; a match on an event whose
// involved object is a Pod is reported as an unrecoverable state.
var unrecoverablePodEventReasons = []string{
54+
"FailedMount",
55+
"FailedScheduling",
56+
// NOTE(review): this looks like the prefix of an event message rather than
// an event reason -- confirm it can ever equal ev.Reason.
"MountVolume.SetUp failed",
57+
"FailedCreate",
58+
"ReplicaSetCreateError",
59+
}
60+
5161
// DeploymentProvisioningStatus is the result returned by
// SyncDeploymentToCluster. It embeds ProvisioningStatus, so that type's
// fields are accessible directly on this struct.
type DeploymentProvisioningStatus struct {
5262
ProvisioningStatus
5363
}
@@ -163,21 +173,24 @@ func SyncDeploymentToCluster(
163173
}
164174
}
165175

166-
failureMsg, checkErr := checkFailedPods(workspace, clusterAPI)
176+
failureMsg, checkErr := checkPodsState(workspace, clusterAPI)
167177
if checkErr != nil {
168178
return DeploymentProvisioningStatus{
169179
ProvisioningStatus: ProvisioningStatus{
170180
Err: checkErr,
171181
},
172182
}
173183
}
174-
175-
return DeploymentProvisioningStatus{
176-
ProvisioningStatus: ProvisioningStatus{
177-
FailStartup: failureMsg != "",
178-
Message: failureMsg,
179-
},
184+
if failureMsg != "" {
185+
return DeploymentProvisioningStatus{
186+
ProvisioningStatus{
187+
FailStartup: true,
188+
Message: failureMsg,
189+
},
190+
}
180191
}
192+
193+
return DeploymentProvisioningStatus{}
181194
}
182195

183196
// DeleteWorkspaceDeployment deletes the deployment for the DevWorkspace
@@ -369,10 +382,12 @@ func getPods(workspace *dw.DevWorkspace, client runtimeClient.Client) (*corev1.P
369382
return pods, nil
370383
}
371384

372-
// checkFailedPods check if related pods has unrecoverable states: CrashLoopBackOffReason, ImagePullErr
385+
// checkPodsState checks if workspace-related pods are in an unrecoverable state. A pod is considered to be unrecoverable
386+
// if it has a container with one of the containerFailureStateReasons states, or if an unrecoverable event (with reason
387+
// matching unrecoverablePodEventReasons) has the pod as the involved object.
373388
// Returns optional message with detected unrecoverable state details
374-
// error is any happens during check
375-
func checkFailedPods(workspace *dw.DevWorkspace,
389+
// and a non-nil error if the check itself fails.
390+
func checkPodsState(workspace *dw.DevWorkspace,
376391
clusterAPI ClusterAPI) (stateMsg string, checkFailure error) {
377392
podList, err := getPods(workspace, clusterAPI.Client)
378393
if err != nil {
@@ -390,6 +405,31 @@ func checkFailedPods(workspace *dw.DevWorkspace,
390405
return fmt.Sprintf("Init Container %s has state %s", initContainerStatus.Name, initContainerStatus.State.Waiting.Reason), nil
391406
}
392407
}
408+
if msg, err := checkPodEvents(&pod, clusterAPI); err != nil || msg != "" {
409+
return msg, err
410+
}
411+
}
412+
return "", nil
413+
}
414+
415+
func checkPodEvents(pod *corev1.Pod, clusterAPI ClusterAPI) (msg string, err error) {
416+
evs := &corev1.EventList{}
417+
selector, err := fields.ParseSelector(fmt.Sprintf("involvedObject.name=%s", pod.Name))
418+
if err != nil {
419+
return "", fmt.Errorf("failed to parse field selector: %s", err)
420+
}
421+
if err := clusterAPI.Client.List(clusterAPI.Ctx, evs, k8sclient.InNamespace(pod.Namespace), k8sclient.MatchingFieldsSelector{Selector: selector}); err != nil {
422+
return "", fmt.Errorf("failed to list events in namespace %s: %w", pod.Namespace, err)
423+
}
424+
for _, ev := range evs.Items {
425+
if ev.InvolvedObject.Kind != "Pod" {
426+
continue
427+
}
428+
for _, fatalEv := range unrecoverablePodEventReasons {
429+
if ev.Reason == fatalEv {
430+
return fmt.Sprintf("Detected unrecoverable event %s: %s", ev.Reason, ev.Message), nil
431+
}
432+
}
393433
}
394434
return "", nil
395435
}
@@ -477,7 +517,7 @@ func needsPVCWorkaround(podAdditions *v1alpha1.PodAdditions) bool {
477517

478518
func checkContainerStatusForFailure(containerStatus *corev1.ContainerStatus) (ok bool) {
479519
if containerStatus.State.Waiting != nil {
480-
for _, failureReason := range ContainerFailureStateReasons {
520+
for _, failureReason := range containerFailureStateReasons {
481521
if containerStatus.State.Waiting.Reason == failureReason {
482522
return false
483523
}

0 commit comments

Comments
 (0)