Skip to content

Commit ba1f75b

Browse files
Merge pull request #5319 from pmtk/healthcheck/check-unpulled-images
USHIFT-4919: Healthcheck: Debug information about unpulled & failed images
2 parents 96d8613 + 41062cd commit ba1f75b

File tree

3 files changed

+348
-5
lines changed

3 files changed

+348
-5
lines changed

pkg/healthcheck/debug_info.go

Lines changed: 124 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,35 @@
11
package healthcheck
22

33
import (
4+
"context"
5+
"fmt"
46
"os"
57
"path/filepath"
68
"strings"
79

810
"github.com/openshift/microshift/pkg/config"
11+
corev1 "k8s.io/api/core/v1"
12+
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13+
"k8s.io/apimachinery/pkg/util/sets"
914
"k8s.io/cli-runtime/pkg/genericclioptions"
15+
coreclientv1 "k8s.io/client-go/kubernetes/typed/core/v1"
1016
"k8s.io/client-go/util/homedir"
1117
"k8s.io/klog/v2"
1218
"k8s.io/kubectl/pkg/cmd/get"
1319
cmdutil "k8s.io/kubectl/pkg/cmd/util"
1420
"k8s.io/utils/ptr"
1521
)
1622

17-
func logPodsAndEvents() {
23+
func printPostFailureDebugInfo(ctx context.Context, coreClient *coreclientv1.CoreV1Client) {
24+
output := strings.Builder{}
25+
26+
unpulledOrFailedImages(ctx, coreClient, &output)
27+
allPodsAndEvents(&output)
28+
29+
klog.Infof("DEBUG INFORMATION\n%s", output.String())
30+
}
31+
32+
func allPodsAndEvents(output *strings.Builder) {
1833
cliOptions := genericclioptions.NewConfigFlags(true).WithDeprecatedPasswordFlag()
1934
cliOptions.KubeConfig = ptr.To(filepath.Join(config.DataDir, "resources", string(config.KubeAdmin), "kubeconfig"))
2035
if homedir.HomeDir() == "" {
@@ -27,8 +42,7 @@ func logPodsAndEvents() {
2742
matchVersionKubeConfigFlags := cmdutil.NewMatchVersionFlags(cliOptions)
2843
f := cmdutil.NewFactory(matchVersionKubeConfigFlags)
2944

30-
output := strings.Builder{}
31-
ioStreams := genericclioptions.IOStreams{In: os.Stdin, Out: &output, ErrOut: &output}
45+
ioStreams := genericclioptions.IOStreams{In: os.Stdin, Out: output, ErrOut: output}
3246

3347
cmdGet := get.NewCmdGet("", f, ioStreams)
3448
opts := get.NewGetOptions("", ioStreams)
@@ -49,12 +63,118 @@ func logPodsAndEvents() {
4963
klog.Errorf("Failed to run 'get pods': %v", err)
5064
return
5165
}
66+
output.WriteString("\n")
5267
output.WriteString("\n---------- EVENTS:\n")
5368
opts.SortBy = ".metadata.creationTimestamp"
5469
if err := opts.Run(f, []string{"events"}); err != nil {
5570
klog.Errorf("Failed to run 'get events': %v", err)
5671
return
5772
}
73+
output.WriteString("\n")
74+
}
5875

59-
klog.Infof("DEBUG INFORMATION\n%s", output.String())
76+
// unpulledOrFailedImages prepares a debug log with information about images that are still being pulled or failed to be pulled.
77+
func unpulledOrFailedImages(ctx context.Context, coreClient *coreclientv1.CoreV1Client, output *strings.Builder) {
78+
// Get list of existing Pods to skip Events belonging to non-existing Pods to avoid false positives:
79+
// If someone creates and deletes a lot of workloads, there might be "Pulling" events for each Pod without
80+
// the corresponding "Pulled" event.
81+
pods, err := coreClient.Pods("").List(ctx, v1.ListOptions{})
82+
if err != nil {
83+
klog.Errorf("Failed to retrieve pods: %v", err)
84+
return
85+
}
86+
existingPodsNames := sets.New[string]()
87+
for _, pod := range pods.Items {
88+
existingPodsNames.Insert(pod.Name)
89+
}
90+
91+
var pullingEvents, pulledEvents, failedEvents *corev1.EventList
92+
if pullingEvents, err = coreClient.Events("").List(ctx, v1.ListOptions{FieldSelector: "reportingComponent=kubelet,reason=Pulling"}); err != nil {
93+
klog.Errorf("Failed to retrieve Pulling events: %v", err)
94+
return
95+
}
96+
if pulledEvents, err = coreClient.Events("").List(ctx, v1.ListOptions{FieldSelector: "reportingComponent=kubelet,reason=Pulled"}); err != nil {
97+
klog.Errorf("Failed to retrieve Pulled events: %v", err)
98+
return
99+
}
100+
if failedEvents, err = coreClient.Events("").List(ctx, v1.ListOptions{FieldSelector: "reportingComponent=kubelet,reason=Failed"}); err != nil {
101+
klog.Errorf("Failed to retrieve Failed events: %v", err)
102+
return
103+
}
104+
105+
unpulledImages, failedImages := analyzeEventsLookingForUnpulledOrFailedImages(existingPodsNames, pullingEvents, pulledEvents, failedEvents)
106+
107+
if len(unpulledImages) > 0 {
108+
output.WriteString("---------- IMAGES THAT ARE STILL BEING PULLED:\n")
109+
for _, unpulledImage := range unpulledImages {
110+
output.WriteString(fmt.Sprintf("- %q for Pod %q in namespace %q\n", unpulledImage.Image, unpulledImage.PodName, unpulledImage.Namespace))
111+
}
112+
output.WriteString("\n")
113+
}
114+
115+
if len(failedImages) > 0 {
116+
output.WriteString("---------- IMAGES THAT FAILED TO BE PULLED:\n")
117+
for _, failedImage := range failedImages {
118+
output.WriteString(fmt.Sprintf("- %q for Pod %q in namespace %q: %s\n", failedImage.Image, failedImage.PodName, failedImage.Namespace, failedImage.Message))
119+
}
120+
output.WriteString("\n")
121+
}
122+
}
123+
124+
type (
	// unpulledImage identifies an image together with the Pod it is pulled for.
	// It holds only strings, so values can be compared and used as map or set keys.
	unpulledImage struct {
		// Namespace is the namespace of the Pod.
		Namespace string
		// PodName is the name of the Pod.
		PodName string
		// Image is the image reference.
		Image string
	}

	// failedImage is an unpulledImage extended with the message describing
	// why pulling the image failed.
	failedImage struct {
		unpulledImage
		// Message is the text of the failure.
		Message string
	}
)
134+
135+
// analyzeEventsLookingForUnpulledOrFailedImages goes through and tries to match
136+
// image related Events to find images that are still being pulled
137+
// and images that failed to be pulled.
138+
func analyzeEventsLookingForUnpulledOrFailedImages(existingPodsNames sets.Set[string], pullingEvents, pulledEvents, failedEvents *corev1.EventList) ([]unpulledImage, []failedImage) {
139+
getImageInfo := func(event corev1.Event) (string, string, string) {
140+
pod := event.InvolvedObject.Name
141+
ns := event.InvolvedObject.Namespace
142+
img := strings.Split(event.Message, "\"")[1]
143+
return ns, pod, img
144+
}
145+
146+
unpulledImages := sets.New[unpulledImage]()
147+
148+
for _, event := range pullingEvents.Items {
149+
ns, pod, img := getImageInfo(event)
150+
if !existingPodsNames.Has(pod) {
151+
continue
152+
}
153+
unpulledImages.Insert(unpulledImage{Namespace: ns, PodName: pod, Image: img})
154+
}
155+
156+
for _, event := range pulledEvents.Items {
157+
ns, pod, img := getImageInfo(event)
158+
unpulledImages.Delete(unpulledImage{Namespace: ns, PodName: pod, Image: img})
159+
}
160+
161+
failedImages := sets.New[failedImage]()
162+
163+
for _, event := range failedEvents.Items {
164+
if !strings.HasPrefix(event.Message, "Failed to pull image") {
165+
continue
166+
}
167+
ns, pod, img := getImageInfo(event)
168+
if !existingPodsNames.Has(pod) {
169+
continue
170+
}
171+
unpulledImages.Delete(unpulledImage{Namespace: ns, PodName: pod, Image: img})
172+
173+
failedImages.Insert(failedImage{
174+
unpulledImage: unpulledImage{Namespace: ns, PodName: pod, Image: img},
175+
Message: event.Message,
176+
})
177+
}
178+
179+
return unpulledImages.UnsortedList(), failedImages.UnsortedList()
60180
}

pkg/healthcheck/debug_info_test.go

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
package healthcheck
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
corev1 "k8s.io/api/core/v1"
8+
"k8s.io/apimachinery/pkg/util/sets"
9+
)
10+
11+
func Test_analyzeEventsLookingForUnpulledOrFailedImages(t *testing.T) {
12+
testCases := []struct {
13+
name string
14+
existingPodsNames sets.Set[string]
15+
pullingEvents *corev1.EventList
16+
pulledEvents *corev1.EventList
17+
failedEvents *corev1.EventList
18+
expectedUnpulled []unpulledImage
19+
expectedFailed []failedImage
20+
}{
21+
{
22+
name: "no events",
23+
existingPodsNames: sets.New[string](),
24+
pullingEvents: &corev1.EventList{},
25+
pulledEvents: &corev1.EventList{},
26+
failedEvents: &corev1.EventList{},
27+
expectedUnpulled: []unpulledImage{},
28+
expectedFailed: []failedImage{},
29+
},
30+
{
31+
name: "image still being pulled",
32+
existingPodsNames: sets.New("test-pod"),
33+
pullingEvents: &corev1.EventList{
34+
Items: []corev1.Event{
35+
{
36+
InvolvedObject: corev1.ObjectReference{
37+
Name: "test-pod",
38+
Namespace: "test-ns",
39+
},
40+
Message: `Pulling image "nginx:latest"`,
41+
},
42+
},
43+
},
44+
pulledEvents: &corev1.EventList{},
45+
failedEvents: &corev1.EventList{},
46+
expectedUnpulled: []unpulledImage{
47+
{Namespace: "test-ns", PodName: "test-pod", Image: "nginx:latest"},
48+
},
49+
expectedFailed: []failedImage{},
50+
},
51+
{
52+
name: "image successfully pulled",
53+
existingPodsNames: sets.New("test-pod"),
54+
pullingEvents: &corev1.EventList{
55+
Items: []corev1.Event{
56+
{
57+
InvolvedObject: corev1.ObjectReference{
58+
Name: "test-pod",
59+
Namespace: "test-ns",
60+
},
61+
Message: `Pulling image "nginx:latest"`,
62+
},
63+
},
64+
},
65+
pulledEvents: &corev1.EventList{
66+
Items: []corev1.Event{
67+
{
68+
InvolvedObject: corev1.ObjectReference{
69+
Name: "test-pod",
70+
Namespace: "test-ns",
71+
},
72+
Message: `Successfully pulled image "nginx:latest"`,
73+
},
74+
},
75+
},
76+
failedEvents: &corev1.EventList{},
77+
expectedUnpulled: []unpulledImage{},
78+
expectedFailed: []failedImage{},
79+
},
80+
{
81+
name: "image failed to pull",
82+
existingPodsNames: sets.New("test-pod"),
83+
pullingEvents: &corev1.EventList{
84+
Items: []corev1.Event{
85+
{
86+
InvolvedObject: corev1.ObjectReference{
87+
Name: "test-pod",
88+
Namespace: "test-ns",
89+
},
90+
Message: `Pulling image "nginx:latest"`,
91+
},
92+
},
93+
},
94+
pulledEvents: &corev1.EventList{},
95+
failedEvents: &corev1.EventList{
96+
Items: []corev1.Event{
97+
{
98+
InvolvedObject: corev1.ObjectReference{
99+
Name: "test-pod",
100+
Namespace: "test-ns",
101+
},
102+
Message: `Failed to pull image "nginx:latest": error message`,
103+
},
104+
},
105+
},
106+
expectedUnpulled: []unpulledImage{},
107+
expectedFailed: []failedImage{
108+
{
109+
unpulledImage: unpulledImage{Namespace: "test-ns", PodName: "test-pod", Image: "nginx:latest"},
110+
Message: `Failed to pull image "nginx:latest": error message`,
111+
},
112+
},
113+
},
114+
{
115+
name: "skip events for non-existing pods",
116+
existingPodsNames: sets.New("existing-pod"),
117+
pullingEvents: &corev1.EventList{
118+
Items: []corev1.Event{
119+
{
120+
InvolvedObject: corev1.ObjectReference{
121+
Name: "deleted-pod",
122+
Namespace: "test-ns",
123+
},
124+
Message: `Pulling image "nginx:latest"`,
125+
},
126+
{
127+
InvolvedObject: corev1.ObjectReference{
128+
Name: "existing-pod",
129+
Namespace: "test-ns",
130+
},
131+
Message: `Pulling image "redis:latest"`,
132+
},
133+
},
134+
},
135+
pulledEvents: &corev1.EventList{},
136+
failedEvents: &corev1.EventList{},
137+
expectedUnpulled: []unpulledImage{
138+
{Namespace: "test-ns", PodName: "existing-pod", Image: "redis:latest"},
139+
},
140+
expectedFailed: []failedImage{},
141+
},
142+
{
143+
name: "multiple images with mixed states",
144+
existingPodsNames: sets.New("pod1", "pod2", "pod3"),
145+
pullingEvents: &corev1.EventList{
146+
Items: []corev1.Event{
147+
{
148+
InvolvedObject: corev1.ObjectReference{
149+
Name: "pod1",
150+
Namespace: "ns1",
151+
},
152+
Message: `Pulling image "nginx:latest"`,
153+
},
154+
{
155+
InvolvedObject: corev1.ObjectReference{
156+
Name: "pod2",
157+
Namespace: "ns2",
158+
},
159+
Message: `Pulling image "redis:latest"`,
160+
},
161+
{
162+
InvolvedObject: corev1.ObjectReference{
163+
Name: "pod3",
164+
Namespace: "ns3",
165+
},
166+
Message: `Pulling image "postgres:13"`,
167+
},
168+
},
169+
},
170+
pulledEvents: &corev1.EventList{
171+
Items: []corev1.Event{
172+
{
173+
InvolvedObject: corev1.ObjectReference{
174+
Name: "pod1",
175+
Namespace: "ns1",
176+
},
177+
Message: `Successfully pulled image "nginx:latest"`,
178+
},
179+
},
180+
},
181+
failedEvents: &corev1.EventList{
182+
Items: []corev1.Event{
183+
{
184+
InvolvedObject: corev1.ObjectReference{
185+
Name: "pod2",
186+
Namespace: "ns2",
187+
},
188+
Message: `Failed to pull image "redis:latest": connection timeout`,
189+
},
190+
},
191+
},
192+
expectedUnpulled: []unpulledImage{
193+
{Namespace: "ns3", PodName: "pod3", Image: "postgres:13"},
194+
},
195+
expectedFailed: []failedImage{
196+
{
197+
unpulledImage: unpulledImage{Namespace: "ns2", PodName: "pod2", Image: "redis:latest"},
198+
Message: `Failed to pull image "redis:latest": connection timeout`,
199+
},
200+
},
201+
},
202+
}
203+
204+
for _, tc := range testCases {
205+
t.Run(tc.name, func(t *testing.T) {
206+
unpulled, failed := analyzeEventsLookingForUnpulledOrFailedImages(
207+
tc.existingPodsNames,
208+
tc.pullingEvents,
209+
tc.pulledEvents,
210+
tc.failedEvents,
211+
)
212+
213+
assert.ElementsMatch(t, tc.expectedUnpulled, unpulled)
214+
assert.ElementsMatch(t, tc.expectedFailed, failed)
215+
})
216+
}
217+
}

0 commit comments

Comments
 (0)