Skip to content

Commit 635a078

Browse files
authored
Keep queue-proxy admin server on HTTP for PreStop hooks (knative#16163) (#1614)
The queue-proxy admin server now always serves HTTP on port 8022, even when system-internal-tls is enabled. This simplifies the PreStop hook configuration and fixes graceful shutdown issues. Changes: - Queue-proxy admin server always uses HTTP, only main server uses TLS - PreStop hooks always use HTTP scheme (removed dynamic configuration) - Updated tests to reflect that admin server is always HTTP Why this approach: - PreStop hooks are called by kubelet locally within the pod (localhost) - No network traffic leaves the pod, so TLS isn't needed for security - Simpler implementation with no dynamic scheme configuration - Prevents TLS handshake errors during pod shutdown This fixes the issue where pods would receive HTTP 502 errors during scale-down operations when system-internal-tls was enabled. The error occurred because the PreStop hook was trying to connect with HTTP to a TLS-enabled admin server, causing immediate SIGTERM and dropped requests.
1 parent 08c944d commit 635a078

File tree

2 files changed

+121
-4
lines changed

2 files changed

+121
-4
lines changed

pkg/queue/sharedmain/main.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -256,16 +256,13 @@ func Main(opts ...Option) error {
256256

257257
if tlsEnabled {
258258
tlsServers["main"] = mainServer(":"+env.QueueServingTLSPort, mainHandler)
259-
tlsServers["admin"] = adminServer(":"+strconv.Itoa(networking.QueueAdminPort), adminHandler)
259+
// Keep admin server on HTTP even with TLS enabled since it's only accessed locally by kubelet
260260

261261
certWatcher, err = certificate.NewCertWatcher(certPath, keyPath, 1*time.Minute, logger)
262262
if err != nil {
263263
logger.Fatal("failed to create certWatcher", zap.Error(err))
264264
}
265265
defer certWatcher.Stop()
266-
267-
// Drop admin http server since the admin TLS server is listening on the same port
268-
delete(httpServers, "admin")
269266
}
270267

271268
logger.Info("Starting queue-proxy")

test/e2e/systeminternaltls/system_internal_tls_test.go

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,13 @@ package systeminternaltls
2121

2222
import (
2323
"context"
24+
"fmt"
25+
"net/http"
2426
"net/url"
2527
"os"
2628
"strings"
2729
"testing"
30+
"time"
2831

2932
corev1 "k8s.io/api/core/v1"
3033
"k8s.io/apimachinery/pkg/api/errors"
@@ -274,3 +277,120 @@ func matchTLSLog(line string) bool {
274277
func matchCertReloadLog(line string) bool {
275278
return strings.Contains(line, certificate.CertReloadMessage)
276279
}
280+
281+
// TestGracefulShutdownWithTLS tests that PreStop hooks work correctly with system-internal-tls enabled.
282+
// This is a regression test for https://github.com/knative/serving/issues/16162
283+
// where PreStop hooks would fail with TLS handshake errors, causing HTTP 502 errors during scale-down.
284+
func TestGracefulShutdownWithTLS(t *testing.T) {
285+
if !test.ServingFlags.EnableAlphaFeatures {
286+
t.Skip("Alpha features not enabled")
287+
}
288+
289+
if !strings.Contains(test.ServingFlags.IngressClass, "kourier") &&
290+
!strings.Contains(test.ServingFlags.IngressClass, "contour") {
291+
t.Skip("Skip this test for non-kourier/contour ingress.")
292+
}
293+
294+
// Not running in parallel on purpose - we're testing pod deletion.
295+
clients := test.Setup(t)
296+
297+
names := test.ResourceNames{
298+
Service: test.ObjectNameForTest(t),
299+
Image: test.Autoscale,
300+
}
301+
test.EnsureTearDown(t, clients, &names)
302+
303+
// Create a service with a reasonable timeout
304+
const revisionTimeout = 5 * time.Minute
305+
objects, err := v1test.CreateServiceReady(t, clients, &names,
306+
rtesting.WithRevisionTimeoutSeconds(int64(revisionTimeout.Seconds())))
307+
if err != nil {
308+
t.Fatal("Failed to create a service:", err)
309+
}
310+
routeURL := objects.Route.Status.URL.URL()
311+
312+
// Verify the service is working
313+
if _, err = pkgTest.CheckEndpointState(
314+
context.Background(),
315+
clients.KubeClient,
316+
t.Logf,
317+
routeURL,
318+
spoof.IsStatusOK,
319+
"RouteServes",
320+
test.ServingFlags.ResolvableDomain,
321+
test.AddRootCAtoTransport(context.Background(), t.Logf, clients, test.ServingFlags.HTTPS),
322+
); err != nil {
323+
t.Fatalf("The endpoint for Route %s at %s didn't serve correctly: %v", names.Route, routeURL, err)
324+
}
325+
326+
// Get the pod
327+
pods, err := clients.KubeClient.CoreV1().Pods(test.ServingFlags.TestNamespace).List(context.Background(), v1.ListOptions{
328+
LabelSelector: "serving.knative.dev/revision=" + objects.Revision.Name,
329+
})
330+
if err != nil || len(pods.Items) == 0 {
331+
t.Fatal("No pods or error:", err)
332+
}
333+
t.Logf("Saw %d pods", len(pods.Items))
334+
335+
// Prepare a long-running request (12+ seconds)
336+
// NOTE: 12s + 6s must be less than drainSleepDuration and TERMINATION_DRAIN_DURATION_SECONDS.
337+
u, _ := url.Parse(routeURL.String())
338+
q := u.Query()
339+
q.Set("sleep", "12001")
340+
u.RawQuery = q.Encode()
341+
342+
httpClient, err := pkgTest.NewSpoofingClient(context.Background(), clients.KubeClient, t.Logf, u.Hostname(), test.ServingFlags.ResolvableDomain, test.AddRootCAtoTransport(context.Background(), t.Logf, clients, test.ServingFlags.HTTPS))
343+
if err != nil {
344+
t.Fatal("Error creating spoofing client:", err)
345+
}
346+
347+
// Start multiple long-running requests
348+
ctx := context.Background()
349+
numRequests := 6
350+
requestErrors := make(chan error, numRequests)
351+
352+
for i := range numRequests {
353+
// Request number starts at 1
354+
reqNum := i + 1
355+
356+
t.Logf("Starting request %d", reqNum)
357+
go func() {
358+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
359+
if err != nil {
360+
requestErrors <- fmt.Errorf("request %d: failed to create HTTP request: %w", reqNum, err)
361+
return
362+
}
363+
364+
res, err := httpClient.Do(req)
365+
t.Logf("Request %d completed", reqNum)
366+
if err != nil {
367+
requestErrors <- fmt.Errorf("request %d: request failed: %w", reqNum, err)
368+
return
369+
}
370+
if res.StatusCode != http.StatusOK {
371+
requestErrors <- fmt.Errorf("request %d: status = %v, want StatusOK (this could indicate PreStop hook failure)", reqNum, res.StatusCode)
372+
return
373+
}
374+
requestErrors <- nil
375+
}()
376+
time.Sleep(time.Second)
377+
}
378+
379+
// Immediately delete the pod while requests are in flight
380+
// This triggers the PreStop hook which must use HTTP (not TLS) to drain connections
381+
podToDelete := pods.Items[0].Name
382+
t.Logf("Deleting pod %q while requests are in flight", podToDelete)
383+
if err := clients.KubeClient.CoreV1().Pods(test.ServingFlags.TestNamespace).Delete(context.Background(), podToDelete, v1.DeleteOptions{}); err != nil {
384+
t.Fatal("Failed to delete pod:", err)
385+
}
386+
387+
// Wait for all requests to complete and check for errors
388+
t.Log("Waiting for all requests to complete...")
389+
for i := range numRequests {
390+
if err := <-requestErrors; err != nil {
391+
t.Errorf("Request %d: %v", i+1, err)
392+
}
393+
}
394+
395+
t.Log("All requests completed successfully - PreStop hook worked correctly with TLS enabled")
396+
}

0 commit comments

Comments
 (0)