Skip to content

Commit ad60cab

Browse files
authored
Merge pull request #61 from fjgcosta/timeout
Fix timeouts
2 parents dfede09 + e97238a commit ad60cab

File tree

6 files changed

+88
-32
lines changed

6 files changed

+88
-32
lines changed

charts/karpenter-optimizer/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ apiVersion: v2
22
name: karpenter-optimizer
33
description: A Helm chart for Karpenter Optimizer - Cost optimization tool for Karpenter NodePools
44
type: application
5-
version: 0.0.35
6-
appVersion: "0.0.35"
5+
version: 0.0.36
6+
appVersion: "0.0.36"
77
keywords:
88
- karpenter
99
- kubernetes

charts/karpenter-optimizer/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ The following table lists the configurable parameters and their default values:
132132
| `resources.limits.memory` | Memory limit | `512Mi` |
133133
| `resources.requests.cpu` | CPU request | `100m` |
134134
| `resources.requests.memory` | Memory request | `128Mi` |
135+
| `frontend.nginxConnectTimeout` | Nginx proxy connect timeout | `60s` |
136+
| `frontend.nginxSendTimeout` | Nginx proxy send timeout | `60s` |
137+
| `frontend.nginxReadTimeout` | Nginx proxy read timeout (should be >= backend timeout) | `120s` |
135138

136139
## Values
137140

charts/karpenter-optimizer/templates/configmap-nginx.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,9 @@ data:
3636
proxy_set_header X-Forwarded-Host $host;
3737
proxy_set_header X-Forwarded-Prefix {{ .Values.ingress.apiPath | default "/api" }};
3838
proxy_cache_bypass $http_upgrade;
39-
# Increase timeouts for long-running requests
40-
proxy_connect_timeout 60s;
41-
proxy_send_timeout 60s;
42-
proxy_read_timeout 60s;
39+
proxy_connect_timeout {{ .Values.frontend.nginxConnectTimeout | default "60s" }};
40+
proxy_send_timeout {{ .Values.frontend.nginxSendTimeout | default "60s" }};
41+
proxy_read_timeout {{ .Values.frontend.nginxReadTimeout | default "120s" }};
4342
# Error handling
4443
proxy_intercept_errors on;
4544
error_page 502 503 504 /50x.html;

charts/karpenter-optimizer/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,11 @@ frontend:
203203
# When using "localhost", nginx connects directly to the backend container in the same pod
204204
# When using "service", nginx connects via the Kubernetes service (useful for debugging or if localhost doesn't work)
205205
backendConnection: "localhost"
206+
# Nginx proxy timeout configuration.
207+
# nginxReadTimeout should be >= the backend request timeout (60s in internal/api/server.go); the 120s default leaves headroom.
208+
nginxConnectTimeout: "60s"
209+
nginxSendTimeout: "60s"
210+
nginxReadTimeout: "120s"
206211
# Security context for frontend container (overrides pod-level securityContext)
207212
# Set to {} to use pod-level securityContext, or specify custom values
208213
securityContext:

internal/api/server.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -620,11 +620,18 @@ func (s *Server) listWorkloads(c *gin.Context) {
620620
return
621621
}
622622

623-
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
623+
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
624624
defer cancel()
625625

626626
workloads, err := s.k8sClient.ListWorkloads(ctx, namespace)
627627
if err != nil {
628+
if ctx.Err() == context.DeadlineExceeded {
629+
c.JSON(504, gin.H{
630+
"error": "Request timeout listing workloads",
631+
"hint": fmt.Sprintf("Listing workloads in namespace '%s' took longer than 60 seconds. The API server may be under load.", namespace),
632+
})
633+
return
634+
}
628635
c.JSON(500, gin.H{
629636
"error": err.Error(),
630637
"hint": fmt.Sprintf("Check that namespace '%s' exists and you have permissions to list workloads", namespace),

internal/kubernetes/client.go

Lines changed: 67 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"sort"
88
"strconv"
99
"strings"
10+
"sync"
1011
"time"
1112

1213
"bufio"
@@ -112,15 +113,14 @@ func NewClientWithDebug(kubeconfigPath, kubeContext string, debug bool) (*Client
112113
}
113114
}
114115

115-
// Configure rate limiting to prevent throttling
116-
// QPS: queries per second (default is 5, increase to 10)
117-
// Burst: maximum burst of requests (default is 10, increase to 20)
118-
// This helps prevent "client-side throttling" errors when querying many resources
116+
// Increase client-side rate limits (defaults: 5 QPS, 10 burst).
117+
// The parallel namespace listing issues up to maxConcurrent*4 calls concurrently;
118+
// low limits cause artificial multi-second delays per request.
119119
if config.QPS == 0 {
120-
config.QPS = 10
120+
config.QPS = 50
121121
}
122122
if config.Burst == 0 {
123-
config.Burst = 20
123+
config.Burst = 100
124124
}
125125

126126
clientset, err := kubernetes.NewForConfig(config)
@@ -350,13 +350,21 @@ func (c *Client) calculateWorkloadsStorage(ctx context.Context, workloads []Work
350350
return workloads, nil
351351
}
352352

353-
// For StatefulSets, fetch them to get volumeClaimTemplate names for accurate matching
354-
statefulSetMap := make(map[string]map[string]bool) // namespace/name -> set of PVC template names
355-
for _, workload := range workloads {
356-
if workload.Type == "statefulset" {
357-
sts, err := c.clientset.AppsV1().StatefulSets(workload.Namespace).Get(ctx, workload.Name, metav1.GetOptions{})
358-
if err == nil {
359-
key := fmt.Sprintf("%s/%s", workload.Namespace, workload.Name)
353+
// Build volumeClaimTemplate map with one cluster-wide List() instead of N individual Get() calls.
354+
statefulSetMap := make(map[string]map[string]bool) // "namespace/name" -> set of PVC template names
355+
hasStatefulSets := false
356+
for _, w := range workloads {
357+
if w.Type == "statefulset" {
358+
hasStatefulSets = true
359+
break
360+
}
361+
}
362+
if hasStatefulSets {
363+
allSTS, err := c.clientset.AppsV1().StatefulSets("").List(ctx, metav1.ListOptions{})
364+
if err == nil {
365+
for i := range allSTS.Items {
366+
sts := &allSTS.Items[i]
367+
key := fmt.Sprintf("%s/%s", sts.Namespace, sts.Name)
360368
templateNames := make(map[string]bool)
361369
for _, vct := range sts.Spec.VolumeClaimTemplates {
362370
templateNames[vct.Name] = true
@@ -488,25 +496,59 @@ func (c *Client) ListAllWorkloads(ctx context.Context) ([]WorkloadInfo, error) {
488496
return allWorkloads, nil
489497
}
490498

491-
// List workloads from each namespace
492-
var errors []string
493-
successfulNamespaces := 0
499+
// Filter out system namespaces first
500+
userNamespaces := make([]string, 0, len(namespaces))
494501
for _, ns := range namespaces {
495-
// Skip system namespaces
496502
if ns == "kube-system" || ns == "kube-public" || ns == "kube-node-lease" {
497503
continue
498504
}
505+
userNamespaces = append(userNamespaces, ns)
506+
}
499507

500-
workloads, err := c.ListWorkloads(ctx, ns)
501-
if err != nil {
502-
// Log error but continue with other namespaces
503-
// Collect errors to return if all namespaces fail
504-
errors = append(errors, fmt.Sprintf("namespace %s: %v", ns, err))
508+
// List workloads concurrently across namespaces with bounded parallelism.
509+
type nsResult struct {
510+
ns string
511+
workloads []WorkloadInfo
512+
err error
513+
}
514+
515+
const maxConcurrent = 10
516+
sem := make(chan struct{}, maxConcurrent)
517+
results := make(chan nsResult, len(userNamespaces))
518+
519+
var wg sync.WaitGroup
520+
for _, ns := range userNamespaces {
521+
wg.Add(1)
522+
go func(namespace string) {
523+
defer wg.Done()
524+
select {
525+
case sem <- struct{}{}:
526+
defer func() { <-sem }()
527+
case <-ctx.Done():
528+
results <- nsResult{ns: namespace, err: ctx.Err()}
529+
return
530+
}
531+
workloads, err := c.ListWorkloads(ctx, namespace)
532+
results <- nsResult{ns: namespace, workloads: workloads, err: err}
533+
}(ns)
534+
}
535+
536+
// Close results channel once all goroutines complete
537+
go func() {
538+
wg.Wait()
539+
close(results)
540+
}()
541+
542+
// Collect results
543+
var errors []string
544+
successfulNamespaces := 0
545+
for result := range results {
546+
if result.err != nil {
547+
errors = append(errors, fmt.Sprintf("namespace %s: %v", result.ns, result.err))
505548
continue
506549
}
507-
508550
successfulNamespaces++
509-
allWorkloads = append(allWorkloads, workloads...)
551+
allWorkloads = append(allWorkloads, result.workloads...)
510552
}
511553

512554
// If we have no workloads and errors from all namespaces, return an error
@@ -517,7 +559,7 @@ func (c *Client) ListAllWorkloads(ctx context.Context) ([]WorkloadInfo, error) {
517559
if len(errors) < maxErrors {
518560
maxErrors = len(errors)
519561
}
520-
return nil, fmt.Errorf("failed to list workloads from any namespace (checked %d namespaces). First errors: %s", len(namespaces), strings.Join(errors[:maxErrors], "; "))
562+
return nil, fmt.Errorf("failed to list workloads from any namespace (checked %d namespaces). First errors: %s", len(userNamespaces), strings.Join(errors[:maxErrors], "; "))
521563
}
522564

523565
// Calculate usage for all workloads in batch (much faster - single pod fetch)

0 commit comments

Comments
 (0)