Skip to content

Commit ad60cab

Browse files
authored
Merge pull request #61 from fjgcosta/timeout
Fix timeouts
2 parents dfede09 + e97238a commit ad60cab

File tree

6 files changed

+88
-32
lines changed

6 files changed

+88
-32
lines changed

charts/karpenter-optimizer/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ apiVersion: v2
22
name: karpenter-optimizer
33
description: A Helm chart for Karpenter Optimizer - Cost optimization tool for Karpenter NodePools
44
type: application
5-
version: 0.0.35
6-
appVersion: "0.0.35"
5+
version: 0.0.36
6+
appVersion: "0.0.36"
77
keywords:
88
- karpenter
99
- kubernetes

charts/karpenter-optimizer/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ The following table lists the configurable parameters and their default values:
132132
| `resources.limits.memory` | Memory limit | `512Mi` |
133133
| `resources.requests.cpu` | CPU request | `100m` |
134134
| `resources.requests.memory` | Memory request | `128Mi` |
135+
| `frontend.nginxConnectTimeout` | Nginx proxy connect timeout | `60s` |
136+
| `frontend.nginxSendTimeout` | Nginx proxy send timeout | `60s` |
137+
| `frontend.nginxReadTimeout` | Nginx proxy read timeout (should be >= backend timeout) | `120s` |
135138

136139
## Values
137140

charts/karpenter-optimizer/templates/configmap-nginx.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,9 @@ data:
3636
proxy_set_header X-Forwarded-Host $host;
3737
proxy_set_header X-Forwarded-Prefix {{ .Values.ingress.apiPath | default "/api" }};
3838
proxy_cache_bypass $http_upgrade;
39-
# Increase timeouts for long-running requests
40-
proxy_connect_timeout 60s;
41-
proxy_send_timeout 60s;
42-
proxy_read_timeout 60s;
39+
proxy_connect_timeout {{ .Values.frontend.nginxConnectTimeout | default "60s" }};
40+
proxy_send_timeout {{ .Values.frontend.nginxSendTimeout | default "60s" }};
41+
proxy_read_timeout {{ .Values.frontend.nginxReadTimeout | default "120s" }};
4342
# Error handling
4443
proxy_intercept_errors on;
4544
error_page 502 503 504 /50x.html;

charts/karpenter-optimizer/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,11 @@ frontend:
203203
# When using "localhost", nginx connects directly to the backend container in the same pod
204204
# When using "service", nginx connects via the Kubernetes service (useful for debugging or if localhost doesn't work)
205205
backendConnection: "localhost"
206+
# Nginx proxy timeout configuration.
207+
# nginxReadTimeout should be >= the backend request timeout (60s in internal/api/server.go); the 120s default leaves headroom.
208+
nginxConnectTimeout: "60s"
209+
nginxSendTimeout: "60s"
210+
nginxReadTimeout: "120s"
206211
# Security context for frontend container (overrides pod-level securityContext)
207212
# Set to {} to use pod-level securityContext, or specify custom values
208213
securityContext:

internal/api/server.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -620,11 +620,18 @@ func (s *Server) listWorkloads(c *gin.Context) {
620620
return
621621
}
622622

623-
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
623+
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
624624
defer cancel()
625625

626626
workloads, err := s.k8sClient.ListWorkloads(ctx, namespace)
627627
if err != nil {
628+
if ctx.Err() == context.DeadlineExceeded {
629+
c.JSON(504, gin.H{
630+
"error": "Request timeout listing workloads",
631+
"hint": fmt.Sprintf("Listing workloads in namespace '%s' took longer than 60 seconds. The API server may be under load.", namespace),
632+
})
633+
return
634+
}
628635
c.JSON(500, gin.H{
629636
"error": err.Error(),
630637
"hint": fmt.Sprintf("Check that namespace '%s' exists and you have permissions to list workloads", namespace),

internal/kubernetes/client.go

Lines changed: 67 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"sort"
88
"strconv"
99
"strings"
10+
"sync"
1011
"time"
1112

1213
"bufio"
@@ -112,15 +113,14 @@ func NewClientWithDebug(kubeconfigPath, kubeContext string, debug bool) (*Client
112113
}
113114
}
114115

115-
// Configure rate limiting to prevent throttling
116-
// QPS: queries per second (default is 5, increase to 10)
117-
// Burst: maximum burst of requests (default is 10, increase to 20)
118-
// This helps prevent "client-side throttling" errors when querying many resources
116+
// Increase client-side rate limits (defaults: 5 QPS, 10 burst).
117+
// The parallel namespace listing issues up to maxConcurrent*4 calls concurrently;
118+
// low limits cause artificial multi-second delays per request.
119119
if config.QPS == 0 {
120-
config.QPS = 10
120+
config.QPS = 50
121121
}
122122
if config.Burst == 0 {
123-
config.Burst = 20
123+
config.Burst = 100
124124
}
125125

126126
clientset, err := kubernetes.NewForConfig(config)
@@ -350,13 +350,21 @@ func (c *Client) calculateWorkloadsStorage(ctx context.Context, workloads []Work
350350
return workloads, nil
351351
}
352352

353-
// For StatefulSets, fetch them to get volumeClaimTemplate names for accurate matching
354-
statefulSetMap := make(map[string]map[string]bool) // namespace/name -> set of PVC template names
355-
for _, workload := range workloads {
356-
if workload.Type == "statefulset" {
357-
sts, err := c.clientset.AppsV1().StatefulSets(workload.Namespace).Get(ctx, workload.Name, metav1.GetOptions{})
358-
if err == nil {
359-
key := fmt.Sprintf("%s/%s", workload.Namespace, workload.Name)
353+
// Build volumeClaimTemplate map with one cluster-wide List() instead of N individual Get() calls.
354+
statefulSetMap := make(map[string]map[string]bool) // "namespace/name" -> set of PVC template names
355+
hasStatefulSets := false
356+
for _, w := range workloads {
357+
if w.Type == "statefulset" {
358+
hasStatefulSets = true
359+
break
360+
}
361+
}
362+
if hasStatefulSets {
363+
allSTS, err := c.clientset.AppsV1().StatefulSets("").List(ctx, metav1.ListOptions{})
364+
if err == nil {
365+
for i := range allSTS.Items {
366+
sts := &allSTS.Items[i]
367+
key := fmt.Sprintf("%s/%s", sts.Namespace, sts.Name)
360368
templateNames := make(map[string]bool)
361369
for _, vct := range sts.Spec.VolumeClaimTemplates {
362370
templateNames[vct.Name] = true
@@ -488,25 +496,59 @@ func (c *Client) ListAllWorkloads(ctx context.Context) ([]WorkloadInfo, error) {
488496
return allWorkloads, nil
489497
}
490498

491-
// List workloads from each namespace
492-
var errors []string
493-
successfulNamespaces := 0
499+
// Filter out system namespaces first
500+
userNamespaces := make([]string, 0, len(namespaces))
494501
for _, ns := range namespaces {
495-
// Skip system namespaces
496502
if ns == "kube-system" || ns == "kube-public" || ns == "kube-node-lease" {
497503
continue
498504
}
505+
userNamespaces = append(userNamespaces, ns)
506+
}
499507

500-
workloads, err := c.ListWorkloads(ctx, ns)
501-
if err != nil {
502-
// Log error but continue with other namespaces
503-
// Collect errors to return if all namespaces fail
504-
errors = append(errors, fmt.Sprintf("namespace %s: %v", ns, err))
508+
// List workloads concurrently across namespaces with bounded parallelism.
509+
type nsResult struct {
510+
ns string
511+
workloads []WorkloadInfo
512+
err error
513+
}
514+
515+
const maxConcurrent = 10
516+
sem := make(chan struct{}, maxConcurrent)
517+
results := make(chan nsResult, len(userNamespaces))
518+
519+
var wg sync.WaitGroup
520+
for _, ns := range userNamespaces {
521+
wg.Add(1)
522+
go func(namespace string) {
523+
defer wg.Done()
524+
select {
525+
case sem <- struct{}{}:
526+
defer func() { <-sem }()
527+
case <-ctx.Done():
528+
results <- nsResult{ns: namespace, err: ctx.Err()}
529+
return
530+
}
531+
workloads, err := c.ListWorkloads(ctx, namespace)
532+
results <- nsResult{ns: namespace, workloads: workloads, err: err}
533+
}(ns)
534+
}
535+
536+
// Close results channel once all goroutines complete
537+
go func() {
538+
wg.Wait()
539+
close(results)
540+
}()
541+
542+
// Collect results
543+
var errors []string
544+
successfulNamespaces := 0
545+
for result := range results {
546+
if result.err != nil {
547+
errors = append(errors, fmt.Sprintf("namespace %s: %v", result.ns, result.err))
505548
continue
506549
}
507-
508550
successfulNamespaces++
509-
allWorkloads = append(allWorkloads, workloads...)
551+
allWorkloads = append(allWorkloads, result.workloads...)
510552
}
511553

512554
// If we have no workloads and errors from all namespaces, return an error
@@ -517,7 +559,7 @@ func (c *Client) ListAllWorkloads(ctx context.Context) ([]WorkloadInfo, error) {
517559
if len(errors) < maxErrors {
518560
maxErrors = len(errors)
519561
}
520-
return nil, fmt.Errorf("failed to list workloads from any namespace (checked %d namespaces). First errors: %s", len(namespaces), strings.Join(errors[:maxErrors], "; "))
562+
return nil, fmt.Errorf("failed to list workloads from any namespace (checked %d namespaces). First errors: %s", len(userNamespaces), strings.Join(errors[:maxErrors], "; "))
521563
}
522564

523565
// Calculate usage for all workloads in batch (much faster - single pod fetch)

0 commit comments

Comments
 (0)