Merge pull request #70 from stefanprodan/append-headers

stefanprodan · web-flow · commit 535a92e87192 · 2019-03-04T10:39:43.000+02:00
Allow headers to be appended to HTTP requests
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 All notable changes to this project are documented in this file.
 
+## Unreleased
+
+#### Features
+
+- Allow headers to be appended to HTTP requests [#70](https://github.com/stefanprodan/flagger/pull/70)
+
 ## 0.7.0 (2019-02-28)
 
 Adds support for custom metric checks, HTTP timeouts and HTTP retries
diff --git a/README.md b/README.md
@@ -106,11 +106,11 @@ spec:
     # HTTP rewrite (optional)
     rewrite:
       uri: /
-    # timeout for HTTP requests (optional)
-    timeout: 5s
-    # retry policy when a HTTP request fails (optional)
-    retries:
-      attempts: 3
+    # Envoy timeout and retry policy (optional)
+    appendHeaders:
+      x-envoy-upstream-rq-timeout-ms: "15000"
+      x-envoy-max-retries: "10"
+      x-envoy-retry-on: "gateway-error,connect-failure,refused-stream"
   # promote the canary without analysing it (default false)
   skipAnalysis: false
   # define the canary analysis timing and KPIs
diff --git a/artifacts/canaries/canary.yaml b/artifacts/canaries/canary.yaml
@@ -26,15 +26,19 @@ spec:
     # Istio virtual service host names (optional)
     hosts:
     - app.istio.weavedx.com
-    # Istio virtual service HTTP match conditions (optional)
+    # HTTP match conditions (optional)
     match:
       - uri:
           prefix: /
-    # Istio virtual service HTTP rewrite (optional)
+    # HTTP rewrite (optional)
     rewrite:
       uri: /
-  # for emergency cases when you want to ship changes
-  # in production without analysing the canary
+    # Envoy timeout and retry policy (optional)
+    appendHeaders:
+      x-envoy-upstream-rq-timeout-ms: "15000"
+      x-envoy-max-retries: "10"
+      x-envoy-retry-on: "gateway-error,connect-failure,refused-stream"
+  # promote the canary without analysing it (default false)
   skipAnalysis: false
   canaryAnalysis:
     # schedule interval (default 60s)
diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md
@@ -17,3 +17,4 @@
 ## Tutorials
 
 * [Canaries with Helm charts and GitOps](tutorials/canary-helm-gitops.md)
+* [Zero downtime deployments](tutorials/zero-downtime-deployments.md)
diff --git a/docs/gitbook/how-it-works.md b/docs/gitbook/how-it-works.md
@@ -46,12 +46,11 @@ spec:
     # HTTP rewrite (optional)
     rewrite:
       uri: /
-    # timeout for HTTP requests (optional)
-    timeout: 5s
-    # retry policy when a HTTP request fails (optional)
-    retries:
-      attempts: 3
-      perTryTimeout: 3s
+    # Envoy timeout and retry policy (optional)
+    appendHeaders:
+      x-envoy-upstream-rq-timeout-ms: "15000"
+      x-envoy-max-retries: "10"
+      x-envoy-retry-on: "gateway-error,connect-failure,refused-stream"
   # promote the canary without analysing it (default false)
   skipAnalysis: false
   # define the canary analysis timing and KPIs
@@ -138,8 +137,11 @@ metadata:
     # HTTP rewrite (optional)
     rewrite:
       uri: /
-    # timeout for HTTP requests (optional)
-    timeout: 5s
+    # Envoy timeout and retry policy (optional)
+    appendHeaders:
+      x-envoy-upstream-rq-timeout-ms: "15000"
+      x-envoy-max-retries: "10"
+      x-envoy-retry-on: "gateway-error,connect-failure,refused-stream"
     # retry policy when a HTTP request fails (optional)
     retries:
       attempts: 3
@@ -174,10 +176,10 @@ spec:
             prefix: /
       rewrite:
         uri: /
-      timeout: 5s
-      retries:
-        attempts: 3
-        perTryTimeout: 3s
+      appendHeaders:
+        x-envoy-upstream-rq-timeout-ms: "15000"
+        x-envoy-max-retries: "10"
+        x-envoy-retry-on: "gateway-error,connect-failure,refused-stream"
       route:
         - destination:
             host: frontend-primary
diff --git a/docs/gitbook/tutorials/zero-downtime-deployments.md b/docs/gitbook/tutorials/zero-downtime-deployments.md
@@ -0,0 +1,206 @@
+# Zero downtime deployments
+
+This is a list of things you should consider when dealing with a high traffic production environment if you want to
+minimise the impact of rolling updates and downscaling.
+
+### Deployment strategy
+
+Limit the number of unavailable pods during a rolling update:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+spec:
+  progressDeadlineSeconds: 120
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 0
+```
+
+The default progress deadline for a deployment is ten minutes.
+You should consider adjusting this value to make the deployment process fail faster.
+
+### Liveness health check
+
+Your application should expose an HTTP endpoint that Kubernetes can call to determine if
+your app has transitioned to a broken state from which it can't recover and needs to be restarted.
+
+```yaml
+livenessProbe:
+  exec:
+    command:
+    - wget
+    - --quiet
+    - --tries=1
+    - --timeout=4
+    - --spider
+    - http://localhost:8080/healthz
+  timeoutSeconds: 5
+  initialDelaySeconds: 5
+```
+
+If you've enabled mTLS, you'll have to use `exec` for liveness and readiness checks since 
+kubelet is not part of the service mesh and doesn't have access to the TLS cert.
+
+### Readiness health check
+
+Your application should expose an HTTP endpoint that Kubernetes can call to determine if
+your app is ready to receive traffic.
+
+```yaml
+readinessProbe:
+  exec:
+    command:
+    - wget
+    - --quiet
+    - --tries=1
+    - --timeout=4
+    - --spider
+    - http://localhost:8080/readyz
+  timeoutSeconds: 5
+  initialDelaySeconds: 5
+  periodSeconds: 5
+```
+
+If your app depends on external services, you should check if those services are available before allowing Kubernetes
+to route traffic to an app instance. Keep in mind that the Envoy sidecar can have a slower startup than your app.
+This means that on application start you should retry any external connection for at least a couple of seconds.
+
+### Graceful shutdown
+
+Before a pod gets terminated, Kubernetes sends a `SIGTERM` signal to every container and waits for a period of
+time (30s by default) for all containers to exit gracefully. If your app doesn't handle the `SIGTERM` signal or if it 
+doesn't exit within the grace period, Kubernetes will kill the container and any inflight requests that your app is 
+processing will fail.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+spec:
+  template:
+    spec:
+      terminationGracePeriodSeconds: 60
+      containers:
+      - name: app
+        lifecycle:
+          preStop:
+            exec:
+              command:
+              - sleep
+              - "10"
+```
+
+Your app container should have a `preStop` hook that delays the container shutdown.
+This will allow the service mesh to drain the traffic and remove this pod from all other Envoy sidecars before your app 
+becomes unavailable.
+
+### Delay Envoy shutdown
+
+Even if your app reacts to `SIGTERM` and tries to complete the inflight requests before shutdown, that 
+doesn't mean that the response will make it back to the caller. If the Envoy sidecar shuts down before your app, then 
+the caller will receive a 503 error.
+
+To mitigate this issue you can add a `preStop` hook to the Istio proxy and wait for the main app to exit before Envoy exits.
+
+```bash
+#!/bin/bash
+set -e
+if ! pidof envoy &>/dev/null; then
+  exit 0
+fi
+
+if ! pidof pilot-agent &>/dev/null; then
+  exit 0
+fi
+
+while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l | xargs) -ne 0 ]; do
+  sleep 1;
+done
+
+exit 0
+```
+
+You'll have to build your own Envoy docker image with the above script and
+modify the Istio injection webhook with the `preStop` directive. 
+
+Thanks to Stono for his excellent [tips](https://github.com/istio/istio/issues/12183) on minimising 503s. 
+
+### Resource requests and limits
+
+Setting CPU and memory requests/limits for all workloads is a mandatory step if you're running a production system.
+Without limits your nodes could run out of memory or become unresponsive due to CPU exhaustion.
+Without CPU and memory requests,
+the Kubernetes scheduler will not be able to make decisions about which nodes to place pods on.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+spec:
+  template:
+    spec:
+      containers:
+      - name: app
+        resources:
+          limits:
+            cpu: 1000m
+            memory: 1Gi
+          requests:
+            cpu: 100m
+            memory: 128Mi
+```
+
+Note that without resource requests the horizontal pod autoscaler can't determine when to scale your app.
+
+### Autoscaling
+
+A production environment should be able to handle traffic bursts without impacting the quality of service.
+This can be achieved with Kubernetes autoscaling capabilities.
+Autoscaling in Kubernetes has two dimensions: the Cluster Autoscaler that deals with node scaling operations and
+the Horizontal Pod Autoscaler that automatically scales the number of pods in a deployment.
+
+```yaml
+apiVersion: autoscaling/v2beta1
+kind: HorizontalPodAutoscaler
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: app
+  minReplicas: 2
+  maxReplicas: 4
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      targetAverageValue: 900m
+  - type: Resource
+    resource:
+      name: memory
+      targetAverageValue: 768Mi
+```
+
+The above HPA ensures your app will be scaled up before the pods reach the CPU or memory limits.
+
+### Ingress retries
+
+To minimise the impact of downscaling operations you can make use of Envoy retry capabilities.
+
+```yaml
+apiVersion: flagger.app/v1alpha3
+kind: Canary
+spec:
+  service:
+    port: 9898
+    gateways:
+    - public-gateway.istio-system.svc.cluster.local
+    hosts:
+    - app.example.com
+    appendHeaders:
+      x-envoy-upstream-rq-timeout-ms: "15000"
+      x-envoy-max-retries: "10"
+      x-envoy-retry-on: "gateway-error,connect-failure,refused-stream"
+```
+
+When the HPA scales down your app, your users could run into 503 errors.
+The above configuration will make Envoy retry the HTTP requests that failed due to gateway errors.
diff --git a/pkg/apis/flagger/v1alpha3/types.go b/pkg/apis/flagger/v1alpha3/types.go
@@ -109,13 +109,14 @@ type CanaryStatus struct {
 // CanaryService is used to create ClusterIP services
 // and Istio Virtual Service
 type CanaryService struct {
-	Port     int32                            `json:"port"`
-	Gateways []string                         `json:"gateways"`
-	Hosts    []string                         `json:"hosts"`
-	Match    []istiov1alpha3.HTTPMatchRequest `json:"match,omitempty"`
-	Rewrite  *istiov1alpha3.HTTPRewrite       `json:"rewrite,omitempty"`
-	Timeout  string                           `json:"timeout,omitempty"`
-	Retries  *istiov1alpha3.HTTPRetry         `json:"retries,omitempty"`
+	Port          int32                            `json:"port"`
+	Gateways      []string                         `json:"gateways"`
+	Hosts         []string                         `json:"hosts"`
+	Match         []istiov1alpha3.HTTPMatchRequest `json:"match,omitempty"`
+	Rewrite       *istiov1alpha3.HTTPRewrite       `json:"rewrite,omitempty"`
+	Timeout       string                           `json:"timeout,omitempty"`
+	Retries       *istiov1alpha3.HTTPRetry         `json:"retries,omitempty"`
+	AppendHeaders map[string]string                `json:"appendHeaders,omitempty"`
 }
 
 // CanaryAnalysis is used to describe how the analysis should be done
diff --git a/pkg/apis/flagger/v1alpha3/zz_generated.deepcopy.go b/pkg/apis/flagger/v1alpha3/zz_generated.deepcopy.go
diff --git a/pkg/controller/router.go b/pkg/controller/router.go
@@ -203,11 +203,12 @@ func (c *CanaryRouter) syncVirtualService(cd *flaggerv1.Canary) error {
 		Gateways: gateways,
 		Http: []istiov1alpha3.HTTPRoute{
 			{
-				Match:   cd.Spec.Service.Match,
-				Rewrite: cd.Spec.Service.Rewrite,
-				Timeout: cd.Spec.Service.Timeout,
-				Retries: cd.Spec.Service.Retries,
-				Route:   route,
+				Match:         cd.Spec.Service.Match,
+				Rewrite:       cd.Spec.Service.Rewrite,
+				Timeout:       cd.Spec.Service.Timeout,
+				Retries:       cd.Spec.Service.Retries,
+				AppendHeaders: cd.Spec.Service.AppendHeaders,
+				Route:         route,
 			},
 		},
 	}
@@ -319,11 +320,12 @@ func (c *CanaryRouter) SetRoutes(
 	vsCopy := vs.DeepCopy()
 	vsCopy.Spec.Http = []istiov1alpha3.HTTPRoute{
 		{
-			Match:   cd.Spec.Service.Match,
-			Rewrite: cd.Spec.Service.Rewrite,
-			Timeout: cd.Spec.Service.Timeout,
-			Retries: cd.Spec.Service.Retries,
-			Route:   []istiov1alpha3.DestinationWeight{primary, canary},
+			Match:         cd.Spec.Service.Match,
+			Rewrite:       cd.Spec.Service.Rewrite,
+			Timeout:       cd.Spec.Service.Timeout,
+			Retries:       cd.Spec.Service.Retries,
+			AppendHeaders: cd.Spec.Service.AppendHeaders,
+			Route:         []istiov1alpha3.DestinationWeight{primary, canary},
 		},
 	}
 
diff --git a/test/e2e-tests.sh b/test/e2e-tests.sh
@@ -33,6 +33,10 @@ spec:
   progressDeadlineSeconds: 60
   service:
     port: 9898
+    appendHeaders:
+      x-envoy-upstream-rq-timeout-ms: "15000"
+      x-envoy-max-retries: "10"
+      x-envoy-retry-on: "gateway-error,connect-failure,refused-stream"
   canaryAnalysis:
     interval: 15s
     threshold: 15

Original file line number	Diff line number	Diff line change
`@@ -17,3 +17,4 @@`
`17`	`17`	`## Tutorials`
`18`	`18`
`19`	`19`	`* [Canaries with Helm charts and GitOps](tutorials/canary-helm-gitops.md)`
	`20`	`+* [Zero downtime deployments](tutorials/zero-downtime-deployments.md)`