Merge pull request #13 from hjacobs/allocatable-resources

hjacobs · web-flow · commit 7625f3704fbc · 2017-02-12T12:49:37.000+01:00
Allocatable resources
diff --git a/README.rst b/README.rst
@@ -21,6 +21,8 @@ Goals:
 * respect Availability Zones, i.e. make sure that all AZs provide enough capacity
 * be deterministic and predictable, i.e. the ``DesiredCapacity`` is only calculated based on the current cluster state
 * scale down slowly to mitigate service disruptions, i.e. at most one node at a time
+* support "elastic" workloads like daily up/down scaling
+* support AWS Spot Fleet (not yet implemented)
 * require a minimum amount of configuration (preferably none)
 * keep it simple
 
@@ -32,6 +34,13 @@ This hack was created as a proof of concept and born out of frustration with the
 * it requires unnecessary configuration
 * the code is quite complex
 
+Disclaimer
+==========
+
+**Use at your own risk!**
+This autoscaler was only tested with Kubernetes version 1.5.2.
+There is no guarantee that it works with earlier Kubernetes versions.
+
 
 How it works
 ============
@@ -48,7 +57,7 @@ The ``autoscale`` function performs the following task:
   * iterate through every ASG/AZ combination
   * use the calculated resource usage (sum of resource requests) and add the resource requests of any unassigned pods (pods not scheduled on any node yet)
   * apply the configured buffer values (10% extra for CPU and memory by default)
-  * find the capacity of the weakest node
+  * find the `allocatable capacity`_ of the weakest node
   * calculate the number of required nodes by adding up the capacity of the weakest node until the sum is greater than or equal to requested+buffer for both CPU and memory
   * sum up the number of required nodes from all AZ for the ASG
 
@@ -99,3 +108,4 @@ The following command line options are supported:
 
 
 .. _"official" cluster-autoscaler: https://github.com/kubernetes/contrib/tree/master/cluster-autoscaler
+.. _allocatable capacity: https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md
diff --git a/kube_aws_autoscaler/main.py b/kube_aws_autoscaler/main.py
@@ -47,9 +47,9 @@ def parse_resource(v: str):
     return int(match.group(1)) * factor
 
 
-def get_node_capacity_tuple(node: dict):
-    capacity = node['capacity']
-    return tuple(capacity[resource] for resource in RESOURCES)
+def get_node_allocatable_tuple(node: dict):
+    allocatable = node['allocatable']
+    return tuple(allocatable[resource] for resource in RESOURCES)
 
 
 def apply_buffer(requested: dict, buffer_percentage: dict, buffer_fixed: dict):
@@ -60,11 +60,11 @@ def apply_buffer(requested: dict, buffer_percentage: dict, buffer_fixed: dict):
 
 
 def find_weakest_node(nodes):
-    return sorted(nodes, key=get_node_capacity_tuple)[0]
+    return sorted(nodes, key=get_node_allocatable_tuple)[0]
 
 
-def is_sufficient(requested: dict, capacity: dict):
-    for resource, cap in capacity.items():
+def is_sufficient(requested: dict, allocatable: dict):
+    for resource, cap in allocatable.items():
         if requested.get(resource, 0) > cap:
             return False
     return True
@@ -86,13 +86,15 @@ def get_nodes(api) -> dict:
         region = node.labels['failure-domain.beta.kubernetes.io/region']
         zone = node.labels['failure-domain.beta.kubernetes.io/zone']
         instance_type = node.labels['beta.kubernetes.io/instance-type']
-        capacity = {}
-        for key, val in node.obj['status']['capacity'].items():
-            capacity[key] = parse_resource(val)
+        allocatable = {}
+        # Use the Node Allocatable Resources to account for any kube/system reservations:
+        # https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md
+        for key, val in node.obj['status']['allocatable'].items():
+            allocatable[key] = parse_resource(val)
         instance_id = node.obj['spec']['externalID']
         obj = {'name': node.name,
                'region': region, 'zone': zone, 'instance_id': instance_id, 'instance_type': instance_type,
-               'capacity': capacity,
+               'allocatable': allocatable,
                'ready': is_node_ready(node),
                'unschedulable': node.obj['spec'].get('unschedulable', False),
                'master': node.labels.get('master', 'false') == 'true'}
@@ -201,10 +203,10 @@ def calculate_required_auto_scaling_group_sizes(nodes_by_asg_zone: dict, usage_b
         requested_with_buffer = apply_buffer(requested, buffer_percentage, buffer_fixed)
         weakest_node = find_weakest_node(nodes)
         required_nodes = 0
-        capacity = {resource: 0 for resource in RESOURCES}
-        while not is_sufficient(requested_with_buffer, capacity):
-            for resource in capacity:
-                capacity[resource] += weakest_node['capacity'][resource]
+        allocatable = {resource: 0 for resource in RESOURCES}
+        while not is_sufficient(requested_with_buffer, allocatable):
+            for resource in allocatable:
+                allocatable[resource] += weakest_node['allocatable'][resource]
             required_nodes += 1
 
         for node in nodes:
@@ -215,7 +217,7 @@ def calculate_required_auto_scaling_group_sizes(nodes_by_asg_zone: dict, usage_b
                 required_nodes += 1
 
         overprovisioned = {resource: 0 for resource in RESOURCES}
-        for resource, value in capacity.items():
+        for resource, value in allocatable.items():
             overprovisioned[resource] = value - requested[resource]
 
         if dump_info:
@@ -226,7 +228,7 @@ def calculate_required_auto_scaling_group_sizes(nodes_by_asg_zone: dict, usage_b
             logger.info('{}/{}: with buffer:   {}'.format(asg_name, zone,
                         ' '.join([format_resource(requested_with_buffer[r], r).rjust(10) for r in RESOURCES])))
             logger.info('{}/{}: weakest node:  {}'.format(asg_name, zone,
-                        ' '.join([format_resource(weakest_node['capacity'][r], r).rjust(10) for r in RESOURCES])))
+                        ' '.join([format_resource(weakest_node['allocatable'][r], r).rjust(10) for r in RESOURCES])))
             logger.info('{}/{}: overprovision: {}'.format(asg_name, zone,
                         ' '.join([format_resource(overprovisioned[r], r).rjust(10) for r in RESOURCES])))
             logger.info('{}/{}: => {} nodes required (current: {})'.format(asg_name, zone, required_nodes, len(nodes)))
diff --git a/tests/test_autoscaler.py b/tests/test_autoscaler.py
@@ -66,20 +66,20 @@ def test_calculate_usage_by_asg_zone():
 
 def test_calculate_required_auto_scaling_group_sizes():
     assert calculate_required_auto_scaling_group_sizes({}, {}, {}, {}) == {}
-    node = {'capacity': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': False, 'master': False}
+    node = {'allocatable': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': False, 'master': False}
     assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {}, {}, {}) == {'a1': 0}
     assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {('a1', 'z1'): {'cpu': 1, 'memory': 1, 'pods': 1}}, {}, {}) == {'a1': 1}
     assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {('unknown', 'unknown'): {'cpu': 1, 'memory': 1, 'pods': 1}}, {}, {}) == {'a1': 1}
 
 
 def test_calculate_required_auto_scaling_group_sizes_cordon():
-    node = {'name': 'mynode', 'capacity': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': True, 'master': False, 'asg_lifecycle_state': 'InService'}
+    node = {'name': 'mynode', 'allocatable': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': True, 'master': False, 'asg_lifecycle_state': 'InService'}
     assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {}, {}, {}) == {'a1': 1}
     assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {('a1', 'z1'): {'cpu': 1, 'memory': 1, 'pods': 1}}, {}, {}) == {'a1': 2}
 
 
 def test_calculate_required_auto_scaling_group_sizes_unschedulable_terminating():
-    node = {'name': 'mynode', 'capacity': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': True, 'master': False, 'asg_lifecycle_state': 'Terminating'}
+    node = {'name': 'mynode', 'allocatable': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': True, 'master': False, 'asg_lifecycle_state': 'Terminating'}
     # do not compensate if the instance is terminating.. (it will probably be replaced by ASG)
     assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {}, {}, {}) == {'a1': 0}
     assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {('a1', 'z1'): {'cpu': 1, 'memory': 1, 'pods': 1}}, {}, {}) == {'a1': 1}
@@ -246,7 +246,7 @@ def test_get_nodes(monkeypatch):
         'beta.kubernetes.io/instance-type': 'x1.mega'
     }
     node.obj = {
-        'status': {'capacity': {'cpu': '2', 'memory': '16Gi', 'pods': '10'}},
+        'status': {'allocatable': {'cpu': '2', 'memory': '16Gi', 'pods': '10'}},
         'spec': {'externalID': 'i-123'}
     }
 
@@ -257,7 +257,7 @@ def test_get_nodes(monkeypatch):
     assert get_nodes(api) == {'n1': {
         'name': 'n1',
         'region': 'eu-north-1', 'zone': 'eu-north-1a', 'instance_id': 'i-123', 'instance_type': 'x1.mega',
-        'capacity': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
+        'allocatable': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
         'ready': False,
         'unschedulable': False,
         'master': False}}
@@ -278,7 +278,7 @@ def test_autoscale(monkeypatch):
     get_nodes.return_value = {'n1': {
                 'name': 'n1',
                 'region': 'eu-north-1', 'zone': 'eu-north-1a', 'instance_id': 'i-123', 'instance_type': 'x1.mega',
-                'capacity': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
+                'allocatable': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
                 'ready': True,
                 'unschedulable': False,
                 'master': False}}
@@ -309,7 +309,7 @@ def test_autoscale_node_without_asg(monkeypatch):
     get_nodes.return_value = {'n1': {
                 'name': 'n1',
                 'region': 'eu-north-1', 'zone': 'eu-north-1a', 'instance_id': 'i-123', 'instance_type': 'x1.mega',
-                'capacity': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
+                'allocatable': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
                 'ready': True,
                 'unschedulable': False,
                 'master': False}}