From cadfd56304e26405f05584b32f069b3726ea4242 Mon Sep 17 00:00:00 2001
From: James Martin <jmartin@ansible.com>
Date: Thu, 8 Jan 2015 17:24:44 -0500
Subject: [PATCH] check lb for instance healthy and some refactoring of rolling
 replace functions.

Optionally waiting for new instances to be in viable state on ASG creation/update.

Properly updating properties and terminating group.
---
 lib/ansible/modules/cloud/amazon/ec2_asg.py | 134 +++++++++++++-------
 1 file changed, 88 insertions(+), 46 deletions(-)

diff --git a/lib/ansible/modules/cloud/amazon/ec2_asg.py b/lib/ansible/modules/cloud/amazon/ec2_asg.py
index c9c7987170a..37e009ead27 100644
--- a/lib/ansible/modules/cloud/amazon/ec2_asg.py
+++ b/lib/ansible/modules/cloud/amazon/ec2_asg.py
@@ -13,7 +13,6 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with Ansible.  If not, see <http://www.gnu.org/licenses/>.
-
 DOCUMENTATION = """
 ---
 module: ec2_asg
@@ -115,6 +114,12 @@ options:
       - how long before wait instances to become viable when replaced.  Used in concjunction with instance_ids option.
     default: 300
     version_added: "1.8"
+  wait_for_instances:
+    description:
+      - Wait for the ASG instances to be in a ready state before exiting.  If instances are behind an ELB, it will wait until the instances are considered healthy by the ELB.
+    version_added: "1.9"
+    default: yes
+    required: False
 extends_documentation_fragment: aws
 """
 
@@ -254,8 +259,47 @@ def get_properties(autoscaling_group):
     return properties
 
 
-def create_autoscaling_group(connection, module):
+def wait_for_elb(asg_connection, module, group_name):
+    """Poll the group's ELBs until at least min_size instances are InService (only when health_check_type is ELB); fail_json on timeout."""
+    region, ec2_url, aws_connect_params = get_aws_connection_info(module)
+    wait_timeout = module.params.get('wait_timeout')
 
+    # if the health_check_type is ELB, we want to query the ELBs directly for instance
+    # status so as to avoid the health_check_grace period that is awarded to ASG instances
+    as_group = asg_connection.get_all_groups(names=[group_name])[0]
+
+    if as_group.load_balancers and as_group.health_check_type == 'ELB':
+        try:
+            elb_connection = connect_to_aws(boto.ec2.elb, region, **aws_connect_params)
+        except boto.exception.NoAuthHandlerFound, e:
+            module.fail_json(msg=str(e))
+        wait_timeout = time.time() + wait_timeout
+        healthy_instances = {}
+
+        while len(healthy_instances) < as_group.min_size and wait_timeout > time.time():
+            as_group = asg_connection.get_all_groups(names=[group_name])[0]
+            props = get_properties(as_group)
+            # get healthy, inservice instances from ASG
+            instances = []
+            for instance, settings in props['instance_facts'].items():
+                if settings['lifecycle_state'] == 'InService' and settings['health_status'] == 'Healthy':
+                    instances.append(instance)
+            for lb in as_group.load_balancers:
+                # we catch a race condition that sometimes happens if the instance exists in the ASG
+                # but has not yet shown up in the ELB; skip this ELB for now and retry on the next poll
+                try:
+                    lb_instances = elb_connection.describe_instance_health(lb, instances=instances)
+                except boto.exception.InvalidInstance:
+                    continue
+                for i in lb_instances:
+                    if i.state == "InService":
+                        healthy_instances[i.instance_id] = i.state
+            time.sleep(10)
+        if wait_timeout <= time.time():
+            # waiting took too long
+            module.fail_json(msg = "Waited too long for ELB instances to be healthy. %s" % time.asctime())
+
+def create_autoscaling_group(connection, module):
     group_name = module.params.get('name')
     load_balancers = module.params['load_balancers']
     availability_zones = module.params['availability_zones']
@@ -267,8 +311,9 @@ def create_autoscaling_group(connection, module):
     set_tags = module.params.get('tags')
     health_check_period = module.params.get('health_check_period')
     health_check_type = module.params.get('health_check_type')
-    
+    wait_for_instances = module.params.get('wait_for_instances')
     as_groups = connection.get_all_groups(names=[group_name])
+    wait_timeout = module.params.get('wait_timeout')
 
     if not vpc_zone_identifier and not availability_zones:
         region, ec2_url, aws_connect_params = get_aws_connection_info(module)
@@ -315,7 +360,11 @@ def create_autoscaling_group(connection, module):
 
         try:
             connection.create_auto_scaling_group(ag)
-            asg_properties = get_properties(ag)
+            if wait_for_instances == True:
+                wait_for_new_instances(module, connection, group_name, wait_timeout, desired_capacity, 'viable_instances')
+                wait_for_elb(connection, module, group_name)
+            as_group = connection.get_all_groups(names=[group_name])[0]
+            asg_properties = get_properties(as_group)
             changed = True
             return(changed, asg_properties)
         except BotoServerError, e:
@@ -375,13 +424,22 @@ def create_autoscaling_group(connection, module):
             changed = True
             as_group.load_balancers = module.params.get('load_balancers')
 
-        try:
-            if changed:
+
+        if changed:
+            try:
                 as_group.update()
+            except BotoServerError, e:
+                module.fail_json(msg=str(e))
+
+        if wait_for_instances == True:
+            wait_for_new_instances(module, connection, group_name, wait_timeout, desired_capacity, 'viable_instances')
+            wait_for_elb(connection, module, group_name)
+        try:
+            as_group = connection.get_all_groups(names=[group_name])[0]
             asg_properties = get_properties(as_group)
-            return(changed, asg_properties)
         except BotoServerError, e:
             module.fail_json(msg=str(e))
+        return(changed, asg_properties)
 
 
 def delete_autoscaling_group(connection, module):
@@ -403,6 +461,8 @@ def delete_autoscaling_group(connection, module):
             time.sleep(10)
 
         group.delete()
+        while len(connection.get_all_groups(names=[group_name])):  # poll until the deleted group is no longer listed
+            time.sleep(5)
         changed=True
         return changed
     else:
@@ -414,7 +474,6 @@ def get_chunks(l, n):
         yield l[i:i+n]
 
 def replace(connection, module):
-
     batch_size = module.params.get('replace_batch_size')
     wait_timeout = module.params.get('wait_timeout')
     group_name = module.params.get('name')
@@ -424,20 +483,11 @@ def replace(connection, module):
 
     # FIXME: we need some more docs about this feature
     replace_instances = module.params.get('replace_instances')
-    
-    # wait for instance list to be populated on a newly provisioned ASG
-    instance_wait = time.time() + 30
-    while instance_wait > time.time():
-        as_group = connection.get_all_groups(names=[group_name])[0]
-        props = get_properties(as_group)
-        if props.has_key('instances'):
-            instances = props['instances']
-            break
-        time.sleep(10)
-    if instance_wait <= time.time():
-        # waiting took too long
-        module.fail_json(msg = "Waited too long for instances to appear. %s" % time.asctime())
-    # determine if we need to continue
+
+    as_group = connection.get_all_groups(names=[group_name])[0]
+    # wait_for_new_instances looks the group up by name and returns its refreshed properties
+    props = wait_for_new_instances(module, connection, group_name, wait_timeout, as_group.min_size, 'viable_instances')
+    instances = props['instances']
     replaceable = 0
     if replace_instances:
         instances = replace_instances
@@ -450,26 +500,24 @@ def replace(connection, module):
         return(changed, props)
         
     # set temporary settings and wait for them to be reached
+    as_group = connection.get_all_groups(names=[group_name])[0]
     as_group.max_size = max_size + batch_size
     as_group.min_size = min_size + batch_size
     as_group.desired_capacity = desired_capacity + batch_size
     as_group.update()
-    wait_timeout = time.time() + wait_timeout
-    while wait_timeout > time.time() and min_size + batch_size > props['viable_instances']:
-        time.sleep(10)
-        as_groups = connection.get_all_groups(names=[group_name])
-        as_group = as_groups[0]
-        props = get_properties(as_group)
-    if wait_timeout <= time.time():
-        # waiting took too long
-        module.fail_json(msg = "Waited too long for instances to appear. %s" % time.asctime())
+    wait_for_new_instances(module, connection, group_name, wait_timeout, as_group.min_size, 'viable_instances')
+    wait_for_elb(connection, module, group_name)  # both wait helpers look the group up by name
+    as_group = connection.get_all_groups(names=[group_name])[0]
+    props = get_properties(as_group)
     instances = props['instances']
     if replace_instances:
         instances = replace_instances
     for i in get_chunks(instances, batch_size):
-        replace_batch(connection, module, i)
+        terminate_batch(connection, module, i)
+        wait_for_new_instances(module, connection, group_name, wait_timeout, as_group.min_size, 'viable_instances')
+        wait_for_elb(connection, module, group_name)
+        as_group = connection.get_all_groups(names=[group_name])[0]
     # return settings to normal
-    as_group = connection.get_all_groups(names=[group_name])[0]
     as_group.max_size = max_size 
     as_group.min_size = min_size 
     as_group.desired_capacity = desired_capacity
@@ -479,9 +527,7 @@ def replace(connection, module):
     changed=True
     return(changed, asg_properties)
 
-def replace_batch(connection, module, replace_instances):
-    
-    
+def terminate_batch(connection, module, replace_instances):
     group_name = module.params.get('name')
     wait_timeout = int(module.params.get('wait_timeout'))
     lc_check = module.params.get('lc_check')
@@ -526,27 +572,23 @@ def replace_batch(connection, module, replace_instances):
         # waiting took too long
         module.fail_json(msg = "Waited too long for old instances to terminate. %s" % time.asctime())
 
+def wait_for_new_instances(module, connection, group_name, wait_timeout, desired_size, prop):
+
     # make sure we have the latest stats after that last loop.
     as_group = connection.get_all_groups(names=[group_name])[0]
     props = get_properties(as_group)
-
     # now we make sure that we have enough instances in a viable state
     wait_timeout = time.time() + wait_timeout
-    while wait_timeout > time.time() and props['min_size'] > props['viable_instances']:
+    while wait_timeout > time.time() and desired_size > props[prop]:
         time.sleep(10)
         as_groups = connection.get_all_groups(names=[group_name])
         as_group = as_groups[0]
         props = get_properties(as_group)
-
     if wait_timeout <= time.time():
         # waiting took too long
         module.fail_json(msg = "Waited too long for new instances to become viable. %s" % time.asctime())
 
-    # collect final stats info
-    as_group = connection.get_all_groups(names=[group_name])[0]
-    asg_properties = get_properties(as_group)
-
-
+    return props
 
 def main():
     argument_spec = ec2_argument_spec()
@@ -569,6 +611,7 @@ def main():
             tags=dict(type='list', default=[]),
             health_check_period=dict(type='int', default=300),
             health_check_type=dict(default='EC2', choices=['EC2', 'ELB']),
+            wait_for_instances=dict(type='bool', default=True)
         ),
     )
     
@@ -576,7 +619,6 @@ def main():
         argument_spec=argument_spec, 
         mutually_exclusive = [['replace_all_instances', 'replace_instances']]
     )
-
     state = module.params.get('state')
     replace_instances = module.params.get('replace_instances')
     replace_all_instances = module.params.get('replace_all_instances')
@@ -588,7 +630,6 @@ def main():
     except boto.exception.NoAuthHandlerFound, e:
         module.fail_json(msg=str(e))
     changed = create_changed = replace_changed = False
-    
 
     if state == 'present':
         create_changed, asg_properties=create_autoscaling_group(connection, module)
@@ -601,4 +642,5 @@ def main():
         changed = True
     module.exit_json( changed = changed, **asg_properties )
 
+
 main()