Check the load balancer for instance health, and refactor the rolling-replace functions.

Optionally wait for new instances to reach a viable state on ASG creation/update.

Properly update the returned properties and wait for the group to finish terminating.
This commit is contained in:
James Martin 2015-01-08 17:24:44 -05:00 committed by Matt Clay
parent 7f8094e977
commit cadfd56304

View file

@ -13,7 +13,6 @@
# #
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with Ansible. If not, see <http://www.gnu.org/licenses/>. # along with Ansible. If not, see <http://www.gnu.org/licenses/>.
DOCUMENTATION = """ DOCUMENTATION = """
--- ---
module: ec2_asg module: ec2_asg
@ -115,6 +114,12 @@ options:
- how long to wait for instances to become viable when replaced. Used in conjunction with instance_ids option. - how long to wait for instances to become viable when replaced. Used in conjunction with instance_ids option.
default: 300 default: 300
version_added: "1.8" version_added: "1.8"
wait_for_instances:
description:
- Wait for the ASG instances to be in a ready state before exiting. If instances are behind an ELB, it will wait until the instances are considered healthy by the ELB.
version_added: "1.9"
default: yes
required: False
extends_documentation_fragment: aws extends_documentation_fragment: aws
""" """
@ -254,8 +259,47 @@ def get_properties(autoscaling_group):
return properties return properties
def wait_for_elb(asg_connection, module, group_name):
    """Wait until enough of the group's instances are InService in its ELBs.

    Does nothing unless the group has load balancers attached and its
    health_check_type is 'ELB'. Otherwise it polls the ELBs every 10 seconds
    until at least ``min_size`` instances have been reported "InService", or
    until the module's ``wait_timeout`` (seconds) has elapsed.

    :param asg_connection: autoscale connection used to look the group up
    :param module: the module object; supplies ``wait_timeout`` and the AWS
        connection settings, and is used to report failures via ``fail_json``
    :param group_name: name of the auto scaling group. A group object is
        tolerated as well (see the note in the body).
    :returns: None. ``module.fail_json`` is called if the ELB connection
        cannot be authenticated or the wait times out.
    """
    region, ec2_url, aws_connect_params = get_aws_connection_info(module)
    wait_timeout = module.params.get('wait_timeout')

    # replace() in this file passes the group object rather than its name at
    # one call site; accept either so the lookup below is always by name.
    # NOTE(review): assumes the group object exposes `.name` -- confirm.
    group_name = getattr(group_name, 'name', group_name)

    # if the health_check_type is ELB, we want to query the ELBs directly for instance
    # status as to avoid health_check_grace period that is awarded to ASG instances
    as_group = asg_connection.get_all_groups(names=[group_name])[0]

    if not (as_group.load_balancers and as_group.health_check_type == 'ELB'):
        return

    try:
        elb_connection = connect_to_aws(boto.ec2.elb, region, **aws_connect_params)
    except boto.exception.NoAuthHandlerFound as e:
        module.fail_json(msg=str(e))

    deadline = time.time() + wait_timeout
    healthy_instances = {}

    while len(healthy_instances) < as_group.min_size and deadline > time.time():
        as_group = asg_connection.get_all_groups(names=[group_name])[0]
        props = get_properties(as_group)

        # get healthy, inservice instances from ASG
        instances = [
            instance
            for instance, settings in props['instance_facts'].items()
            if settings['lifecycle_state'] == 'InService' and settings['health_status'] == 'Healthy'
        ]

        for lb in as_group.load_balancers:
            # we catch a race condition that sometimes happens if the instance exists in the ASG
            # but has not yet shown up in the ELB
            try:
                lb_instances = elb_connection.describe_instance_health(lb, instances=instances)
            except boto.exception.InvalidInstance:
                # Nothing usable came back for this ELB on this poll; skip it
                # instead of iterating an unbound (or stale) result.
                continue

            for i in lb_instances:
                if i.state == "InService":
                    healthy_instances[i.instance_id] = i.state

        time.sleep(10)

    # Decide on the outcome, not on the clock: the loop can finish successfully
    # on a poll whose trailing sleep carries us past the deadline.
    if len(healthy_instances) < as_group.min_size:
        # waiting took too long
        module.fail_json(msg="Waited too long for ELB instances to be healthy. %s" % time.asctime())
def create_autoscaling_group(connection, module):
group_name = module.params.get('name') group_name = module.params.get('name')
load_balancers = module.params['load_balancers'] load_balancers = module.params['load_balancers']
availability_zones = module.params['availability_zones'] availability_zones = module.params['availability_zones']
@ -267,8 +311,9 @@ def create_autoscaling_group(connection, module):
set_tags = module.params.get('tags') set_tags = module.params.get('tags')
health_check_period = module.params.get('health_check_period') health_check_period = module.params.get('health_check_period')
health_check_type = module.params.get('health_check_type') health_check_type = module.params.get('health_check_type')
wait_for_instances = module.params.get('wait_for_instances')
as_groups = connection.get_all_groups(names=[group_name]) as_groups = connection.get_all_groups(names=[group_name])
wait_timeout = module.params.get('wait_timeout')
if not vpc_zone_identifier and not availability_zones: if not vpc_zone_identifier and not availability_zones:
region, ec2_url, aws_connect_params = get_aws_connection_info(module) region, ec2_url, aws_connect_params = get_aws_connection_info(module)
@ -315,7 +360,11 @@ def create_autoscaling_group(connection, module):
try: try:
connection.create_auto_scaling_group(ag) connection.create_auto_scaling_group(ag)
asg_properties = get_properties(ag) if wait_for_instances == True:
wait_for_new_instances(module, connection, group_name, wait_timeout, desired_capacity, 'viable_instances')
wait_for_elb(connection, module, group_name)
as_group = connection.get_all_groups(names=[group_name])[0]
asg_properties = get_properties(as_group)
changed = True changed = True
return(changed, asg_properties) return(changed, asg_properties)
except BotoServerError, e: except BotoServerError, e:
@ -375,14 +424,23 @@ def create_autoscaling_group(connection, module):
changed = True changed = True
as_group.load_balancers = module.params.get('load_balancers') as_group.load_balancers = module.params.get('load_balancers')
try:
if changed: if changed:
try:
as_group.update() as_group.update()
asg_properties = get_properties(as_group)
return(changed, asg_properties)
except BotoServerError, e: except BotoServerError, e:
module.fail_json(msg=str(e)) module.fail_json(msg=str(e))
if wait_for_instances == True:
wait_for_new_instances(module, connection, group_name, wait_timeout, desired_capacity, 'viable_instances')
wait_for_elb(connection, module, group_name)
try:
as_group = connection.get_all_groups(names=[group_name])[0]
asg_properties = get_properties(as_group)
except BotoServerError, e:
module.fail_json(msg=str(e))
return(changed, asg_properties)
def delete_autoscaling_group(connection, module): def delete_autoscaling_group(connection, module):
group_name = module.params.get('name') group_name = module.params.get('name')
@ -403,6 +461,8 @@ def delete_autoscaling_group(connection, module):
time.sleep(10) time.sleep(10)
group.delete() group.delete()
while len(connection.get_all_groups(names=[group_name])):
time.sleep(5)
changed=True changed=True
return changed return changed
else: else:
@ -414,7 +474,6 @@ def get_chunks(l, n):
yield l[i:i+n] yield l[i:i+n]
def replace(connection, module): def replace(connection, module):
batch_size = module.params.get('replace_batch_size') batch_size = module.params.get('replace_batch_size')
wait_timeout = module.params.get('wait_timeout') wait_timeout = module.params.get('wait_timeout')
group_name = module.params.get('name') group_name = module.params.get('name')
@ -425,19 +484,10 @@ def replace(connection, module):
# FIXME: we need some more docs about this feature # FIXME: we need some more docs about this feature
replace_instances = module.params.get('replace_instances') replace_instances = module.params.get('replace_instances')
# wait for instance list to be populated on a newly provisioned ASG
instance_wait = time.time() + 30
while instance_wait > time.time():
as_group = connection.get_all_groups(names=[group_name])[0] as_group = connection.get_all_groups(names=[group_name])[0]
wait_for_new_instances(module, connection, as_group, wait_timeout, as_group.min_size, 'viable_instances')
props = get_properties(as_group) props = get_properties(as_group)
if props.has_key('instances'):
instances = props['instances'] instances = props['instances']
break
time.sleep(10)
if instance_wait <= time.time():
# waiting took too long
module.fail_json(msg = "Waited too long for instances to appear. %s" % time.asctime())
# determine if we need to continue
replaceable = 0 replaceable = 0
if replace_instances: if replace_instances:
instances = replace_instances instances = replace_instances
@ -450,26 +500,24 @@ def replace(connection, module):
return(changed, props) return(changed, props)
# set temporary settings and wait for them to be reached # set temporary settings and wait for them to be reached
as_group = connection.get_all_groups(names=[group_name])[0]
as_group.max_size = max_size + batch_size as_group.max_size = max_size + batch_size
as_group.min_size = min_size + batch_size as_group.min_size = min_size + batch_size
as_group.desired_capacity = desired_capacity + batch_size as_group.desired_capacity = desired_capacity + batch_size
as_group.update() as_group.update()
wait_timeout = time.time() + wait_timeout wait_for_new_instances(module, connection, as_group, wait_timeout, as_group.min_size, 'viable_instances')
while wait_timeout > time.time() and min_size + batch_size > props['viable_instances']: wait_for_elb(connection, module, as_group)
time.sleep(10) as_group = connection.get_all_groups(names=[group_name])[0]
as_groups = connection.get_all_groups(names=[group_name])
as_group = as_groups[0]
props = get_properties(as_group) props = get_properties(as_group)
if wait_timeout <= time.time():
# waiting took too long
module.fail_json(msg = "Waited too long for instances to appear. %s" % time.asctime())
instances = props['instances'] instances = props['instances']
if replace_instances: if replace_instances:
instances = replace_instances instances = replace_instances
for i in get_chunks(instances, batch_size): for i in get_chunks(instances, batch_size):
replace_batch(connection, module, i) terminate_batch(connection, module, i)
# return settings to normal wait_for_new_instances(module, connection, as_group, wait_timeout, as_group.min_size, 'viable_instances')
wait_for_elb(connection, module, group_name)
as_group = connection.get_all_groups(names=[group_name])[0] as_group = connection.get_all_groups(names=[group_name])[0]
# return settings to normal
as_group.max_size = max_size as_group.max_size = max_size
as_group.min_size = min_size as_group.min_size = min_size
as_group.desired_capacity = desired_capacity as_group.desired_capacity = desired_capacity
@ -479,9 +527,7 @@ def replace(connection, module):
changed=True changed=True
return(changed, asg_properties) return(changed, asg_properties)
def replace_batch(connection, module, replace_instances): def terminate_batch(connection, module, replace_instances):
group_name = module.params.get('name') group_name = module.params.get('name')
wait_timeout = int(module.params.get('wait_timeout')) wait_timeout = int(module.params.get('wait_timeout'))
lc_check = module.params.get('lc_check') lc_check = module.params.get('lc_check')
@ -526,27 +572,23 @@ def replace_batch(connection, module, replace_instances):
# waiting took too long # waiting took too long
module.fail_json(msg = "Waited too long for old instances to terminate. %s" % time.asctime()) module.fail_json(msg = "Waited too long for old instances to terminate. %s" % time.asctime())
def wait_for_new_instances(module, connection, group_name, wait_timeout, desired_size, prop):
    """Poll the group until ``props[prop]`` reaches ``desired_size``.

    The group's properties are re-read every 10 seconds until the property
    named by ``prop`` (for example 'viable_instances') is at least
    ``desired_size``, or until ``wait_timeout`` seconds have elapsed.

    :param module: the module object, used to report a timeout via ``fail_json``
    :param connection: autoscale connection used to look the group up
    :param group_name: name of the auto scaling group. A group object is
        tolerated as well (see the note in the body).
    :param wait_timeout: maximum number of seconds to wait
    :param desired_size: value that ``props[prop]`` must reach
    :param prop: key into the dict returned by ``get_properties``
    :returns: the most recently fetched properties dict
    """
    # replace() in this file passes the group object rather than its name at
    # some call sites; accept either so the lookups below are always by name.
    # NOTE(review): assumes the group object exposes `.name` -- confirm.
    group_name = getattr(group_name, 'name', group_name)

    # make sure we have the latest stats after that last loop.
    as_group = connection.get_all_groups(names=[group_name])[0]
    props = get_properties(as_group)

    # now we make sure that we have enough instances in a viable state
    deadline = time.time() + wait_timeout
    while deadline > time.time() and desired_size > props[prop]:
        time.sleep(10)
        as_group = connection.get_all_groups(names=[group_name])[0]
        props = get_properties(as_group)

    # Fail on the outcome rather than on the clock: the target may be reached
    # on the very poll that also crosses the deadline.
    if desired_size > props[prop]:
        # waiting took too long
        module.fail_json(msg="Waited too long for new instances to become viable. %s" % time.asctime())

    return props
def main(): def main():
argument_spec = ec2_argument_spec() argument_spec = ec2_argument_spec()
@ -569,6 +611,7 @@ def main():
tags=dict(type='list', default=[]), tags=dict(type='list', default=[]),
health_check_period=dict(type='int', default=300), health_check_period=dict(type='int', default=300),
health_check_type=dict(default='EC2', choices=['EC2', 'ELB']), health_check_type=dict(default='EC2', choices=['EC2', 'ELB']),
wait_for_instances=dict(type='bool', default=True)
), ),
) )
@ -576,7 +619,6 @@ def main():
argument_spec=argument_spec, argument_spec=argument_spec,
mutually_exclusive = [['replace_all_instances', 'replace_instances']] mutually_exclusive = [['replace_all_instances', 'replace_instances']]
) )
state = module.params.get('state') state = module.params.get('state')
replace_instances = module.params.get('replace_instances') replace_instances = module.params.get('replace_instances')
replace_all_instances = module.params.get('replace_all_instances') replace_all_instances = module.params.get('replace_all_instances')
@ -589,7 +631,6 @@ def main():
module.fail_json(msg=str(e)) module.fail_json(msg=str(e))
changed = create_changed = replace_changed = False changed = create_changed = replace_changed = False
if state == 'present': if state == 'present':
create_changed, asg_properties=create_autoscaling_group(connection, module) create_changed, asg_properties=create_autoscaling_group(connection, module)
elif state == 'absent': elif state == 'absent':
@ -601,4 +642,5 @@ def main():
changed = True changed = True
module.exit_json( changed = changed, **asg_properties ) module.exit_json( changed = changed, **asg_properties )
main() main()