Merge pull request #8901 from jsmartin/ec2_replace_all

Rolling Instance Replace.  Fixes #8501.
This commit is contained in:
Michael DeHaan 2014-09-10 17:42:35 -04:00
commit 717b53cad9

View file

@ -57,6 +57,30 @@ options:
description: description:
- Desired number of instances in group - Desired number of instances in group
required: false required: false
replace_all_instances:
description:
- In a rolling fashion, replace all instances that have an old launch configuration with ones using the current launch configuration.
required: false
version_added: "1.8"
default: False
replace_batch_size:
description:
- Number of instances you'd like to replace at a time. Used with replace_all_instances.
required: false
version_added: "1.8"
default: 1
replace_instances:
description:
- List of instance_ids belonging to the named ASG that you would like to terminate and be replaced with instances matching the current launch configuration.
required: false
version_added: "1.8"
default: None
lc_check:
description:
- Check to make sure instances that are being replaced with replace_instances do not already have the current launch_config.
required: false
version_added: "1.8"
default: True
region: region:
description: description:
- The AWS region to use. If not specified then the value of the EC2_REGION environment variable, if any, is used. - The AWS region to use. If not specified then the value of the EC2_REGION environment variable, if any, is used.
@ -86,6 +110,11 @@ options:
default: EC2 default: EC2
version_added: "1.7" version_added: "1.7"
choices: ['EC2', 'ELB'] choices: ['EC2', 'ELB']
wait_timeout:
description:
- How long to wait, in seconds, for instances to become viable when they are replaced. Used in conjunction with the replace_all_instances and replace_instances options.
default: 300
version_added: "1.8"
extends_documentation_fragment: aws extends_documentation_fragment: aws
""" """
@ -109,6 +138,51 @@ deprecated method of expressing tags:
value: production value: production
propagate_at_launch: no propagate_at_launch: no
Example of how to assign a new launch config to an ASG and terminate old instances.
All instances in "myasg" that do not have the launch configuration named "my_new_lc" will be terminated in
a rolling fashion and replaced with instances using the current launch configuration, "my_new_lc".
This could also be considered a rolling deploy of a pre-baked AMI.
If this is a newly created group, the instances will not be replaced since all instances
will have the current launch configuration.
- name: create launch config
ec2_lc:
name: my_new_lc
image_id: ami-lkajsf
key_name: mykey
region: us-east-1
security_groups: sg-23423
instance_type: m1.small
assign_public_ip: yes
- ec2_asg:
name: myasg
launch_config_name: my_new_lc
health_check_period: 60
health_check_type: ELB
replace_all_instances: yes
min_size: 5
max_size: 5
desired_capacity: 5
region: us-east-1
If you only wanted to replace a couple of instances instead of all of them, supply a list
to "replace_instances":
- ec2_asg:
name: myasg
launch_config_name: my_new_lc
health_check_period: 60
health_check_type: ELB
replace_instances:
- i-b345231
- i-24c2931
min_size: 5
max_size: 5
desired_capacity: 5
region: us-east-1
''' '''
import sys import sys
@ -130,6 +204,8 @@ ASG_ATTRIBUTES = ('availability_zones', 'default_cooldown', 'desired_capacity',
'load_balancers', 'max_size', 'min_size', 'name', 'placement_group', 'load_balancers', 'max_size', 'min_size', 'name', 'placement_group',
'tags', 'termination_policies', 'vpc_zone_identifier') 'tags', 'termination_policies', 'vpc_zone_identifier')
INSTANCE_ATTRIBUTES = ('instance_id', 'health_status', 'lifecycle_state', 'launch_config_name')
def enforce_required_arguments(module): def enforce_required_arguments(module):
''' As many arguments are not required for autoscale group deletion ''' As many arguments are not required for autoscale group deletion
they cannot be mandatory arguments for the module, so we enforce they cannot be mandatory arguments for the module, so we enforce
@ -144,8 +220,33 @@ def enforce_required_arguments(module):
def get_properties(autoscaling_group):
    ''' Summarise an autoscaling group as a plain dict.

    The result holds every attribute named in ASG_ATTRIBUTES, a set of
    per-state instance counters, the group's load balancers and a
    per-instance ``instance_facts`` mapping (instance id -> health status,
    lifecycle state and launch configuration name).

    ``instance_facts`` is always present (empty for a group without
    instances) so callers can index it without a KeyError.  ``instances``
    is deliberately only set once the group actually reports instances,
    because replace() polls for the appearance of that key. '''
    properties = dict((attr, getattr(autoscaling_group, attr)) for attr in ASG_ATTRIBUTES)

    # every counter starts at zero so the keys exist even for an empty group
    properties['healthy_instances'] = 0
    properties['in_service_instances'] = 0
    properties['unhealthy_instances'] = 0
    properties['pending_instances'] = 0
    properties['viable_instances'] = 0
    properties['terminating_instances'] = 0

    instance_facts = {}
    if autoscaling_group.instances:
        properties['instances'] = [i.instance_id for i in autoscaling_group.instances]
        for i in autoscaling_group.instances:
            instance_facts[i.instance_id] = {'health_status': i.health_status,
                                             'lifecycle_state': i.lifecycle_state,
                                             'launch_config_name': i.launch_config_name}
            # "viable" means healthy AND in service
            if i.health_status == 'Healthy' and i.lifecycle_state == 'InService':
                properties['viable_instances'] += 1
            if i.health_status == 'Healthy':
                properties['healthy_instances'] += 1
            else:
                properties['unhealthy_instances'] += 1
            if i.lifecycle_state == 'InService':
                properties['in_service_instances'] += 1
            if i.lifecycle_state == 'Terminating':
                properties['terminating_instances'] += 1
            if i.lifecycle_state == 'Pending':
                properties['pending_instances'] += 1
    properties['instance_facts'] = instance_facts
    properties['load_balancers'] = autoscaling_group.load_balancers
    return properties
@ -210,16 +311,30 @@ def create_autoscaling_group(connection, module):
try: try:
connection.create_auto_scaling_group(ag) connection.create_auto_scaling_group(ag)
asg_properties = get_properties(ag) asg_properties = get_properties(ag)
module.exit_json(changed=True, **asg_properties) changed = True
return(changed, asg_properties)
except BotoServerError, e: except BotoServerError, e:
module.fail_json(msg=str(e)) module.fail_json(msg=str(e))
else: else:
as_group = as_groups[0] as_group = as_groups[0]
changed = False changed = False
for attr in ASG_ATTRIBUTES: for attr in ASG_ATTRIBUTES:
if module.params.get(attr) and getattr(as_group, attr) != module.params.get(attr): if module.params.get(attr):
module_attr = module.params.get(attr)
group_attr = getattr(as_group, attr)
# we do this because AWS and the module may return the same list
# sorted differently
try:
module_attr.sort()
except:
pass
try:
group_attr.sort()
except:
pass
if group_attr != module_attr:
changed = True changed = True
setattr(as_group, attr, module.params.get(attr)) setattr(as_group, attr, module_attr)
if len(set_tags) > 0: if len(set_tags) > 0:
existing_tags = as_group.tags existing_tags = as_group.tags
@ -256,10 +371,11 @@ def create_autoscaling_group(connection, module):
if changed: if changed:
as_group.update() as_group.update()
asg_properties = get_properties(as_group) asg_properties = get_properties(as_group)
module.exit_json(changed=changed, **asg_properties) return(changed, asg_properties)
except BotoServerError, e: except BotoServerError, e:
module.fail_json(msg=str(e)) module.fail_json(msg=str(e))
result = as_groups[0] result = as_groups[0]
module.exit_json(changed=changed, name=result.name, module.exit_json(changed=changed, name=result.name,
autoscaling_group_arn=result.autoscaling_group_arn, autoscaling_group_arn=result.autoscaling_group_arn,
@ -274,6 +390,7 @@ def create_autoscaling_group(connection, module):
load_balancers=result.load_balancers, load_balancers=result.load_balancers,
min_size=result.min_size, max_size=result.max_size, min_size=result.min_size, max_size=result.max_size,
placement_group=result.placement_group, placement_group=result.placement_group,
wait_timeout = dict(default=300),
tags=result.tags, tags=result.tags,
termination_policies=result.termination_policies, termination_policies=result.termination_policies,
vpc_zone_identifier=result.vpc_zone_identifier) vpc_zone_identifier=result.vpc_zone_identifier)
@ -298,9 +415,148 @@ def delete_autoscaling_group(connection, module):
time.sleep(10) time.sleep(10)
group.delete() group.delete()
module.exit_json(changed=True) changed=True
return changed
else: else:
module.exit_json(changed=False) changed=False
return changed
def get_chunks(l, n):
    ''' Yield successive slices of ``l`` holding at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` yields nothing.

    Raises ValueError (on first iteration) when ``n`` is smaller than 1,
    instead of silently producing no chunks for a negative size. '''
    if n < 1:
        raise ValueError("chunk size must be at least 1, got %r" % (n,))
    # range() rather than xrange() so the helper runs on Python 2 and 3 alike
    for i in range(0, len(l), n):
        yield l[i:i+n]
def replace(connection, module):
    ''' Replace instances of an autoscaling group in rolling batches.

    The group is temporarily grown by ``replace_batch_size`` (max_size,
    min_size and desired_capacity are all raised), the targeted instances
    are handed to replace_batch() one chunk at a time, and the sizes are
    then put back.  The targets are the ids in ``replace_instances`` when
    that list is given, otherwise every instance in the group.

    Returns a ``(changed, properties)`` tuple.  ``changed`` is False when
    none of the targeted instances runs a launch configuration other than
    the group's current one.  Calls module.fail_json() when instances do
    not show up or do not become viable in time. '''
    batch_size = module.params.get('replace_batch_size')
    wait_timeout = module.params.get('wait_timeout')
    # NOTE(review): the documented examples address the group through the
    # ``name`` option; 'group_name' is kept as a fallback - confirm against
    # the argument spec.
    group_name = module.params.get('name') or module.params.get('group_name')
    max_size = module.params.get('max_size')
    min_size = module.params.get('min_size')
    desired_capacity = module.params.get('desired_capacity')
    replace_instances = module.params.get('replace_instances')

    # reject a useless batch size before touching the group
    if not batch_size or batch_size < 1:
        module.fail_json(msg="replace_batch_size must be at least 1.")

    # wait for instance list to be populated on a newly provisioned ASG
    as_group = None
    props = None
    populated = False
    instance_deadline = time.time() + 30
    while time.time() < instance_deadline:
        as_group = connection.get_all_groups(names=[group_name])[0]
        props = get_properties(as_group)
        if 'instances' in props:
            populated = True
            break
        time.sleep(10)
    # fail on the actual outcome, not on the clock: a poll that succeeded
    # right at the deadline must not be reported as a timeout
    if not populated:
        module.fail_json(msg="Waited too long for instances to appear. %s" % time.asctime())

    instances = props['instances']
    if replace_instances:
        instances = replace_instances

    # determine if we need to continue
    current_lc = props['launch_config_name']
    replaceable = 0
    for instance_id, facts in props['instance_facts'].items():
        if instance_id in instances and facts['launch_config_name'] != current_lc:
            replaceable += 1
    if replaceable == 0:
        changed = False
        return(changed, props)

    # options left unset fall back to the group's current values, so the
    # arithmetic below never adds an int to None
    if max_size is None:
        max_size = as_group.max_size
    if min_size is None:
        min_size = as_group.min_size
    if desired_capacity is None:
        desired_capacity = as_group.desired_capacity

    # set temporary settings and wait for them to be reached
    as_group.max_size = max_size + batch_size
    as_group.min_size = min_size + batch_size
    as_group.desired_capacity = desired_capacity + batch_size
    as_group.update()

    viable_target = min_size + batch_size
    viable_deadline = time.time() + wait_timeout
    while props['viable_instances'] < viable_target and time.time() < viable_deadline:
        time.sleep(10)
        as_group = connection.get_all_groups(names=[group_name])[0]
        props = get_properties(as_group)
    if props['viable_instances'] < viable_target:
        module.fail_json(msg="Waited too long for new instances to become viable. %s" % time.asctime())

    instances = props['instances']
    if replace_instances:
        instances = replace_instances
    for batch in get_chunks(instances, batch_size):
        replace_batch(connection, module, batch)

    # return settings to normal
    as_group = connection.get_all_groups(names=[group_name])[0]
    as_group.max_size = max_size
    as_group.min_size = min_size
    as_group.desired_capacity = desired_capacity
    as_group.update()

    as_group = connection.get_all_groups(names=[group_name])[0]
    asg_properties = get_properties(as_group)
    changed = True
    return(changed, asg_properties)
def replace_batch(connection, module, replace_instances):
    ''' Replace one batch of instances in the autoscaling group.

    Instance ids in ``replace_instances`` that are not members of the group
    are ignored.  With ``lc_check`` enabled, instances that already run the
    group's current launch configuration are skipped as well.  The
    remaining instances are marked Unhealthy; the function then waits until
    none of them is reported as Unhealthy or Terminating any more, and
    until the group has at least ``min_size`` viable instances again.

    Each of the two waits gets its own ``wait_timeout`` seconds and ends in
    module.fail_json() when it runs out.  Returns the group's properties as
    read after the batch completed (callers may ignore the value). '''
    # NOTE(review): the documented examples address the group through the
    # ``name`` option; 'group_name' is kept as a fallback - confirm against
    # the argument spec.
    group_name = module.params.get('name') or module.params.get('group_name')
    wait_timeout = int(module.params.get('wait_timeout'))
    lc_check = module.params.get('lc_check')

    as_group = connection.get_all_groups(names=[group_name])[0]
    props = get_properties(as_group)

    # check to make sure instances given are actually in the given ASG
    # and they have a non-current launch config
    group_instances = props.get('instances', [])
    instance_facts = props.get('instance_facts', {})
    # real lists, not generators: the selection is walked more than once
    # below and a generator would be exhausted after the first pass
    candidates = [inst_id for inst_id in replace_instances if inst_id in group_instances]
    if lc_check:
        current_lc = props['launch_config_name']
        old_instances = [inst_id for inst_id in candidates
                         if instance_facts[inst_id]['launch_config_name'] != current_lc]
    else:
        old_instances = candidates

    # set all instances given to unhealthy
    for instance_id in old_instances:
        connection.set_instance_health(instance_id, 'Unhealthy')

    # we wait to make sure the machines we marked as Unhealthy are
    # no longer in the list
    lingering = len(old_instances)
    terminate_deadline = time.time() + wait_timeout
    while lingering > 0 and time.time() < terminate_deadline:
        as_group = connection.get_all_groups(names=[group_name])[0]
        props = get_properties(as_group)
        instance_facts = props.get('instance_facts', {})
        lingering = 0
        for instance_id in old_instances:
            facts = instance_facts.get(instance_id)
            if facts is None:
                # no longer part of the group
                continue
            if (facts['lifecycle_state'] == 'Terminating'
                    or facts['health_status'] == 'Unhealthy'):
                lingering += 1
        if lingering > 0:
            time.sleep(10)
    if lingering > 0:
        # waiting took too long
        module.fail_json(msg="Waited too long for old instances to terminate. %s" % time.asctime())

    # make sure we have the latest stats after that last loop.
    as_group = connection.get_all_groups(names=[group_name])[0]
    props = get_properties(as_group)

    # now we make sure that we have enough instances in a viable state;
    # this wait gets a fresh deadline rather than stacking onto the
    # (already absolute) one used above
    viable_deadline = time.time() + wait_timeout
    while props['min_size'] > props['viable_instances'] and time.time() < viable_deadline:
        time.sleep(10)
        as_group = connection.get_all_groups(names=[group_name])[0]
        props = get_properties(as_group)
    if props['min_size'] > props['viable_instances']:
        # waiting took too long
        module.fail_json(msg="Waited too long for new instances to become viable. %s" % time.asctime())

    # collect final stats info
    as_group = connection.get_all_groups(names=[group_name])[0]
    asg_properties = get_properties(as_group)
    return asg_properties
def main(): def main():
@ -315,6 +571,11 @@ def main():
max_size=dict(type='int'), max_size=dict(type='int'),
desired_capacity=dict(type='int'), desired_capacity=dict(type='int'),
vpc_zone_identifier=dict(type='str'), vpc_zone_identifier=dict(type='str'),
replace_batch_size=dict(type='int', default=1),
replace_all_instances=dict(type='bool', default=False),
replace_instances=dict(type='list', default=[]),
lc_check=dict(type='bool', default=True),
wait_timeout=dict(type='int', default=300),
state=dict(default='present', choices=['present', 'absent']), state=dict(default='present', choices=['present', 'absent']),
tags=dict(type='list', default=[]), tags=dict(type='list', default=[]),
health_check_period=dict(type='int', default=300), health_check_period=dict(type='int', default=300),
@ -324,7 +585,8 @@ def main():
module = AnsibleModule(argument_spec=argument_spec) module = AnsibleModule(argument_spec=argument_spec)
state = module.params.get('state') state = module.params.get('state')
replace_instances = module.params.get('replace_instances')
replace_all_instances = module.params.get('replace_all_instances')
region, ec2_url, aws_connect_params = get_aws_connection_info(module) region, ec2_url, aws_connect_params = get_aws_connection_info(module)
try: try:
connection = connect_to_aws(boto.ec2.autoscale, region, **aws_connect_params) connection = connect_to_aws(boto.ec2.autoscale, region, **aws_connect_params)
@ -332,10 +594,18 @@ def main():
module.fail_json(msg="failed to connect to AWS for the given region: %s" % str(region)) module.fail_json(msg="failed to connect to AWS for the given region: %s" % str(region))
except boto.exception.NoAuthHandlerFound, e: except boto.exception.NoAuthHandlerFound, e:
module.fail_json(msg=str(e)) module.fail_json(msg=str(e))
changed = False
if replace_all_instances and replace_instances:
module.fail_json(msg="You can't use replace_instances and replace_all_instances in the same task.")
if state == 'present': if state == 'present':
create_autoscaling_group(connection, module) create_changed, asg_properties=create_autoscaling_group(connection, module)
if replace_all_instances or replace_instances:
replace_changed, asg_properties=replace(connection, module)
elif state == 'absent': elif state == 'absent':
delete_autoscaling_group(connection, module) changed = delete_autoscaling_group(connection, module)
module.exit_json( changed = changed )
if create_changed or replace_changed:
changed = True
module.exit_json( changed = changed, **asg_properties )
main() main()