From 135b92bf4e1710f9fbad54725228a4335a027e80 Mon Sep 17 00:00:00 2001
From: Ryan Brown <sb@ryansb.com>
Date: Fri, 30 Sep 2016 11:26:11 -0400
Subject: [PATCH] Check status of finished spot instance requests (#4990)

Per #3877, the code to wait for spot instance requests to finish would
hang for the full wait time if any spot request failed for any reason.
This commit introduces status checks for spot requests, so if the
request fails, finishes, or is cancelled the task will fail/succeed
accordingly.

One edge case introduced here is that if a user terminates the instance
associated with the request manually, it won't fail the play, under the
presumption that the user *wants* the instance terminated.
---
 lib/ansible/modules/cloud/amazon/ec2.py | 75 +++++++++++++++++++------
 1 file changed, 58 insertions(+), 17 deletions(-)

diff --git a/lib/ansible/modules/cloud/amazon/ec2.py b/lib/ansible/modules/cloud/amazon/ec2.py
index c8649f3d3d9..eb56684b044 100755
--- a/lib/ansible/modules/cloud/amazon/ec2.py
+++ b/lib/ansible/modules/cloud/amazon/ec2.py
@@ -793,6 +793,63 @@ def boto_supports_param_in_spot_request(ec2, param):
     method = getattr(ec2, 'request_spot_instances')
     return param in method.func_code.co_varnames
 
+def await_spot_requests(module, ec2, spot_requests, count):
+    """
+    Wait for a group of spot requests to be fulfilled, or fail.
+
+    module: Ansible module object
+    ec2: authenticated ec2 connection object
+    spot_requests: iterable of boto.ec2.spotinstancerequest.SpotInstanceRequest objects returned by ec2.request_spot_instances
+    count: Total number of instances to be created by the spot requests
+
+    Returns:
+        list of instance ID's created by the spot request(s); module.fail_json is called on request failure or timeout
+    """
+    spot_wait_timeout = int(module.params.get('spot_wait_timeout'))
+    wait_complete = time.time() + spot_wait_timeout
+
+    spot_req_inst_ids = dict()
+    while time.time() < wait_complete:
+        reqs = ec2.get_all_spot_instance_requests()
+        for sirb in spot_requests:
+            if sirb.id in spot_req_inst_ids:
+                continue
+            for sir in reqs:
+                if sir.id != sirb.id:
+                    continue # this is not our spot instance
+                if sir.instance_id is not None:
+                    spot_req_inst_ids[sirb.id] = sir.instance_id
+                elif sir.state == 'open':
+                    continue # still waiting, nothing to do here
+                elif sir.state == 'active':
+                    continue # Instance is created already, nothing to do here
+                elif sir.state == 'failed':
+                    module.fail_json(msg="Spot instance request %s failed with status %s and fault %s:%s" % (
+                        sir.id, sir.status.code, sir.fault.code, sir.fault.message))
+                elif sir.state == 'cancelled':
+                    module.fail_json(msg="Spot instance request %s was cancelled before it could be fulfilled." % sir.id)
+                elif sir.state == 'closed':
+                    # instance is terminating or marked for termination.
+                    # This may be intentional on the part of the operator,
+                    # or it may have been terminated by AWS due to capacity,
+                    # price, or group constraints. In this case, we'll fail
+                    # the module if the reason for the state is anything
+                    # other than termination by user. Codes are documented at
+                    # http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-bid-status.html
+                    if sir.status.code == 'instance-terminated-by-user':
+                        # do nothing, since the user likely did this on purpose
+                        pass
+                    else:
+                        spot_msg = "Spot instance request %s was closed by AWS with the status %s and fault %s:%s"
+                        module.fail_json(msg=spot_msg % (sir.id, sir.status.code, sir.fault.code, sir.fault.message))
+
+        if len(spot_req_inst_ids) < count:
+            time.sleep(5)
+        else:
+            return spot_req_inst_ids.values()
+    module.fail_json(msg = "wait for spot requests timeout on %s" % time.asctime())
+
+
 def enforce_count(module, ec2, vpc):
 
     exact_count = module.params.get('exact_count')
@@ -1103,23 +1160,7 @@ def create_instances(module, ec2, vpc, override_count=None):
 
                 # Now we have to do the intermediate waiting
                 if wait:
-                    spot_req_inst_ids = dict()
-                    spot_wait_timeout = time.time() + spot_wait_timeout
-                    while spot_wait_timeout > time.time():
-                        reqs = ec2.get_all_spot_instance_requests()
-                        for sirb in res:
-                            if sirb.id in spot_req_inst_ids:
-                                continue
-                            for sir in reqs:
-                                if sir.id == sirb.id and sir.instance_id is not None:
-                                    spot_req_inst_ids[sirb.id] = sir.instance_id
-                        if len(spot_req_inst_ids) < count:
-                            time.sleep(5)
-                        else:
-                            break
-                    if spot_wait_timeout <= time.time():
-                        module.fail_json(msg = "wait for spot requests timeout on %s" % time.asctime())
-                    instids = spot_req_inst_ids.values()
+                    instids = await_spot_requests(module, ec2, res, count)
         except boto.exception.BotoServerError as e:
             module.fail_json(msg = "Instance creation failed => %s: %s" % (e.error_code, e.error_message))