calculate max fail against all hosts in batch

Currently the max fail percentage is calculated only from the 'active' hosts
in the batch, which means the percentage goes up as hosts fail instead of
staying the same.
Also added debug info for max fail.

fixes #32255
This commit is contained in:
Brian Coca 2017-10-30 16:39:15 -04:00 committed by Toshio Kuratomi
parent 41685fb516
commit 4fb9e54c50
3 changed files with 13 additions and 23 deletions

View file

@ -203,7 +203,9 @@ class PlayIterator:
self._host_states = {}
start_at_matched = False
- for host in inventory.get_hosts(self._play.hosts):
+ batch = inventory.get_hosts(self._play.hosts)
+ self.batch_size = len(batch)
+ for host in batch:
self._host_states[host.name] = HostState(blocks=self._blocks)
# if we're looking to start at a specific task, iterate through
# the tasks for this host until we find the specified task

View file

@ -242,22 +242,6 @@ class TaskQueueManager:
loader=self._loader,
)
# Fork # of forks, # of hosts or serial, whichever is lowest
num_hosts = len(self._inventory.get_hosts(new_play.hosts, ignore_restrictions=True))
max_serial = 0
if new_play.serial:
# the play has not been post_validated here, so we may need
# to convert the scalar value to a list at this point
serial_items = new_play.serial
if not isinstance(serial_items, list):
serial_items = [serial_items]
max_serial = max([pct_to_int(x, num_hosts) for x in serial_items])
contenders = [self._options.forks, max_serial, num_hosts]
contenders = [v for v in contenders if v is not None and v > 0]
self._initialize_processes(min(contenders))
play_context = PlayContext(new_play, self._options, self.passwords, self._connection_lockfile.fileno())
for callback_plugin in self._callback_plugins:
if hasattr(callback_plugin, 'set_play_context'):
@ -268,11 +252,6 @@ class TaskQueueManager:
# initialize the shared dictionary containing the notified handlers
self._initialize_notified_handlers(new_play)
# load the specified strategy (or the default linear one)
strategy = strategy_loader.get(new_play.strategy, self)
if strategy is None:
raise AnsibleError("Invalid play strategy specified: %s" % new_play.strategy, obj=play._ds)
# build the iterator
iterator = PlayIterator(
inventory=self._inventory,
@ -283,6 +262,14 @@ class TaskQueueManager:
start_at_done=self._start_at_done,
)
# adjust to # of workers to configured forks or size of batch, whatever is lower
self._initialize_processes(min(self._options.forks, iterator.batch_size))
# load the specified strategy (or the default linear one)
strategy = strategy_loader.get(new_play.strategy, self)
if strategy is None:
raise AnsibleError("Invalid play strategy specified: %s" % new_play.strategy, obj=play._ds)
# Because the TQM may survive multiple play runs, we start by marking
# any hosts as failed in the iterator here which may have been marked
# as failed in previous runs. Then we clear the internal list of failed

View file

@ -401,7 +401,7 @@ class StrategyModule(StrategyBase):
if iterator._play.max_fail_percentage is not None and len(results) > 0:
percentage = iterator._play.max_fail_percentage / 100.0
- if (len(self._tqm._failed_hosts) / len(results)) > percentage:
+ if (len(self._tqm._failed_hosts) / iterator.batch_size) > percentage:
for host in hosts_left:
# don't double-mark hosts, or the iterator will potentially
# fail them out of the rescue/always states
@ -410,6 +410,7 @@ class StrategyModule(StrategyBase):
iterator.mark_host_failed(host)
self._tqm.send_callback('v2_playbook_on_no_hosts_remaining')
result |= self._tqm.RUN_FAILED_BREAK_PLAY
display.debug('(%s failed / %s total )> %s max fail' % (len(self._tqm._failed_hosts), iterator.batch_size, percentage))
display.debug("done checking for max_fail_percentage")
display.debug("checking to see if all hosts have failed and the running result is not ok")