From ba2b1a6bf6f4a1d5ef975789caa380e72b0a4e77 Mon Sep 17 00:00:00 2001 From: Brian Coca Date: Tue, 1 Jun 2021 14:52:22 -0400 Subject: [PATCH] fact gathering, ensure we get some linux hardware info even if some fails/times out (#74714) * Add resiliency to linux hw fact gathering Now traps unexpected exceptions on mounts and continues gathering other info. Also gives more info on why mount info gathering failed. more info if debugging --- changelogs/fragments/linux_hw_facts_fix.yml | 2 + .../module_utils/facts/hardware/linux.py | 51 +++++++++++-------- 2 files changed, 31 insertions(+), 22 deletions(-) create mode 100644 changelogs/fragments/linux_hw_facts_fix.yml diff --git a/changelogs/fragments/linux_hw_facts_fix.yml b/changelogs/fragments/linux_hw_facts_fix.yml new file mode 100644 index 00000000000..075007c5bbe --- /dev/null +++ b/changelogs/fragments/linux_hw_facts_fix.yml @@ -0,0 +1,2 @@ +bugfixes: + - setup, while gathering linux hardware facts be more resilient to errors and try to return more info. 
diff --git a/lib/ansible/module_utils/facts/hardware/linux.py b/lib/ansible/module_utils/facts/hardware/linux.py index 0829d495c98..1beb9a5cde9 100644 --- a/lib/ansible/module_utils/facts/hardware/linux.py +++ b/lib/ansible/module_utils/facts/hardware/linux.py @@ -98,7 +98,7 @@ class LinuxHardware(Hardware): try: mount_facts = self.get_mount_facts() except timeout.TimeoutError: - pass + self.module.warn("No mount facts were gathered due to timeout.") hardware_facts.update(cpu_facts) hardware_facts.update(memory_facts) @@ -564,31 +564,38 @@ class LinuxHardware(Hardware): # wait for workers and get results while results: - for mount in results: + for mount in list(results): + done = False res = results[mount]['extra'] - if res.ready(): - if res.successful(): - mount_size, uuid = res.get() - if mount_size: - results[mount]['info'].update(mount_size) - results[mount]['info']['uuid'] = uuid or 'N/A' - else: - # give incomplete data - errmsg = to_text(res.get()) - self.module.warn("Error prevented getting extra info for mount %s: %s." % (mount, errmsg)) - results[mount]['info']['note'] = 'Could not get extra information: %s.' % (errmsg) + try: + if res.ready(): + done = True + if res.successful(): + mount_size, uuid = res.get() + if mount_size: + results[mount]['info'].update(mount_size) + results[mount]['info']['uuid'] = uuid or 'N/A' + else: + # failed, try to find out why, if 'res.successful' we know there are no exceptions + results[mount]['info']['note'] = 'Could not get extra information: %s.' % (to_text(res.get())) + elif time.time() > results[mount]['timelimit']: + done = True + results[mount]['info']['note'] = 'Could not get extra information: %s.' % (to_text(res.get())) + except Exception as e: + import traceback + done = True + results[mount]['info'] = 'N/A' + self.module.warn("Error prevented getting extra info for mount %s: [%s] %s." 
% (mount, type(e), to_text(e))) + self.module.debug(traceback.format_exc()) + + if done: + # move results outside and make loop only handle pending mounts.append(results[mount]['info']) del results[mount] - break - elif time.time() > results[mount]['timelimit']: - results[mount]['info']['note'] = 'Timed out while attempting to get extra information.' - mounts.append(results[mount]['info']) - del results[mount] - break - else: - # avoid cpu churn - time.sleep(0.1) + + # avoid cpu churn, sleep between retrying for loop with remaining mounts + time.sleep(0.1) return {'mounts': mounts}