fact gathering, ensure we get some linux hardware info even if some fails/times out (#74714)

* Add resiliency to linux hw fact gathering

 Now traps unexpected exceptions on mounts and continues
 gathering other info.
 Also gives more info on why mount info gathering failed,
 and more info if debugging.
This commit is contained in:
Brian Coca 2021-06-01 14:52:22 -04:00 committed by GitHub
parent 8d39332c3d
commit ba2b1a6bf6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 22 deletions

View file

@ -0,0 +1,2 @@
bugfixes:
- setup - while gathering Linux hardware facts, be more resilient to errors and try to return more info.

View file

@ -98,7 +98,7 @@ class LinuxHardware(Hardware):
try: try:
mount_facts = self.get_mount_facts() mount_facts = self.get_mount_facts()
except timeout.TimeoutError: except timeout.TimeoutError:
pass self.module.warn("No mount facts were gathered due to timeout.")
hardware_facts.update(cpu_facts) hardware_facts.update(cpu_facts)
hardware_facts.update(memory_facts) hardware_facts.update(memory_facts)
@ -564,30 +564,37 @@ class LinuxHardware(Hardware):
# wait for workers and get results # wait for workers and get results
while results: while results:
for mount in results: for mount in list(results):
done = False
res = results[mount]['extra'] res = results[mount]['extra']
try:
if res.ready(): if res.ready():
done = True
if res.successful(): if res.successful():
mount_size, uuid = res.get() mount_size, uuid = res.get()
if mount_size: if mount_size:
results[mount]['info'].update(mount_size) results[mount]['info'].update(mount_size)
results[mount]['info']['uuid'] = uuid or 'N/A' results[mount]['info']['uuid'] = uuid or 'N/A'
else: else:
# give incomplete data # failed, try to find out why, if 'res.successful' we know there are no exceptions
errmsg = to_text(res.get()) results[mount]['info']['note'] = 'Could not get extra information: %s.' % (to_text(res.get()))
self.module.warn("Error prevented getting extra info for mount %s: %s." % (mount, errmsg))
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (errmsg)
mounts.append(results[mount]['info'])
del results[mount]
break
elif time.time() > results[mount]['timelimit']: elif time.time() > results[mount]['timelimit']:
results[mount]['info']['note'] = 'Timed out while attempting to get extra information.' done = True
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (to_text(res.get()))
except Exception as e:
import traceback
done = True
results[mount]['info'] = 'N/A'
self.module.warn("Error prevented getting extra info for mount %s: [%s] %s." % (mount, type(e), to_text(e)))
self.module.debug(traceback.format_exc())
if done:
# move results outside and make loop only handle pending
mounts.append(results[mount]['info']) mounts.append(results[mount]['info'])
del results[mount] del results[mount]
break
else: # avoid cpu churn, sleep between retrying for loop with remaining mounts
# avoid cpu churn
time.sleep(0.1) time.sleep(0.1)
return {'mounts': mounts} return {'mounts': mounts}