fact gathering, ensure we get some linux hardware info even if some fails/times out (#74714)

* Add resiliency to linux hw fact gathering

 Now traps unexpected exceptions on mounts and continues
 gathering other info.
 Also gives more info on why mount info gathering failed.
 Also provides more info when debugging.
This commit is contained in:
Brian Coca 2021-06-01 14:52:22 -04:00 committed by GitHub
parent 8d39332c3d
commit ba2b1a6bf6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 22 deletions

View file

@ -0,0 +1,2 @@
bugfixes:
- setup, while gathering linux hardware facts be more resilient to errors and try to return more info.

View file

@ -98,7 +98,7 @@ class LinuxHardware(Hardware):
try:
mount_facts = self.get_mount_facts()
except timeout.TimeoutError:
pass
self.module.warn("No mount facts were gathered due to timeout.")
hardware_facts.update(cpu_facts)
hardware_facts.update(memory_facts)
@ -564,31 +564,38 @@ class LinuxHardware(Hardware):
# wait for workers and get results
while results:
for mount in results:
for mount in list(results):
done = False
res = results[mount]['extra']
if res.ready():
if res.successful():
mount_size, uuid = res.get()
if mount_size:
results[mount]['info'].update(mount_size)
results[mount]['info']['uuid'] = uuid or 'N/A'
else:
# give incomplete data
errmsg = to_text(res.get())
self.module.warn("Error prevented getting extra info for mount %s: %s." % (mount, errmsg))
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (errmsg)
try:
if res.ready():
done = True
if res.successful():
mount_size, uuid = res.get()
if mount_size:
results[mount]['info'].update(mount_size)
results[mount]['info']['uuid'] = uuid or 'N/A'
else:
# failed, try to find out why, if 'res.successful' we know there are no exceptions
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (to_text(res.get()))
elif time.time() > results[mount]['timelimit']:
done = True
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (to_text(res.get()))
except Exception as e:
import traceback
done = True
results[mount]['info'] = 'N/A'
self.module.warn("Error prevented getting extra info for mount %s: [%s] %s." % (mount, type(e), to_text(e)))
self.module.debug(traceback.format_exc())
if done:
# move results outside and make loop only handle pending
mounts.append(results[mount]['info'])
del results[mount]
break
elif time.time() > results[mount]['timelimit']:
results[mount]['info']['note'] = 'Timed out while attempting to get extra information.'
mounts.append(results[mount]['info'])
del results[mount]
break
else:
# avoid cpu churn
time.sleep(0.1)
# avoid cpu churn, sleep between retrying for loop with remaining mounts
time.sleep(0.1)
return {'mounts': mounts}