fact gathering, ensure we get some linux hardware info even if some fails/times out (#74714)
* Add resiliency to linux hw fact gathering Now traps unexpected exceptions on mounts and continues gathiering other info. Also gives more info on why mount info gathering failed. more info if debugging
This commit is contained in:
parent
8d39332c3d
commit
ba2b1a6bf6
2 changed files with 31 additions and 22 deletions
2
changelogs/fragments/linux_hw_facts_fix.yml
Normal file
2
changelogs/fragments/linux_hw_facts_fix.yml
Normal file
|
@ -0,0 +1,2 @@
|
|||
bugfixes:
|
||||
- setup, while gathering linux hardware facts be more resilient to errors and try to return more info.
|
|
@ -98,7 +98,7 @@ class LinuxHardware(Hardware):
|
|||
try:
|
||||
mount_facts = self.get_mount_facts()
|
||||
except timeout.TimeoutError:
|
||||
pass
|
||||
self.module.warn("No mount facts were gathered due to timeout.")
|
||||
|
||||
hardware_facts.update(cpu_facts)
|
||||
hardware_facts.update(memory_facts)
|
||||
|
@ -564,30 +564,37 @@ class LinuxHardware(Hardware):
|
|||
|
||||
# wait for workers and get results
|
||||
while results:
|
||||
for mount in results:
|
||||
for mount in list(results):
|
||||
done = False
|
||||
res = results[mount]['extra']
|
||||
try:
|
||||
if res.ready():
|
||||
done = True
|
||||
if res.successful():
|
||||
mount_size, uuid = res.get()
|
||||
if mount_size:
|
||||
results[mount]['info'].update(mount_size)
|
||||
results[mount]['info']['uuid'] = uuid or 'N/A'
|
||||
else:
|
||||
# give incomplete data
|
||||
errmsg = to_text(res.get())
|
||||
self.module.warn("Error prevented getting extra info for mount %s: %s." % (mount, errmsg))
|
||||
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (errmsg)
|
||||
# failed, try to find out why, if 'res.successful' we know there are no exceptions
|
||||
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (to_text(res.get()))
|
||||
|
||||
mounts.append(results[mount]['info'])
|
||||
del results[mount]
|
||||
break
|
||||
elif time.time() > results[mount]['timelimit']:
|
||||
results[mount]['info']['note'] = 'Timed out while attempting to get extra information.'
|
||||
done = True
|
||||
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (to_text(res.get()))
|
||||
except Exception as e:
|
||||
import traceback
|
||||
done = True
|
||||
results[mount]['info'] = 'N/A'
|
||||
self.module.warn("Error prevented getting extra info for mount %s: [%s] %s." % (mount, type(e), to_text(e)))
|
||||
self.module.debug(traceback.format_exc())
|
||||
|
||||
if done:
|
||||
# move results outside and make loop only handle pending
|
||||
mounts.append(results[mount]['info'])
|
||||
del results[mount]
|
||||
break
|
||||
else:
|
||||
# avoid cpu churn
|
||||
|
||||
# avoid cpu churn, sleep between retrying for loop with remaining mounts
|
||||
time.sleep(0.1)
|
||||
|
||||
return {'mounts': mounts}
|
||||
|
|
Loading…
Reference in a new issue