Fact gathering: ensure we get some linux hardware info even if some of it fails or times out (#74714)
* Add resiliency to linux hw fact gathering. Now traps unexpected exceptions on mounts and continues gathering other info. Also gives more info on why mount info gathering failed, and more info when debugging.
This commit is contained in:
parent
8d39332c3d
commit
ba2b1a6bf6
2 changed files with 31 additions and 22 deletions
2
changelogs/fragments/linux_hw_facts_fix.yml
Normal file
2
changelogs/fragments/linux_hw_facts_fix.yml
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
bugfixes:
|
||||||
|
- setup, while gathering linux hardware facts be more resilient to errors and try to return more info.
|
|
@ -98,7 +98,7 @@ class LinuxHardware(Hardware):
|
||||||
try:
|
try:
|
||||||
mount_facts = self.get_mount_facts()
|
mount_facts = self.get_mount_facts()
|
||||||
except timeout.TimeoutError:
|
except timeout.TimeoutError:
|
||||||
pass
|
self.module.warn("No mount facts were gathered due to timeout.")
|
||||||
|
|
||||||
hardware_facts.update(cpu_facts)
|
hardware_facts.update(cpu_facts)
|
||||||
hardware_facts.update(memory_facts)
|
hardware_facts.update(memory_facts)
|
||||||
|
@ -564,31 +564,38 @@ class LinuxHardware(Hardware):
|
||||||
|
|
||||||
# wait for workers and get results
|
# wait for workers and get results
|
||||||
while results:
|
while results:
|
||||||
for mount in results:
|
for mount in list(results):
|
||||||
|
done = False
|
||||||
res = results[mount]['extra']
|
res = results[mount]['extra']
|
||||||
if res.ready():
|
try:
|
||||||
if res.successful():
|
if res.ready():
|
||||||
mount_size, uuid = res.get()
|
done = True
|
||||||
if mount_size:
|
if res.successful():
|
||||||
results[mount]['info'].update(mount_size)
|
mount_size, uuid = res.get()
|
||||||
results[mount]['info']['uuid'] = uuid or 'N/A'
|
if mount_size:
|
||||||
else:
|
results[mount]['info'].update(mount_size)
|
||||||
# give incomplete data
|
results[mount]['info']['uuid'] = uuid or 'N/A'
|
||||||
errmsg = to_text(res.get())
|
else:
|
||||||
self.module.warn("Error prevented getting extra info for mount %s: %s." % (mount, errmsg))
|
# failed, try to find out why, if 'res.successful' we know there are no exceptions
|
||||||
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (errmsg)
|
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (to_text(res.get()))
|
||||||
|
|
||||||
|
elif time.time() > results[mount]['timelimit']:
|
||||||
|
done = True
|
||||||
|
results[mount]['info']['note'] = 'Could not get extra information: %s.' % (to_text(res.get()))
|
||||||
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
done = True
|
||||||
|
results[mount]['info'] = 'N/A'
|
||||||
|
self.module.warn("Error prevented getting extra info for mount %s: [%s] %s." % (mount, type(e), to_text(e)))
|
||||||
|
self.module.debug(traceback.format_exc())
|
||||||
|
|
||||||
|
if done:
|
||||||
|
# move results outside and make loop only handle pending
|
||||||
mounts.append(results[mount]['info'])
|
mounts.append(results[mount]['info'])
|
||||||
del results[mount]
|
del results[mount]
|
||||||
break
|
|
||||||
elif time.time() > results[mount]['timelimit']:
|
# avoid cpu churn, sleep between retrying for loop with remaining mounts
|
||||||
results[mount]['info']['note'] = 'Timed out while attempting to get extra information.'
|
time.sleep(0.1)
|
||||||
mounts.append(results[mount]['info'])
|
|
||||||
del results[mount]
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
# avoid cpu churn
|
|
||||||
time.sleep(0.1)
|
|
||||||
|
|
||||||
return {'mounts': mounts}
|
return {'mounts': mounts}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue