Start of work to use threading instead of forking

Rather than using multiprocessing (Process and Queue objects) to do worker tasks in Ansible:
* Using concurrent.futures
* Using ThreadPoolExecutor
* Making PluginLoader thread-safe
* Gutting a lot of code dealing with message passing
2017-05-30 15:44:41 -05:00 · 2017-05-30 15:44:41 -05:00 · f95160723d
parent a7229df469
commit f95160723d
3 changed files with 165 additions and 52 deletions
--- a/lib/ansible/executor/process/threading.py
+++ b/lib/ansible/executor/process/threading.py
@ -0,0 +1,105 @@
+# (c) 2012-2014, Michael DeHaan <michael.dehaan@gmail.com>
+#
+# This file is part of Ansible
+#
+# Ansible is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Ansible is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Ansible.  If not, see <http://www.gnu.org/licenses/>.
+
+# Make coding more python3-ish
+from __future__ import (absolute_import, division, print_function)
+__metaclass__ = type
+
+import os
+import sys
+import traceback
+
+from jinja2.exceptions import TemplateNotFound
+
+from ansible.errors import AnsibleConnectionFailure
+from ansible.executor.task_executor import TaskExecutor
+from ansible.executor.task_result import TaskResult
+from ansible.module_utils._text import to_text
+
+try:
+    from __main__ import display
+except ImportError:
+    from ansible.utils.display import Display
+    display = Display()
+
+# Only run_worker is defined in this module; exporting a name that does not
+# exist would make "from ... import *" raise AttributeError.
+__all__ = ['run_worker']
+
+
+def run_worker(task_vars, host, task, play_context, loader, variable_manager, shared_loader_obj):
+    '''
+    Run a single task for a single host and return the outcome.
+
+    Builds a TaskExecutor for the given host/task, runs it and wraps what it
+    returns in a TaskResult. Nothing is queued here: the TaskResult is
+    returned directly to the caller.
+
+    task_vars, play_context, loader and shared_loader_obj are handed to
+    TaskExecutor unchanged. variable_manager is accepted but not used by
+    this function.
+
+    Returns a TaskResult in every handled case:
+      * the executor's result on success,
+      * dict(unreachable=True) when AnsibleConnectionFailure is raised,
+      * dict(failed=True, exception=<traceback>, stdout='') for any other
+        Exception.
+    KeyboardInterrupt and SystemExit are not Exception subclasses, so they
+    are not caught here and propagate to the caller.
+    '''
+
+    try:
+        # execute the task and build a TaskResult from the result
+        display.debug("running TaskExecutor() for %s/%s" % (host, task))
+        executor_result = TaskExecutor(
+            host,
+            task,
+            task_vars,
+            play_context,
+            None,  # new_stdin: none is passed to the executor here
+            loader,
+            shared_loader_obj,
+            None,  # rslt_q: no result queue, the result is returned instead
+        ).run()
+        display.debug("done running TaskExecutor() for %s/%s" % (host, task))
+
+        # hand the result straight back to the caller
+        display.debug("sending task result")
+        return TaskResult(host, task, executor_result)
+
+    except AnsibleConnectionFailure:
+        return TaskResult(host, task, dict(unreachable=True))
+
+    except Exception as e:
+        # Every failure is reported, including IOError/EOFError (and
+        # TemplateNotFound, which jinja2 derives from IOError): falling
+        # through and returning None would silently drop the result for
+        # this host/task.
+        display.debug(u"WORKER EXCEPTION: %s" % to_text(e))
+        failure = dict(failed=True, exception=to_text(traceback.format_exc()), stdout='')
+        return TaskResult(host, task, failure)
--- a/lib/ansible/executor/task_queue_manager.py
+++ b/lib/ansible/executor/task_queue_manager.py
@ -19,10 +19,12 @@
 from __future__ import (absolute_import, division, print_function)
 __metaclass__ = type

-import multiprocessing
+import threading
 import os
 import tempfile

+from concurrent.futures import ThreadPoolExecutor as PoolExecutor
+
 from ansible import constants as C
 from ansible.errors import AnsibleError
 from ansible.executor.play_iterator import PlayIterator
@ -100,18 +102,21 @@ class TaskQueueManager:
        self._failed_hosts = dict()
        self._unreachable_hosts = dict()

-        self._final_q = multiprocessing.Queue()
-
        # A temporary file (opened pre-fork) used by connection
        # plugins for inter-process locking.
        self._connection_lockfile = tempfile.TemporaryFile()

+        self._executor = None
+
    def _initialize_processes(self, num):
+        '''
+        Create the executor (PoolExecutor with max_workers=num) and reset
+        the list of worker slots.
+
+        One slot is created per worker. Each slot is a two-item list whose
+        first item starts out as None and whose second item is a
+        threading.Lock created for that slot.
+        '''
+        # FIXME: be safe about creating this
+        self._executor = PoolExecutor(max_workers=num)
+        # FIXME: do we need a global lock for workers here instead of a per-worker?
        self._workers = []

        for i in range(num):
-            rslt_q = multiprocessing.Queue()
-            self._workers.append([None, rslt_q])
+            w_lock = threading.Lock()
+            self._workers.append([None, w_lock])

    def _initialize_notified_handlers(self, play):
        '''
@ -312,18 +317,13 @@ class TaskQueueManager:
    def cleanup(self):
+        '''
+        Log that cleanup is running, call terminate() and then clean up
+        the worker slots via _cleanup_processes().
+        '''
        display.debug("RUNNING CLEANUP")
        self.terminate()
-        self._final_q.close()
        self._cleanup_processes()

    def _cleanup_processes(self):
+        '''
+        Cancel every worker future that has not finished yet.
+
+        Does nothing when the worker slots were never created (no _workers
+        attribute). Empty slots (None) are skipped.
+        '''
        if hasattr(self, '_workers'):
-            for (worker_prc, rslt_q) in self._workers:
-                rslt_q.close()
-                if worker_prc and worker_prc.is_alive():
-                    try:
-                        worker_prc.terminate()
-                    except AttributeError:
-                        pass
+            for (w_thread, w_lock) in self._workers:
+                # Future has no is_running() method (it is running()), and
+                # cancel() cannot stop a call that is already executing, so
+                # the useful check is "not finished yet": pending futures
+                # get cancelled, running ones are left to complete.
+                if w_thread and not w_thread.done():
+                    w_thread.cancel()

    def clear_failed_hosts(self):
        self._failed_hosts = dict()
--- a/lib/ansible/plugins/strategy/__init__.py
+++ b/lib/ansible/plugins/strategy/__init__.py
@ -24,13 +24,13 @@ import threading
 import time

 from collections import deque
-from multiprocessing import Lock
 from jinja2.exceptions import UndefinedError

 from ansible import constants as C
 from ansible.errors import AnsibleError, AnsibleParserError, AnsibleUndefinedVariable
 from ansible.executor import action_write_locks
-from ansible.executor.process.worker import WorkerProcess
+#from ansible.executor.process.worker import WorkerProcess
+from ansible.executor.process.threading import run_worker
 from ansible.executor.task_result import TaskResult
 from ansible.inventory.host import Host
 from ansible.module_utils.six.moves import queue as Queue
@ -54,11 +54,6 @@ except ImportError:

 __all__ = ['StrategyBase']

-
-class StrategySentinel:
-    pass
-
-
 # TODO: this should probably be in the plugins/__init__.py, with
 #       a smarter mechanism to set all of the attributes based on
 #       the loaders created there
@ -75,23 +70,30 @@ class SharedPluginLoaderObj:
        self.lookup_loader = lookup_loader
        self.module_loader = module_loader

-_sentinel = StrategySentinel()
-
-
 def results_thread_main(strategy):
+    '''
+    Body of the results thread: poll the worker slots and collect results.
+
+    Until strategy._tqm._terminated becomes true, every slot in
+    strategy._tqm._workers is checked. When a slot holds a finished future,
+    its result is appended to strategy._results (under
+    strategy._results_lock) and the slot is reset to [None, lock] so it can
+    be reused. When a full pass finds nothing finished, the thread sleeps
+    for C.DEFAULT_INTERNAL_POLL_INTERVAL before polling again.
+    '''
-    while True:
+    while not strategy._tqm._terminated:
        try:
-            result = strategy._final_q.get()
-            if isinstance(result, StrategySentinel):
-                break
-            else:
-                strategy._results_lock.acquire()
-                strategy._results.append(result)
-                strategy._results_lock.release()
-        except (IOError, EOFError):
-            break
-        except Queue.Empty:
+            did_work = False
+            for idx, (w_thread, w_lock) in enumerate(strategy._tqm._workers):
+                # "with" acquires the lock before the guarded section is
+                # entered, so a failed acquire can never reach a release
+                with w_lock:
+                    if not (w_thread and w_thread.done()):
+                        continue
+                    try:
+                        result = w_thread.result()
+                    except Exception as e:
+                        # result() re-raises whatever the worker raised (or
+                        # a cancellation error); without handling it here
+                        # the slot would never be freed and the same error
+                        # would be raised again on every poll
+                        display.debug("worker future raised: %r" % (e,))
+                    else:
+                        with strategy._results_lock:
+                            strategy._results.append(result)
+                    strategy._tqm._workers[idx] = [None, w_lock]
+                    did_work = True
+            if not did_work:
+                time.sleep(C.DEFAULT_INTERNAL_POLL_INTERVAL)
+        except Exception as e:
+            # keep the thread alive, but leave a trace instead of
+            # swallowing the error silently
+            display.debug("results thread error: %r" % (e,))
-            pass
+    display.debug("RESULTS THREAD EXITED!!!")


 class StrategyBase:
@ -102,16 +104,15 @@ class StrategyBase:
    '''

    def __init__(self, tqm):
-        self._tqm = tqm
-        self._inventory = tqm.get_inventory()
-        self._workers = tqm.get_workers()
-        self._notified_handlers = tqm._notified_handlers
+        self._tqm                = tqm
+        self._inventory          = tqm.get_inventory()
+        self._workers            = tqm._workers
+        self._notified_handlers  = tqm._notified_handlers
        self._listening_handlers = tqm._listening_handlers
-        self._variable_manager = tqm.get_variable_manager()
-        self._loader = tqm.get_loader()
-        self._final_q = tqm._final_q
-        self._step = getattr(tqm._options, 'step', False)
-        self._diff = getattr(tqm._options, 'diff', False)
+        self._variable_manager   = tqm.get_variable_manager()
+        self._loader             = tqm.get_loader()
+        self._step               = getattr(tqm._options, 'step', False)
+        self._diff               = getattr(tqm._options, 'diff', False)

        # Backwards compat: self._display isn't really needed, just import the global display and use that.
        self._display = display
@ -133,7 +134,7 @@ class StrategyBase:
        self._results_thread.start()

    def cleanup(self):
+        '''
+        Ask the task queue manager to terminate and wait for the results
+        thread to exit.
+
+        NOTE(review): results_thread_main loops until
+        strategy._tqm._terminated is true; presumably terminate() sets that
+        flag -- confirm, otherwise join() would block forever.
+        '''
-        self._final_q.put(_sentinel)
+        self._tqm.terminate()
        self._results_thread.join()

    def run(self, iterator, play_context, result=0):
@ -203,11 +204,10 @@ class StrategyBase:

        if task.action not in action_write_locks.action_write_locks:
            display.debug('Creating lock for %s' % task.action)
-            action_write_locks.action_write_locks[task.action] = Lock()
+            action_write_locks.action_write_locks[task.action] = threading.Lock()

        # and then queue the new task
        try:
-
            # create a dummy object with plugin loaders set as an easier
            # way to share them with the forked processes
            shared_loader_obj = SharedPluginLoaderObj()
@ -215,12 +215,20 @@ class StrategyBase:
            queued = False
            starting_worker = self._cur_worker
            while True:
-                (worker_prc, rslt_q) = self._workers[self._cur_worker]
-                if worker_prc is None or not worker_prc.is_alive():
-                    worker_prc = WorkerProcess(self._final_q, task_vars, host, task, play_context, self._loader, self._variable_manager, shared_loader_obj)
-                    self._workers[self._cur_worker][0] = worker_prc
-                    worker_prc.start()
-                    display.debug("worker is %d (out of %d available)" % (self._cur_worker + 1, len(self._workers)))
+                (w_thread, w_lock) = self._workers[self._cur_worker]
+                if w_thread is None:
+                    w_thread = self._tqm._executor.submit(
+                        run_worker,
+                        task_vars,
+                        host,
+                        task,
+                        play_context,
+                        self._loader,
+                        self._variable_manager,
+                        shared_loader_obj
+                    )
+                    self._workers[self._cur_worker][0] = w_thread
+                    display.debug("worker is %d (out of %d available)" % (self._cur_worker+1, len(self._workers)))
                    queued = True
                self._cur_worker += 1
                if self._cur_worker >= len(self._workers):