Add timeout support to ansible-test. (#53302)

* Add timeout support to ansible-test.
* Fix ansible-test tar filename filter bug.
* Update timeouts used on Shippable.
* Kill subprocesses when parent process terminates.
* Require explicit use of env --show option.
2019-03-05 11:58:13 -08:00 · 2019-03-05 11:58:13 -08:00 · a8d829d9c3
commit a8d829d9c3
parent 44b347aef5
9 changed files with 218 additions and 20 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,4 @@ test/units/.coverage.*
 /test/integration/cloud-config-azure.yml
 /SYMLINK_CACHE.json
 changelogs/.plugin-cache.yaml
+.ansible-test-timeout.json
--- a/test/runner/lib/cli.py
+++ b/test/runner/lib/cli.py
@@ -52,6 +52,7 @@ from lib.config import (
 from lib.env import (
    EnvConfig,
    command_env,
+    configure_timeout,
 )

 from lib.sanity import (
@@ -95,6 +96,7 @@ def main():
        display.color = config.color
        display.info_stderr = (isinstance(config, SanityConfig) and config.lint) or (isinstance(config, IntegrationConfig) and config.list_targets)
        check_startup()
+        configure_timeout(config)

        display.info('RLIMIT_NOFILE: %s' % (CURRENT_RLIMIT_NOFILE,), verbosity=2)
        display.info('MAXFD: %d' % MAXFD, verbosity=2)
@@ -509,6 +511,11 @@ def parse_args():
                     action='store_true',
                     help='dump environment to disk')

+    env.add_argument('--timeout',
+                     type=int,
+                     metavar='MINUTES',
+                     help='timeout for future ansible-test commands (0 clears)')
+
    if argcomplete:
        argcomplete.autocomplete(parser, always_complete_options=False, validator=lambda i, k: True)

--- a/test/runner/lib/constants.py
+++ b/test/runner/lib/constants.py
@@ -3,3 +3,6 @@
 # Setting a low soft RLIMIT_NOFILE value will improve the performance of subprocess.Popen on Python 2.x when close_fds=True.
 # This will affect all Python subprocesses. It will also affect the current Python process if set before subprocess is imported for the first time.
 SOFT_RLIMIT_NOFILE = 1024
+
+# File used to track the ansible-test test execution timeout.
+TIMEOUT_PATH = '.ansible-test-timeout.json'
--- a/test/runner/lib/env.py
+++ b/test/runner/lib/env.py
@@ -4,13 +4,17 @@ from __future__ import absolute_import, print_function

 import datetime
 import json
+import functools
 import os
 import platform
 import re
+import signal
 import sys
+import time

 from lib.config import (
    CommonConfig,
+    TestConfig,
 )

 from lib.util import (
@@ -34,6 +38,18 @@ from lib.docker_util import (
    docker_version
 )

+from lib.thread import (
+    WrappedThread,
+)
+
+from lib.constants import (
+    TIMEOUT_PATH,
+)
+
+from lib.test import (
+    TestTimeout,
+)
+

 class EnvConfig(CommonConfig):
    """Configuration for the tools command."""
@@ -43,14 +59,26 @@ class EnvConfig(CommonConfig):
        """
        super(EnvConfig, self).__init__(args, 'env')

-        self.show = args.show or not args.dump
+        self.show = args.show
        self.dump = args.dump
+        self.timeout = args.timeout


 def command_env(args):
    """
    :type args: EnvConfig
    """
+    show_dump_env(args)  # returns without output unless --show or --dump was given
+    set_timeout(args)  # returns without changes unless --timeout was given
+
+
+def show_dump_env(args):
+    """
+    :type args: EnvConfig
+    """
+    if not args.show and not args.dump:
+        return
+
    data = dict(
        ansible=dict(
            version=get_ansible_version(args),
@@ -84,6 +112,105 @@ def command_env(args):
            results_fd.write(json.dumps(data, sort_keys=True))


+def set_timeout(args):  # write the requested deadline to TIMEOUT_PATH, or remove that file when the timeout is 0
+    """
+    :type args: EnvConfig
+    """
+    if args.timeout is None:
+        return  # --timeout was not given; leave any existing timeout file untouched
+
+    if args.timeout:  # non-zero: compute the absolute deadline from the current UTC time
+        deadline = (datetime.datetime.utcnow() + datetime.timedelta(minutes=args.timeout)).strftime('%Y-%m-%dT%H:%M:%SZ')
+
+        display.info('Setting a %d minute test timeout which will end at: %s' % (args.timeout, deadline), verbosity=1)
+    else:  # zero: the existing timeout, if any, is to be cleared
+        deadline = None
+
+        display.info('Clearing existing test timeout.', verbosity=1)
+
+    if args.explain:
+        return  # explain mode: the message above is shown but the file is not touched
+
+    if deadline:
+        data = dict(
+            duration=args.timeout,  # minutes, as given on the command line
+            deadline=deadline,  # formatted UTC timestamp string
+        )
+
+        with open(TIMEOUT_PATH, 'w') as timeout_fd:
+            json.dump(data, timeout_fd, indent=4, sort_keys=True)
+    elif os.path.exists(TIMEOUT_PATH):
+        os.remove(TIMEOUT_PATH)
+
+
+def get_timeout():  # load the timeout file, if any, converting its 'deadline' string to a datetime
+    """
+    :rtype: dict[str, any] | None
+    """
+    if not os.path.exists(TIMEOUT_PATH):
+        return None  # no timeout file means no timeout is in effect
+
+    with open(TIMEOUT_PATH, 'r') as timeout_fd:
+        data = json.load(timeout_fd)
+
+    data['deadline'] = datetime.datetime.strptime(data['deadline'], '%Y-%m-%dT%H:%M:%SZ')  # naive datetime; set_timeout derives it from utcnow()
+
+    return data
+
+
+def configure_timeout(args):  # called from cli main() right after check_startup()
+    """
+    :type args: CommonConfig
+    """
+    if isinstance(args, TestConfig):
+        configure_test_timeout(args)  # only tests are subject to the timeout
+
+
+def configure_test_timeout(args):
+    """Fail at once if the stored deadline has passed; otherwise arm a timer that aborts the run when it is reached.
+    :type args: TestConfig
+    """
+    timeout = get_timeout()
+
+    if not timeout:
+        return  # no timeout is configured, so no limit applies
+
+    timeout_start = datetime.datetime.utcnow()
+    timeout_duration = timeout['duration']
+    timeout_deadline = timeout['deadline']
+    timeout_remaining = timeout_deadline - timeout_start
+
+    test_timeout = TestTimeout(timeout_duration)
+
+    if timeout_remaining <= datetime.timedelta():
+        test_timeout.write(args)  # record the timeout as a test result before failing
+
+        raise ApplicationError('The %d minute test timeout expired %s ago at %s.' % (
+            timeout_duration, timeout_remaining * -1, timeout_deadline))
+
+    display.info('The %d minute test timeout expires in %s at %s.' % (
+        timeout_duration, timeout_remaining, timeout_deadline), verbosity=1)
+
+    def timeout_handler(_dummy1, _dummy2):
+        """Runs when SIGUSR1 is received."""
+        test_timeout.write(args)
+
+        raise ApplicationError('Tests aborted after exceeding the %d minute time limit.' % timeout_duration)
+
+    def timeout_waiter(timeout_seconds):
+        """
+        :type timeout_seconds: int
+        """
+        time.sleep(timeout_seconds)
+        os.kill(os.getpid(), signal.SIGUSR1)  # signal our own process so timeout_handler runs
+
+    signal.signal(signal.SIGUSR1, timeout_handler)
+
+    instance = WrappedThread(functools.partial(timeout_waiter, timeout_remaining.days * 86400 + timeout_remaining.seconds))  # include whole days
+    instance.daemon = True  # the waiter thread must not keep the process alive
+    instance.start()
+
+
 def show_dict(data, verbose, root_verbosity=0, path=None):
    """
    :type data: dict[str, any]
--- a/test/runner/lib/pytar.py
+++ b/test/runner/lib/pytar.py
@@ -11,6 +11,10 @@ from lib.util import (
    ABC,
 )

+from lib.constants import (
+    TIMEOUT_PATH,
+)
+
 # improve performance by disabling uid/gid lookups
 tarfile.pwd = None
 tarfile.grp = None
@@ -45,6 +49,7 @@ class DefaultTarFilter(TarFilter):
        self.ignore_files = (
            '.gitignore',
            '.gitdir',
+            TIMEOUT_PATH,
        )

        self.ignore_extensions = (
@@ -58,7 +63,7 @@ class DefaultTarFilter(TarFilter):
        :rtype: tarfile.TarInfo | None
        """
        filename = os.path.basename(item.path)
-        name, ext = os.path.splitext(filename)
+        ext = os.path.splitext(filename)[1]
        dirs = os.path.split(item.path)

        if not item.isdir():
@@ -68,7 +73,7 @@ class DefaultTarFilter(TarFilter):
            if item.path.startswith('./docs/docsite/_build/'):
                return None

-        if name in self.ignore_files:
+        if filename in self.ignore_files:
            return None

        if ext in self.ignore_extensions:
--- a/test/runner/lib/test.py
+++ b/test/runner/lib/test.py
@@ -57,15 +57,17 @@ def calculate_confidence(path, line, metadata):

 class TestResult(object):
    """Base class for test results."""
-    def __init__(self, command, test, python_version=None):
+    def __init__(self, command, test, python_version=None, enable_junit=False):
        """
        :type command: str
        :type test: str
        :type python_version: str
+        :type enable_junit: bool
        """
        self.command = command
        self.test = test
        self.python_version = python_version
+        self.enable_junit = enable_junit
        self.name = self.test or self.command

        if self.python_version:
@@ -88,7 +90,7 @@ class TestResult(object):
        if args.lint:
            self.write_lint()

-        if args.junit:
+        if args.junit or self.enable_junit:
            if self.junit:
                self.write_junit(args)
            else:
@@ -159,6 +161,38 @@ class TestResult(object):
            xml.write(report.encode('utf-8', 'strict'))


+class TestTimeout(TestResult):  # result object used to report a test run aborted by the time limit
+    """Test timeout."""
+    def __init__(self, timeout_duration):
+        """
+        :type timeout_duration: int
+        """
+        super(TestTimeout, self).__init__(command='timeout', test='', enable_junit=True)  # enable_junit: write() emits JUnit even without args.junit
+
+        self.timeout_duration = timeout_duration  # minutes; used in the JUnit message below
+
+    def write_junit(self, args):  # saves one errored JUnit test case describing the timeout
+        """
+        :type args: TestConfig
+        """
+        message = 'Tests were aborted after exceeding the %d minute time limit.' % self.timeout_duration
+        output = '''One or more of the following situations may be responsible:
+
+- Code changes have resulted in tests that hang or run for an excessive amount of time.
+- Tests have been added which exceed the time limit when combined with existing tests.
+- Test infrastructure and/or external dependencies are operating slower than normal.'''
+
+        if args.coverage:  # extra hint is appended only when args.coverage is set
+            output += '\n- Additional overhead from collecting code coverage has resulted in tests exceeding the time limit.'
+
+        output += '\n\nConsult the console log for additional details on where the timeout occurred.'
+
+        test_case = self.junit.TestCase(classname=self.command, name=self.name)
+        test_case.add_error_info(message, output)  # reported as an error on the test case
+
+        self.save_junit(args, test_case)
+
+
 class TestSuccess(TestResult):
    """Test success."""
    def write_junit(self, args):
--- a/test/runner/lib/util.py
+++ b/test/runner/lib/util.py
@@ -380,7 +380,9 @@ def raw_command(cmd, capture=False, env=None, data=None, cwd=None, explain=False
        stderr = None

    start = time.time()
+    process = None

+    try:
        try:
            process = subprocess.Popen(cmd, env=env, stdin=stdin, stdout=stdout, stderr=stderr, cwd=cwd)
        except OSError as ex:
@@ -397,6 +399,11 @@ def raw_command(cmd, capture=False, env=None, data=None, cwd=None, explain=False
        else:
            process.wait()
            stdout_text, stderr_text = None, None
+    finally:
+        if process and process.returncode is None:
+            process.kill()
+            display.info('')  # the process we're interrupting may have completed a partial line of output
+            display.notice('Killed command to avoid an orphaned child process during handling of an unexpected exception.')

    status = process.returncode
    runtime = time.time() - start
--- a/test/utils/shippable/shippable.sh
+++ b/test/utils/shippable/shippable.sh
@@ -116,6 +116,12 @@ function cleanup

 trap cleanup EXIT

-ansible-test env --dump --show --color -v
+if [[ "${COVERAGE:-}" ]]; then
+    timeout=60  # minutes; larger limit when COVERAGE is set
+else
+    timeout=45  # minutes
+fi
+
+ansible-test env --dump --show --timeout "${timeout}" --color -v  # also records the timeout for later ansible-test commands

 "test/utils/shippable/${script}.sh" "${test}"
--- a/test/utils/shippable/units.sh
+++ b/test/utils/shippable/units.sh
@@ -7,5 +7,13 @@ IFS='/:' read -ra args <<< "$1"

 version="${args[1]}"

+if [[ "${COVERAGE:-}" ]]; then
+    timeout=90  # minutes; larger limit when COVERAGE is set
+else
+    timeout=10  # minutes
+fi
+
+ansible-test env --timeout "${timeout}" --color -v  # records the timeout applied to the units run below
+
 # shellcheck disable=SC2086
 ansible-test units --color -v --docker default --python "${version}" ${COVERAGE:+"$COVERAGE"} ${CHANGED:+"$CHANGED"} \