Add timeout support to ansible-test. (#53302)

* Add timeout support to ansible-test.
* Fix ansible-test tar filename filter bug.
* Update timeouts used on Shippable.
* Kill subprocesses when parent process terminates.
* Require explicit use of env --show option.
This commit is contained in:
Matt Clay 2019-03-05 11:58:13 -08:00 committed by GitHub
parent 44b347aef5
commit a8d829d9c3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 218 additions and 20 deletions

1
.gitignore vendored
View file

@@ -105,3 +105,4 @@ test/units/.coverage.*
/test/integration/cloud-config-azure.yml
/SYMLINK_CACHE.json
changelogs/.plugin-cache.yaml
.ansible-test-timeout.json

View file

@ -52,6 +52,7 @@ from lib.config import (
from lib.env import (
EnvConfig,
command_env,
configure_timeout,
)
from lib.sanity import (
@ -95,6 +96,7 @@ def main():
display.color = config.color
display.info_stderr = (isinstance(config, SanityConfig) and config.lint) or (isinstance(config, IntegrationConfig) and config.list_targets)
check_startup()
configure_timeout(config)
display.info('RLIMIT_NOFILE: %s' % (CURRENT_RLIMIT_NOFILE,), verbosity=2)
display.info('MAXFD: %d' % MAXFD, verbosity=2)
@ -509,6 +511,11 @@ def parse_args():
action='store_true',
help='dump environment to disk')
env.add_argument('--timeout',
type=int,
metavar='MINUTES',
help='timeout for future ansible-test commands (0 clears)')
if argcomplete:
argcomplete.autocomplete(parser, always_complete_options=False, validator=lambda i, k: True)

View file

@@ -3,3 +3,6 @@
# Setting a low soft RLIMIT_NOFILE value will improve the performance of subprocess.Popen on Python 2.x when close_fds=True.
# This will affect all Python subprocesses. It will also affect the current Python process if set before subprocess is imported for the first time.
SOFT_RLIMIT_NOFILE = 1024
# File used to track the ansible-test test execution timeout.
TIMEOUT_PATH = '.ansible-test-timeout.json'

View file

@ -4,13 +4,17 @@ from __future__ import absolute_import, print_function
import datetime
import json
import functools
import os
import platform
import re
import signal
import sys
import time
from lib.config import (
CommonConfig,
TestConfig,
)
from lib.util import (
@ -34,6 +38,18 @@ from lib.docker_util import (
docker_version
)
from lib.thread import (
WrappedThread,
)
from lib.constants import (
TIMEOUT_PATH,
)
from lib.test import (
TestTimeout,
)
class EnvConfig(CommonConfig):
"""Configuration for the tools command."""
@ -43,14 +59,26 @@ class EnvConfig(CommonConfig):
"""
super(EnvConfig, self).__init__(args, 'env')
self.show = args.show or not args.dump
self.show = args.show
self.dump = args.dump
self.timeout = args.timeout
def command_env(args):
    """Entry point for the `env` command.

    Reports and/or dumps environment details, then records or clears the
    persisted test timeout when --timeout was given.

    :type args: EnvConfig
    """
    show_dump_env(args)
    set_timeout(args)
def show_dump_env(args):
"""
:type args: EnvConfig
"""
if not args.show and not args.dump:
return
data = dict(
ansible=dict(
version=get_ansible_version(args),
@ -84,6 +112,105 @@ def command_env(args):
results_fd.write(json.dumps(data, sort_keys=True))
def set_timeout(args):
    """Persist or clear the test timeout requested via the --timeout option.

    A positive value writes a deadline file (TIMEOUT_PATH); zero removes it.
    When the option was not supplied, nothing is changed.

    :type args: EnvConfig
    """
    minutes = args.timeout

    if minutes is None:
        return  # --timeout not given; leave any existing timeout untouched

    expires = None

    if minutes:
        expires = (datetime.datetime.utcnow() + datetime.timedelta(minutes=minutes)).strftime('%Y-%m-%dT%H:%M:%SZ')
        display.info('Setting a %d minute test timeout which will end at: %s' % (minutes, expires), verbosity=1)
    else:
        display.info('Clearing existing test timeout.', verbosity=1)

    if args.explain:
        return  # explain mode: report intent without touching the filesystem

    if not expires:
        if os.path.exists(TIMEOUT_PATH):
            os.remove(TIMEOUT_PATH)

        return

    contents = dict(
        duration=minutes,
        deadline=expires,
    )

    with open(TIMEOUT_PATH, 'w') as timeout_fd:
        json.dump(contents, timeout_fd, indent=4, sort_keys=True)
def get_timeout():
    """Load the persisted test timeout, if one has been set.

    The stored ISO-8601 deadline string is parsed back into a (naive, UTC)
    datetime before being returned.

    :rtype: dict[str, any] | None
    """
    if not os.path.exists(TIMEOUT_PATH):
        return None  # no timeout has been recorded

    with open(TIMEOUT_PATH, 'r') as timeout_fd:
        content = json.load(timeout_fd)

    content['deadline'] = datetime.datetime.strptime(content['deadline'], '%Y-%m-%dT%H:%M:%SZ')

    return content
def configure_timeout(args):
    """Arm the test timeout for the given command, when applicable.

    :type args: CommonConfig
    """
    if not isinstance(args, TestConfig):
        return  # only tests are subject to the timeout

    configure_test_timeout(args)
def configure_test_timeout(args):
    """Enforce the persisted test timeout for the current test run.

    If the deadline has already passed, a timeout result is written and an
    ApplicationError is raised immediately. Otherwise a daemon thread sleeps
    until the deadline and then delivers SIGUSR1 to this process; the signal
    handler writes the timeout result and aborts the run.

    :type args: TestConfig
    :raises ApplicationError: when the timeout has expired or is exceeded
    """
    timeout = get_timeout()

    if not timeout:
        return  # no timeout was configured via `ansible-test env --timeout`

    timeout_start = datetime.datetime.utcnow()
    timeout_duration = timeout['duration']
    timeout_deadline = timeout['deadline']
    timeout_remaining = timeout_deadline - timeout_start

    test_timeout = TestTimeout(timeout_duration)

    if timeout_remaining <= datetime.timedelta():
        test_timeout.write(args)

        raise ApplicationError('The %d minute test timeout expired %s ago at %s.' % (
            timeout_duration, timeout_remaining * -1, timeout_deadline))

    display.info('The %d minute test timeout expires in %s at %s.' % (
        timeout_duration, timeout_remaining, timeout_deadline), verbosity=1)

    def timeout_handler(_dummy1, _dummy2):
        """Runs when SIGUSR1 is received."""
        test_timeout.write(args)

        raise ApplicationError('Tests aborted after exceeding the %d minute time limit.' % timeout_duration)

    def timeout_waiter(timeout_seconds):
        """Sleep until the deadline, then signal the main thread via SIGUSR1.

        :type timeout_seconds: float
        """
        time.sleep(timeout_seconds)
        os.kill(os.getpid(), signal.SIGUSR1)

    signal.signal(signal.SIGUSR1, timeout_handler)

    # Fix: use total_seconds() rather than the .seconds attribute.
    # timedelta.seconds excludes the .days component, so a remaining time of
    # one day or more would previously cause the timeout to fire far too early.
    instance = WrappedThread(functools.partial(timeout_waiter, timeout_remaining.total_seconds()))
    instance.daemon = True
    instance.start()
def show_dict(data, verbose, root_verbosity=0, path=None):
"""
:type data: dict[str, any]

View file

@ -11,6 +11,10 @@ from lib.util import (
ABC,
)
from lib.constants import (
TIMEOUT_PATH,
)
# improve performance by disabling uid/gid lookups
tarfile.pwd = None
tarfile.grp = None
@ -45,6 +49,7 @@ class DefaultTarFilter(TarFilter):
self.ignore_files = (
'.gitignore',
'.gitdir',
TIMEOUT_PATH,
)
self.ignore_extensions = (
@ -58,7 +63,7 @@ class DefaultTarFilter(TarFilter):
:rtype: tarfile.TarInfo | None
"""
filename = os.path.basename(item.path)
name, ext = os.path.splitext(filename)
ext = os.path.splitext(filename)[1]
dirs = os.path.split(item.path)
if not item.isdir():
@ -68,7 +73,7 @@ class DefaultTarFilter(TarFilter):
if item.path.startswith('./docs/docsite/_build/'):
return None
if name in self.ignore_files:
if filename in self.ignore_files:
return None
if ext in self.ignore_extensions:

View file

@@ -57,15 +57,17 @@ def calculate_confidence(path, line, metadata):
class TestResult(object):
"""Base class for test results."""
def __init__(self, command, test, python_version=None):
def __init__(self, command, test, python_version=None, enable_junit=False):
"""
:type command: str
:type test: str
:type python_version: str
:type enable_junit: bool
"""
self.command = command
self.test = test
self.python_version = python_version
self.enable_junit = enable_junit
self.name = self.test or self.command
if self.python_version:
@ -88,7 +90,7 @@ class TestResult(object):
if args.lint:
self.write_lint()
if args.junit:
if args.junit or self.enable_junit:
if self.junit:
self.write_junit(args)
else:
@ -159,6 +161,38 @@ class TestResult(object):
xml.write(report.encode('utf-8', 'strict'))
class TestTimeout(TestResult):
    """Test result reported when the overall test timeout expires.

    Constructed with enable_junit=True so a junit report is produced even
    when the interrupted command was not invoked with --junit, making the
    timeout visible to CI result parsing.
    """
    def __init__(self, timeout_duration):
        """
        :type timeout_duration: int  # timeout length in minutes
        """
        super(TestTimeout, self).__init__(command='timeout', test='', enable_junit=True)

        self.timeout_duration = timeout_duration

    def write_junit(self, args):
        """Write a junit report describing the timeout and its likely causes.

        :type args: TestConfig
        """
        message = 'Tests were aborted after exceeding the %d minute time limit.' % self.timeout_duration

        output = '''One or more of the following situations may be responsible:
- Code changes have resulted in tests that hang or run for an excessive amount of time.
- Tests have been added which exceed the time limit when combined with existing tests.
- Test infrastructure and/or external dependencies are operating slower than normal.'''

        if args.coverage:
            # Coverage collection adds runtime overhead; call it out as a possible cause.
            output += '\n- Additional overhead from collecting code coverage has resulted in tests exceeding the time limit.'

        output += '\n\nConsult the console log for additional details on where the timeout occurred.'

        # Recorded via add_error_info so the timeout appears as a junit error
        # rather than an ordinary test failure.
        test_case = self.junit.TestCase(classname=self.command, name=self.name)
        test_case.add_error_info(message, output)

        self.save_junit(args, test_case)
class TestSuccess(TestResult):
"""Test success."""
def write_junit(self, args):

View file

@ -380,7 +380,9 @@ def raw_command(cmd, capture=False, env=None, data=None, cwd=None, explain=False
stderr = None
start = time.time()
process = None
try:
try:
process = subprocess.Popen(cmd, env=env, stdin=stdin, stdout=stdout, stderr=stderr, cwd=cwd)
except OSError as ex:
@ -397,6 +399,11 @@ def raw_command(cmd, capture=False, env=None, data=None, cwd=None, explain=False
else:
process.wait()
stdout_text, stderr_text = None, None
finally:
if process and process.returncode is None:
process.kill()
display.info('') # the process we're interrupting may have completed a partial line of output
display.notice('Killed command to avoid an orphaned child process during handling of an unexpected exception.')
status = process.returncode
runtime = time.time() - start

View file

@ -116,6 +116,12 @@ function cleanup
trap cleanup EXIT
ansible-test env --dump --show --color -v
if [[ "${COVERAGE:-}" ]]; then
timeout=60
else
timeout=45
fi
ansible-test env --dump --show --timeout "${timeout}" --color -v
"test/utils/shippable/${script}.sh" "${test}"

View file

@ -7,5 +7,13 @@ IFS='/:' read -ra args <<< "$1"
version="${args[1]}"
if [[ "${COVERAGE:-}" ]]; then
timeout=90
else
timeout=10
fi
ansible-test env --timeout "${timeout}" --color -v
# shellcheck disable=SC2086
ansible-test units --color -v --docker default --python "${version}" ${COVERAGE:+"$COVERAGE"} ${CHANGED:+"$CHANGED"} \