[Task Manager] Add config switch around logging at different levels based on the state (#102804)

* Gate behind a config with warning message that helps users enable

* Update more files

* Fix docs formatting

* Preserve existing functionality

* Add in task type to the message

* Show multiple alert types that are over the threshold

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
Chris Roberson 2021-06-24 11:09:47 -04:00 committed by GitHub
parent 5a76c84dc9
commit dd072c3927
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 305 additions and 42 deletions

View file

@ -29,7 +29,13 @@ Task Manager runs background tasks by polling for work on an interval. You can
| The maximum number of tasks that this Kibana instance will run simultaneously. Defaults to 10.
Starting in 8.0, it will not be possible to set the value greater than 100.
| `xpack.task_manager.monitored_stats_warn_delayed_task_start_in_seconds`
| `xpack.task_manager.`
`monitored_stats_health_verbose_log.enabled`
| This flag will enable automatic warn and error logging if task manager self detects a performance issue, such as the time between when a task is scheduled to execute and when it actually executes. Defaults to false.
| `xpack.task_manager.`
`monitored_stats_health_verbose_log.`
`warn_delayed_task_start_in_seconds`
| The amount of seconds we allow a task to delay before printing a warning server log. Defaults to 60.
|===

View file

@ -379,7 +379,8 @@ kibana_vars=(
xpack.task_manager.monitored_aggregated_stats_refresh_rate
xpack.task_manager.monitored_stats_required_freshness
xpack.task_manager.monitored_stats_running_average_window
xpack.task_manager.monitored_stats_warn_delayed_task_start_in_seconds
xpack.task_manager.monitored_stats_health_verbose_log.enabled
xpack.task_manager.monitored_stats_health_verbose_log.warn_delayed_task_start_in_seconds
xpack.task_manager.monitored_task_execution_thresholds
xpack.task_manager.poll_interval
xpack.task_manager.request_capacity

View file

@ -18,9 +18,12 @@ describe('config validation', () => {
"max_poll_inactivity_cycles": 10,
"max_workers": 10,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
"warn_delayed_task_start_in_seconds": 60,
},
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {},
"default": Object {
@ -67,9 +70,12 @@ describe('config validation', () => {
"max_poll_inactivity_cycles": 10,
"max_workers": 10,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
"warn_delayed_task_start_in_seconds": 60,
},
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {},
"default": Object {
@ -103,9 +109,12 @@ describe('config validation', () => {
"max_poll_inactivity_cycles": 10,
"max_workers": 10,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
"warn_delayed_task_start_in_seconds": 60,
},
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {
"alerting:always-fires": Object {

View file

@ -110,9 +110,12 @@ export const configSchema = schema.object(
defaultValue: {},
}),
}),
/* The amount of seconds we allow a task to delay before printing a warning server log */
monitored_stats_warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
monitored_stats_health_verbose_log: schema.object({
enabled: schema.boolean({ defaultValue: false }),
/* The amount of seconds we allow a task to delay before printing a warning server log */
warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
}),
}),
},
{

View file

@ -37,7 +37,10 @@ describe('managed configuration', () => {
version_conflict_threshold: 80,
max_poll_inactivity_cycles: 10,
monitored_aggregated_stats_refresh_rate: 60000,
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
monitored_stats_required_freshness: 4000,
monitored_stats_running_average_window: 50,
request_capacity: 1000,

View file

@ -10,7 +10,7 @@ import { configSchema, TaskManagerConfig } from '../config';
import { HealthStatus } from '../monitoring';
import { TaskPersistence } from '../monitoring/task_run_statistics';
import { MonitoredHealth } from '../routes/health';
import { logHealthMetrics } from './log_health_metrics';
import { logHealthMetrics, resetLastLogLevel } from './log_health_metrics';
import { Logger } from '../../../../../src/core/server';
jest.mock('./calculate_health_status', () => ({
@ -20,12 +20,110 @@ jest.mock('./calculate_health_status', () => ({
describe('logHealthMetrics', () => {
afterEach(() => {
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
// Reset the last state by running through this as OK
// (calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
resetLastLogLevel();
(calculateHealthStatus as jest.Mock<HealthStatus>).mockReset();
});
it('should log a warning message to enable verbose logging when the status goes from OK to Warning/Error', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
// We must change from OK to Warning
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
() => HealthStatus.Warning
);
logHealthMetrics(health, logger, config);
// We must change from OK to Error
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
logHealthMetrics(health, logger, config);
expect((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).toBe(
`Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging`
);
expect((logger as jest.Mocked<Logger>).warn.mock.calls[1][0] as string).toBe(
`Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging`
);
});
it('should not log a warning message to enable verbose logging when the status goes from Warning to OK', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
// We must change from Warning to OK
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
() => HealthStatus.Warning
);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
expect((logger as jest.Mocked<Logger>).warn).not.toHaveBeenCalled();
});
it('should not log a warning message to enable verbose logging when the status goes from Error to OK', () => {
// console.log('start', getLastLogLevel());
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
// We must change from Error to OK
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
expect((logger as jest.Mocked<Logger>).warn).not.toHaveBeenCalled();
});
it('should log as debug if status is OK', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
logHealthMetrics(health, logger, config);
const firstDebug = JSON.parse(
(logger as jest.Mocked<Logger>).debug.mock.calls[0][0].replace('Latest Monitored Stats: ', '')
);
expect(firstDebug).toMatchObject(health);
});
it('should log as debug if status is OK even if not enabled', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
@ -40,7 +138,10 @@ describe('logHealthMetrics', () => {
it('should log as warn if status is Warn', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
@ -62,7 +163,10 @@ describe('logHealthMetrics', () => {
it('should log as error if status is Error', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
@ -79,15 +183,26 @@ describe('logHealthMetrics', () => {
expect(logMessage).toMatchObject(health);
});
it('should log as warn if drift exceeds the threshold', () => {
it('should log as warn if drift exceeds the threshold for a single alert type', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth({
stats: {
runtime: {
value: {
drift_by_type: {
'taskType:test': {
p99: 60000,
},
'taskType:test2': {
p99: 60000 - 1,
},
},
drift: {
p99: 60000,
},
@ -99,7 +214,50 @@ describe('logHealthMetrics', () => {
logHealthMetrics(health, logger, config);
expect((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).toBe(
`Detected delay task start of 60s (which exceeds configured value of 60s)`
`Detected delay task start of 60s for task(s) \"taskType:test\" (which exceeds configured value of 60s)`
);
const secondMessage = JSON.parse(
((logger as jest.Mocked<Logger>).warn.mock.calls[1][0] as string).replace(
`Latest Monitored Stats: `,
''
)
);
expect(secondMessage).toMatchObject(health);
});
it('should log as warn if drift exceeds the threshold for multiple alert types', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth({
stats: {
runtime: {
value: {
drift_by_type: {
'taskType:test': {
p99: 60000,
},
'taskType:test2': {
p99: 60000,
},
},
drift: {
p99: 60000,
},
},
},
},
});
logHealthMetrics(health, logger, config);
expect((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).toBe(
`Detected delay task start of 60s for task(s) \"taskType:test, taskType:test2\" (which exceeds configured value of 60s)`
);
const secondMessage = JSON.parse(
@ -114,7 +272,10 @@ describe('logHealthMetrics', () => {
it('should log as debug if there are no stats', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = {
id: '1',
@ -135,7 +296,10 @@ describe('logHealthMetrics', () => {
it('should ignore capacity estimation status', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth({
stats: {
@ -213,7 +377,14 @@ function getMockMonitoredHealth(overrides = {}): MonitoredHealth {
p95: 2500,
p99: 3000,
},
drift_by_type: {},
drift_by_type: {
'taskType:test': {
p50: 1000,
p90: 2000,
p95: 2500,
p99: 3000,
},
},
load: {
p50: 1000,
p90: 2000,

View file

@ -12,11 +12,23 @@ import { TaskManagerConfig } from '../config';
import { MonitoredHealth } from '../routes/health';
import { calculateHealthStatus } from './calculate_health_status';
enum LogLevel {
Warn = 'warn',
Error = 'error',
Debug = 'debug',
}
let lastLogLevel: LogLevel | null = null;
export function resetLastLogLevel() {
lastLogLevel = null;
}
export function logHealthMetrics(
monitoredHealth: MonitoredHealth,
logger: Logger,
config: TaskManagerConfig
) {
let logLevel: LogLevel = LogLevel.Debug;
const enabled = config.monitored_stats_health_verbose_log.enabled;
const healthWithoutCapacity: MonitoredHealth = {
...monitoredHealth,
stats: {
@ -25,23 +37,54 @@ export function logHealthMetrics(
},
};
const statusWithoutCapacity = calculateHealthStatus(healthWithoutCapacity, config);
let logAsWarn = statusWithoutCapacity === HealthStatus.Warning;
const logAsError =
statusWithoutCapacity === HealthStatus.Error && !isEmpty(monitoredHealth.stats);
const driftInSeconds = (monitoredHealth.stats.runtime?.value.drift.p99 ?? 0) / 1000;
if (driftInSeconds >= config.monitored_stats_warn_delayed_task_start_in_seconds) {
logger.warn(
`Detected delay task start of ${driftInSeconds}s (which exceeds configured value of ${config.monitored_stats_warn_delayed_task_start_in_seconds}s)`
);
logAsWarn = true;
if (statusWithoutCapacity === HealthStatus.Warning) {
logLevel = LogLevel.Warn;
} else if (statusWithoutCapacity === HealthStatus.Error && !isEmpty(monitoredHealth.stats)) {
logLevel = LogLevel.Error;
}
if (logAsError) {
logger.error(`Latest Monitored Stats: ${JSON.stringify(monitoredHealth)}`);
} else if (logAsWarn) {
logger.warn(`Latest Monitored Stats: ${JSON.stringify(monitoredHealth)}`);
const message = `Latest Monitored Stats: ${JSON.stringify(monitoredHealth)}`;
if (enabled) {
const driftInSeconds = (monitoredHealth.stats.runtime?.value.drift.p99 ?? 0) / 1000;
if (
driftInSeconds >= config.monitored_stats_health_verbose_log.warn_delayed_task_start_in_seconds
) {
const taskTypes = Object.keys(monitoredHealth.stats.runtime?.value.drift_by_type ?? {})
.reduce((accum: string[], typeName) => {
if (
monitoredHealth.stats.runtime?.value.drift_by_type[typeName].p99 ===
monitoredHealth.stats.runtime?.value.drift.p99
) {
accum.push(typeName);
}
return accum;
}, [])
.join(', ');
logger.warn(
`Detected delay task start of ${driftInSeconds}s for task(s) "${taskTypes}" (which exceeds configured value of ${config.monitored_stats_health_verbose_log.warn_delayed_task_start_in_seconds}s)`
);
logLevel = LogLevel.Warn;
}
switch (logLevel) {
case LogLevel.Warn:
logger.warn(message);
break;
case LogLevel.Error:
logger.error(message);
break;
default:
logger.debug(message);
}
} else {
logger.debug(`Latest Monitored Stats: ${JSON.stringify(monitoredHealth)}`);
// This is legacy support - we used to always show this
logger.debug(message);
if (logLevel !== LogLevel.Debug && lastLogLevel === LogLevel.Debug) {
logger.warn(
`Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging`
);
}
}
lastLogLevel = logLevel;
}

View file

@ -23,7 +23,10 @@ describe('Configuration Statistics Aggregator', () => {
max_poll_inactivity_cycles: 10,
request_capacity: 1000,
monitored_aggregated_stats_refresh_rate: 5000,
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
monitored_stats_running_average_window: 50,
monitored_task_execution_thresholds: {
default: {

View file

@ -27,7 +27,10 @@ describe('createMonitoringStatsStream', () => {
max_poll_inactivity_cycles: 10,
request_capacity: 1000,
monitored_aggregated_stats_refresh_rate: 5000,
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
monitored_stats_running_average_window: 50,
monitored_task_execution_thresholds: {
default: {

View file

@ -103,6 +103,9 @@ type ResultFrequencySummary = ResultFrequency & {
export interface SummarizedTaskRunStat extends JsonObject {
drift: AveragedStat;
drift_by_type: {
[alertType: string]: AveragedStat;
};
load: AveragedStat;
execution: {
duration: Record<string, AveragedStat>;

View file

@ -25,7 +25,10 @@ describe('TaskManagerPlugin', () => {
max_poll_inactivity_cycles: 10,
request_capacity: 1000,
monitored_aggregated_stats_refresh_rate: 5000,
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
monitored_stats_required_freshness: 5000,
monitored_stats_running_average_window: 50,
monitored_task_execution_thresholds: {
@ -56,7 +59,10 @@ describe('TaskManagerPlugin', () => {
max_poll_inactivity_cycles: 10,
request_capacity: 1000,
monitored_aggregated_stats_refresh_rate: 5000,
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
monitored_stats_required_freshness: 5000,
monitored_stats_running_average_window: 50,
monitored_task_execution_thresholds: {

View file

@ -45,7 +45,10 @@ describe('TaskPollingLifecycle', () => {
max_poll_inactivity_cycles: 10,
request_capacity: 1000,
monitored_aggregated_stats_refresh_rate: 5000,
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
monitored_stats_required_freshness: 5000,
monitored_stats_running_average_window: 50,
monitored_task_execution_thresholds: {

View file

@ -67,7 +67,10 @@ describe('healthRoute', () => {
id,
getTaskManagerConfig({
monitored_stats_required_freshness: 1000,
monitored_stats_warn_delayed_task_start_in_seconds: 100,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 100,
},
monitored_aggregated_stats_refresh_rate: 60000,
})
);
@ -114,7 +117,10 @@ describe('healthRoute', () => {
id,
getTaskManagerConfig({
monitored_stats_required_freshness: 1000,
monitored_stats_warn_delayed_task_start_in_seconds: 120,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 120,
},
monitored_aggregated_stats_refresh_rate: 60000,
})
);
@ -173,7 +179,10 @@ describe('healthRoute', () => {
id,
getTaskManagerConfig({
monitored_stats_required_freshness: 1000,
monitored_stats_warn_delayed_task_start_in_seconds: 120,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 120,
},
monitored_aggregated_stats_refresh_rate: 60000,
})
);