[Metrics UI] Fix No Data alerts on Metric Threshold with groupBy (#111465) (#113021)

* [Metrics UI] Fix No Data alerts on Metric Threshold with groupBy

* Fix typecheck

* Add integration test for no data groupBy scenario

* Uncomment test files

* Uncomment test files

* Reset stored groups when groupBy parameter changes

* Test for groupBy arrays

* Fix initial No Data result when no groups are detected yet

* Fix linting error

* Fix detecting groups with doc count 0

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>

Co-authored-by: Zacqary Adam Xeper <Zacqary@users.noreply.github.com>
This commit is contained in:
Kibana Machine 2021-09-23 16:03:45 -04:00 committed by GitHub
parent 55d4791afd
commit 4c6ffc70e6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 289 additions and 39 deletions

View file

@ -64,6 +64,7 @@ export const evaluateAlert = <Params extends EvaluatedAlertParams = EvaluatedAle
esClient: ElasticsearchClient,
params: Params,
config: InfraSource['configuration'],
prevGroups: string[],
timeframe?: { start?: number; end: number }
) => {
const { criteria, groupBy, filterQuery, shouldDropPartialBuckets } = params;
@ -91,21 +92,53 @@ export const evaluateAlert = <Params extends EvaluatedAlertParams = EvaluatedAle
: [false];
};
return mapValues(currentValues, (points: any[] | typeof NaN | null) => {
if (isTooManyBucketsPreviewException(points)) throw points;
return {
...criterion,
metric: criterion.metric ?? DOCUMENT_COUNT_I18N,
currentValue: Array.isArray(points) ? last(points)?.value : NaN,
timestamp: Array.isArray(points) ? last(points)?.key : NaN,
shouldFire: pointsEvaluator(points, threshold, comparator),
shouldWarn: pointsEvaluator(points, warningThreshold, warningComparator),
isNoData: Array.isArray(points)
? points.map((point) => point?.value === null || point === null)
: [points === null],
isError: isNaN(Array.isArray(points) ? last(points)?.value : points),
};
});
// If any previous groups are no longer being reported, backfill them with null values
const currentGroups = Object.keys(currentValues);
const missingGroups = prevGroups.filter((g) => !currentGroups.includes(g));
if (currentGroups.length === 0 && missingGroups.length === 0) {
missingGroups.push(UNGROUPED_FACTORY_KEY);
}
const backfillTimestamp =
last(last(Object.values(currentValues)))?.key ?? new Date().toISOString();
const backfilledPrevGroups: Record<
string,
Array<{ key: string; value: number }>
> = missingGroups.reduce(
(result, group) => ({
...result,
[group]: [
{
key: backfillTimestamp,
value: criterion.aggType === Aggregators.COUNT ? 0 : null,
},
],
}),
{}
);
const currentValuesWithBackfilledPrevGroups = {
...currentValues,
...backfilledPrevGroups,
};
return mapValues(
currentValuesWithBackfilledPrevGroups,
(points: any[] | typeof NaN | null) => {
if (isTooManyBucketsPreviewException(points)) throw points;
return {
...criterion,
metric: criterion.metric ?? DOCUMENT_COUNT_I18N,
currentValue: Array.isArray(points) ? last(points)?.value : NaN,
timestamp: Array.isArray(points) ? last(points)?.key : NaN,
shouldFire: pointsEvaluator(points, threshold, comparator),
shouldWarn: pointsEvaluator(points, warningThreshold, warningComparator),
isNoData: Array.isArray(points)
? points.map((point) => point?.value === null || point === null)
: [points === null],
isError: isNaN(Array.isArray(points) ? last(points)?.value : points),
};
}
);
})
);
};
@ -119,7 +152,7 @@ const getMetric: (
filterQuery: string | undefined,
timeframe?: { start?: number; end: number },
shouldDropPartialBuckets?: boolean
) => Promise<Record<string, number[]>> = async function (
) => Promise<Record<string, Array<{ key: string; value: number }>>> = async function (
esClient,
params,
index,

View file

@ -37,10 +37,13 @@ let persistAlertInstances = false; // eslint-disable-line prefer-const
type TestRuleState = Record<string, unknown> & {
aRuleStateKey: string;
groups: string[];
groupBy?: string | string[];
};
const initialRuleState: TestRuleState = {
aRuleStateKey: 'INITIAL_RULE_STATE_VALUE',
groups: [],
};
const mockOptions = {
@ -90,6 +93,7 @@ const mockOptions = {
describe('The metric threshold alert type', () => {
describe('querying the entire infrastructure', () => {
afterAll(() => clearInstances());
const instanceID = '*';
const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') =>
executor({
@ -157,20 +161,29 @@ describe('The metric threshold alert type', () => {
});
describe('querying with a groupBy parameter', () => {
const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') =>
afterAll(() => clearInstances());
const execute = (
comparator: Comparator,
threshold: number[],
groupBy: string[] = ['something'],
metric?: string,
state?: any
) =>
executor({
...mockOptions,
services,
params: {
groupBy: 'something',
groupBy,
criteria: [
{
...baseNonCountCriterion,
comparator,
threshold,
metric: metric ?? baseNonCountCriterion.metric,
},
],
},
state: state ?? mockOptions.state.wrapped,
});
const instanceIdA = 'a';
const instanceIdB = 'b';
@ -194,9 +207,35 @@ describe('The metric threshold alert type', () => {
expect(mostRecentAction(instanceIdA).action.group).toBe('a');
expect(mostRecentAction(instanceIdB).action.group).toBe('b');
});
test('reports previous groups and the groupBy parameter in its state', async () => {
const stateResult = await execute(Comparator.GT, [0.75]);
expect(stateResult.groups).toEqual(expect.arrayContaining(['a', 'b']));
expect(stateResult.groupBy).toEqual(['something']);
});
test('persists previous groups that go missing, until the groupBy param changes', async () => {
const stateResult1 = await execute(Comparator.GT, [0.75], ['something'], 'test.metric.2');
expect(stateResult1.groups).toEqual(expect.arrayContaining(['a', 'b', 'c']));
const stateResult2 = await execute(
Comparator.GT,
[0.75],
['something'],
'test.metric.1',
stateResult1
);
expect(stateResult2.groups).toEqual(expect.arrayContaining(['a', 'b', 'c']));
const stateResult3 = await execute(
Comparator.GT,
[0.75],
['something', 'something-else'],
'test.metric.1',
stateResult2
);
expect(stateResult3.groups).toEqual(expect.arrayContaining(['a', 'b']));
});
});
describe('querying with multiple criteria', () => {
afterAll(() => clearInstances());
const execute = (
comparator: Comparator,
thresholdA: number[],
@ -257,6 +296,7 @@ describe('The metric threshold alert type', () => {
});
});
describe('querying with the count aggregator', () => {
afterAll(() => clearInstances());
const instanceID = '*';
const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') =>
executor({
@ -279,8 +319,47 @@ describe('The metric threshold alert type', () => {
await execute(Comparator.LT, [0.5]);
expect(mostRecentAction(instanceID)).toBe(undefined);
});
describe('with a groupBy parameter', () => {
const executeGroupBy = (
comparator: Comparator,
threshold: number[],
sourceId: string = 'default',
state?: any
) =>
executor({
...mockOptions,
services,
params: {
sourceId,
groupBy: 'something',
criteria: [
{
...baseCountCriterion,
comparator,
threshold,
},
],
},
state: state ?? mockOptions.state.wrapped,
});
const instanceIdA = 'a';
const instanceIdB = 'b';
test('successfully detects and alerts on a document count of 0', async () => {
const resultState = await executeGroupBy(Comparator.LT_OR_EQ, [0]);
expect(mostRecentAction(instanceIdA)).toBe(undefined);
expect(mostRecentAction(instanceIdB)).toBe(undefined);
await executeGroupBy(Comparator.LT_OR_EQ, [0], 'empty-response', resultState);
expect(mostRecentAction(instanceIdA).id).toBe(FIRED_ACTIONS.id);
expect(mostRecentAction(instanceIdB).id).toBe(FIRED_ACTIONS.id);
await executeGroupBy(Comparator.LT_OR_EQ, [0]);
expect(mostRecentAction(instanceIdA)).toBe(undefined);
expect(mostRecentAction(instanceIdB)).toBe(undefined);
});
});
});
describe('querying with the p99 aggregator', () => {
afterAll(() => clearInstances());
const instanceID = '*';
const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') =>
executor({
@ -306,6 +385,7 @@ describe('The metric threshold alert type', () => {
});
});
describe('querying with the p95 aggregator', () => {
afterAll(() => clearInstances());
const instanceID = '*';
const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') =>
executor({
@ -332,6 +412,7 @@ describe('The metric threshold alert type', () => {
});
});
describe("querying a metric that hasn't reported data", () => {
afterAll(() => clearInstances());
const instanceID = '*';
const execute = (alertOnNoData: boolean, sourceId: string = 'default') =>
executor({
@ -360,7 +441,51 @@ describe('The metric threshold alert type', () => {
});
});
describe('querying a groupBy alert that starts reporting no data, and then later reports data', () => {
afterAll(() => clearInstances());
const instanceID = '*';
const instanceIdA = 'a';
const instanceIdB = 'b';
const execute = (metric: string, state?: any) =>
executor({
...mockOptions,
services,
params: {
groupBy: 'something',
sourceId: 'default',
criteria: [
{
...baseNonCountCriterion,
comparator: Comparator.GT,
threshold: [0],
metric,
},
],
alertOnNoData: true,
},
state: state ?? mockOptions.state.wrapped,
});
const resultState: any[] = [];
test('first sends a No Data alert with the * group, but then reports groups when data is available', async () => {
resultState.push(await execute('test.metric.3'));
expect(mostRecentAction(instanceID).id).toBe(FIRED_ACTIONS.id);
resultState.push(await execute('test.metric.3', resultState.pop()));
expect(mostRecentAction(instanceID).id).toBe(FIRED_ACTIONS.id);
resultState.push(await execute('test.metric.1', resultState.pop()));
expect(mostRecentAction(instanceID)).toBe(undefined);
expect(mostRecentAction(instanceIdA).id).toBe(FIRED_ACTIONS.id);
expect(mostRecentAction(instanceIdB).id).toBe(FIRED_ACTIONS.id);
});
test('sends No Data alerts for the previously detected groups when they stop reporting data, but not the * group', async () => {
await execute('test.metric.3', resultState.pop());
expect(mostRecentAction(instanceID)).toBe(undefined);
expect(mostRecentAction(instanceIdA).id).toBe(FIRED_ACTIONS.id);
expect(mostRecentAction(instanceIdB).id).toBe(FIRED_ACTIONS.id);
});
});
describe("querying a rate-aggregated metric that hasn't reported data", () => {
afterAll(() => clearInstances());
const instanceID = '*';
const execute = (sourceId: string = 'default') =>
executor({
@ -439,6 +564,7 @@ describe('The metric threshold alert type', () => {
*/
describe('querying a metric with a percentage metric', () => {
afterAll(() => clearInstances());
const instanceID = '*';
const execute = () =>
executor({
@ -497,7 +623,15 @@ const services: AlertServicesMock &
services.scopedClusterClient.asCurrentUser.search.mockImplementation((params?: any): any => {
const from = params?.body.query.bool.filter[0]?.range['@timestamp'].gte;
if (params.index === 'alternatebeat-*') return mocks.changedSourceIdResponse(from);
if (params.index === 'empty-response') return mocks.emptyMetricResponse;
const metric = params?.body.query.bool.filter[1]?.exists.field;
if (metric === 'test.metric.3') {
return elasticsearchClientMock.createSuccessTransportRequestPromise(
params?.body.aggs.aggregatedIntervals?.aggregations.aggregatedValueMax
? mocks.emptyRateResponse
: mocks.emptyMetricResponse
);
}
if (params?.body.aggs.groupings) {
if (params?.body.aggs.groupings.composite.after) {
return elasticsearchClientMock.createSuccessTransportRequestPromise(
@ -517,12 +651,6 @@ services.scopedClusterClient.asCurrentUser.search.mockImplementation((params?: a
return elasticsearchClientMock.createSuccessTransportRequestPromise(
mocks.alternateMetricResponse()
);
} else if (metric === 'test.metric.3') {
return elasticsearchClientMock.createSuccessTransportRequestPromise(
params?.body.aggs.aggregatedIntervals.aggregations.aggregatedValueMax
? mocks.emptyRateResponse
: mocks.emptyMetricResponse
);
}
return elasticsearchClientMock.createSuccessTransportRequestPromise(mocks.basicMetricResponse());
});
@ -534,6 +662,13 @@ services.savedObjectsClient.get.mockImplementation(async (type: string, sourceId
type,
references: [],
};
if (sourceId === 'empty-response')
return {
id: 'empty',
attributes: { metricAlias: 'empty-response' },
type,
references: [],
};
return { id: 'default', attributes: { metricAlias: 'metricbeat-*' }, type, references: [] };
});
@ -561,7 +696,13 @@ services.alertInstanceFactory.mockImplementation((instanceID: string) => {
});
function mostRecentAction(id: string) {
return alertInstances.get(id)!.actionQueue.pop();
const instance = alertInstances.get(id);
if (!instance) return undefined;
return instance.actionQueue.pop();
}
function clearInstances() {
alertInstances.clear();
}
const baseNonCountCriterion: Pick<

View file

@ -5,7 +5,7 @@
* 2.0.
*/
import { first, last } from 'lodash';
import { first, last, isEqual } from 'lodash';
import { i18n } from '@kbn/i18n';
import moment from 'moment';
import { ALERT_REASON } from '@kbn/rule-data-utils';
@ -24,12 +24,16 @@ import {
// buildRecoveredAlertReason,
stateToAlertMessage,
} from '../common/messages';
import { UNGROUPED_FACTORY_KEY } from '../common/utils';
import { createFormatter } from '../../../../common/formatters';
import { AlertStates, Comparator } from './types';
import { evaluateAlert, EvaluatedAlertParams } from './lib/evaluate_alert';
export type MetricThresholdAlertTypeParams = Record<string, any>;
export type MetricThresholdAlertTypeState = AlertTypeState; // no specific state used
export type MetricThresholdAlertTypeState = AlertTypeState & {
groups: string[];
groupBy?: string | string[];
};
export type MetricThresholdAlertInstanceState = AlertInstanceState; // no specific instace state used
export type MetricThresholdAlertInstanceContext = AlertInstanceContext; // no specific instace state used
@ -58,7 +62,7 @@ export const createMetricThresholdExecutor = (libs: InfraBackendLibs) =>
MetricThresholdAlertInstanceContext,
MetricThresholdAllowedActionGroups
>(async function (options) {
const { services, params } = options;
const { services, params, state } = options;
const { criteria } = params;
if (criteria.length === 0) throw new Error('Cannot execute an alert with 0 conditions');
const { alertWithLifecycle, savedObjectsClient } = services;
@ -80,14 +84,28 @@ export const createMetricThresholdExecutor = (libs: InfraBackendLibs) =>
sourceId || 'default'
);
const config = source.configuration;
const previousGroupBy = state.groupBy;
const prevGroups = isEqual(previousGroupBy, params.groupBy)
? // Filter out the * key from the previous groups, only include it if it's one of
// the current groups. In case of a groupBy alert that starts out with no data and no
// groups, we don't want to persist the existence of the * alert instance
state.groups?.filter((g) => g !== UNGROUPED_FACTORY_KEY) ?? []
: [];
const alertResults = await evaluateAlert(
services.scopedClusterClient.asCurrentUser,
params as EvaluatedAlertParams,
config
config,
prevGroups
);
// Because each alert result has the same group definitions, just grab the groups from the first one.
const groups = Object.keys(first(alertResults)!);
const resultGroups = Object.keys(first(alertResults)!);
// Merge the list of currently fetched groups and previous groups, and uniquify them. This is necessary for reporting
// no data results on groups that get removed
const groups = [...new Set([...prevGroups, ...resultGroups])];
for (const group of groups) {
// AND logic; all criteria must be across the threshold
const shouldAlertFire = alertResults.every((result) =>
@ -169,6 +187,8 @@ export const createMetricThresholdExecutor = (libs: InfraBackendLibs) =>
});
}
}
return { groups, groupBy: params.groupBy };
});
export const FIRED_ACTIONS = {

View file

@ -199,12 +199,20 @@ export const alternateCompositeResponse = (from: number) => ({
buckets: bucketsA(from),
},
},
{
key: {
groupBy0: 'c',
},
aggregatedIntervals: {
buckets: bucketsC(from),
},
},
],
},
},
hits: {
total: {
value: 2,
value: 3,
},
},
});

View file

@ -24,7 +24,7 @@ export const getAllCompositeData = async <
const { body: response } = await esClientSearch(options);
// Nothing available, return the previous buckets.
if (response.hits.total.value === 0) {
if (response.hits?.total.value === 0) {
return previousBuckets;
}

View file

@ -31,7 +31,8 @@ export const DATES = {
'alert-test-data': {
gauge: {
min: 1609459200000, // '2022-01-01T00:00:00Z'
max: 1609462800000, // '2021-01-01T01:00:00Z'
max: 1609462800000, // '2021-01-01T01:00:00Z',
midpoint: 1609461000000, // '2021-01-01T00:30:00Z'
},
rate: {
min: 1609545600000, // '2021-01-02T00:00:00Z'

View file

@ -100,7 +100,7 @@ export default function ({ getService }: FtrProviderContext) {
],
};
const timeFrame = { end: gauge.max };
const results = await evaluateAlert(esClient, params, configuration, timeFrame);
const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
expect(results).to.eql([
{
'*': {
@ -123,7 +123,7 @@ export default function ({ getService }: FtrProviderContext) {
it('should alert on the last value when the end date is the same as the last event', async () => {
const params = { ...baseParams };
const timeFrame = { end: gauge.max };
const results = await evaluateAlert(esClient, params, configuration, timeFrame);
const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
expect(results).to.eql([
{
'*': {
@ -160,7 +160,7 @@ export default function ({ getService }: FtrProviderContext) {
],
};
const timeFrame = { end: gauge.max };
const results = await evaluateAlert(esClient, params, configuration, timeFrame);
const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
expect(results).to.eql([
{
dev: {
@ -200,7 +200,7 @@ export default function ({ getService }: FtrProviderContext) {
groupBy: ['env'],
};
const timeFrame = { end: gauge.max };
const results = await evaluateAlert(esClient, params, configuration, timeFrame);
const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
expect(results).to.eql([
{
dev: {
@ -234,6 +234,53 @@ export default function ({ getService }: FtrProviderContext) {
},
]);
});
it('should report no data when one of the groups has a data gap', async () => {
const params = {
...baseParams,
groupBy: ['env'],
};
const timeFrame = { end: gauge.midpoint };
const results = await evaluateAlert(
esClient,
params,
configuration,
['dev', 'prod'],
timeFrame
);
expect(results).to.eql([
{
dev: {
timeSize: 5,
timeUnit: 'm',
threshold: [1],
comparator: '>=',
aggType: 'sum',
metric: 'value',
currentValue: null,
timestamp: '2021-01-01T00:25:00.000Z',
shouldFire: [false],
shouldWarn: [false],
isNoData: [true],
isError: false,
},
prod: {
timeSize: 5,
timeUnit: 'm',
threshold: [1],
comparator: '>=',
aggType: 'sum',
metric: 'value',
currentValue: 0,
timestamp: '2021-01-01T00:25:00.000Z',
shouldFire: [false],
shouldWarn: [false],
isNoData: [false],
isError: false,
},
},
]);
});
});
});
@ -254,7 +301,7 @@ export default function ({ getService }: FtrProviderContext) {
],
};
const timeFrame = { end: rate.max };
const results = await evaluateAlert(esClient, params, configuration, timeFrame);
const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
expect(results).to.eql([
{
'*': {
@ -294,7 +341,7 @@ export default function ({ getService }: FtrProviderContext) {
],
};
const timeFrame = { end: rate.max };
const results = await evaluateAlert(esClient, params, configuration, timeFrame);
const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
expect(results).to.eql([
{
dev: {