[Metrics UI] Fix No Data alerts on Metric Threshold with groupBy (#111465) (#113021)

* [Metrics UI] Fix No Data alerts on Metric Threshold with groupBy * Fix typecheck * Add integration test for no data groupBy scenario * Uncomment test files * Uncomment test files * Reset stored groups when groupBy parameter changes * Test for groupBy arrays * Fix initial No Data result when no groups are detected yet * Fix linting error * Fix detecting groups with doc count 0 Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com> Co-authored-by: Zacqary Adam Xeper <Zacqary@users.noreply.github.com>
2021-09-23 16:03:45 -04:00 · 2021-09-23 16:03:45 -04:00 · 4c6ffc70e6
parent 55d4791afd
commit 4c6ffc70e6
8 changed files with 289 additions and 39 deletions
--- a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/lib/evaluate_alert.ts
+++ b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/lib/evaluate_alert.ts
@ -64,6 +64,7 @@ export const evaluateAlert = <Params extends EvaluatedAlertParams = EvaluatedAle
  esClient: ElasticsearchClient,
  params: Params,
  config: InfraSource['configuration'],
+  prevGroups: string[],
  timeframe?: { start?: number; end: number }
 ) => {
  const { criteria, groupBy, filterQuery, shouldDropPartialBuckets } = params;
@ -91,21 +92,53 @@ export const evaluateAlert = <Params extends EvaluatedAlertParams = EvaluatedAle
          : [false];
      };

-      return mapValues(currentValues, (points: any[] | typeof NaN | null) => {
-        if (isTooManyBucketsPreviewException(points)) throw points;
-        return {
-          ...criterion,
-          metric: criterion.metric ?? DOCUMENT_COUNT_I18N,
-          currentValue: Array.isArray(points) ? last(points)?.value : NaN,
-          timestamp: Array.isArray(points) ? last(points)?.key : NaN,
-          shouldFire: pointsEvaluator(points, threshold, comparator),
-          shouldWarn: pointsEvaluator(points, warningThreshold, warningComparator),
-          isNoData: Array.isArray(points)
-            ? points.map((point) => point?.value === null || point === null)
-            : [points === null],
-          isError: isNaN(Array.isArray(points) ? last(points)?.value : points),
-        };
-      });
+      // If any previous groups are no longer being reported, backfill them with null values
+      const currentGroups = Object.keys(currentValues);
+
+      const missingGroups = prevGroups.filter((g) => !currentGroups.includes(g));
+      if (currentGroups.length === 0 && missingGroups.length === 0) {
+        missingGroups.push(UNGROUPED_FACTORY_KEY);
+      }
+      const backfillTimestamp =
+        last(last(Object.values(currentValues)))?.key ?? new Date().toISOString();
+      const backfilledPrevGroups: Record<
+        string,
+        Array<{ key: string; value: number }>
+      > = missingGroups.reduce(
+        (result, group) => ({
+          ...result,
+          [group]: [
+            {
+              key: backfillTimestamp,
+              value: criterion.aggType === Aggregators.COUNT ? 0 : null,
+            },
+          ],
+        }),
+        {}
+      );
+      const currentValuesWithBackfilledPrevGroups = {
+        ...currentValues,
+        ...backfilledPrevGroups,
+      };
+
+      return mapValues(
+        currentValuesWithBackfilledPrevGroups,
+        (points: any[] | typeof NaN | null) => {
+          if (isTooManyBucketsPreviewException(points)) throw points;
+          return {
+            ...criterion,
+            metric: criterion.metric ?? DOCUMENT_COUNT_I18N,
+            currentValue: Array.isArray(points) ? last(points)?.value : NaN,
+            timestamp: Array.isArray(points) ? last(points)?.key : NaN,
+            shouldFire: pointsEvaluator(points, threshold, comparator),
+            shouldWarn: pointsEvaluator(points, warningThreshold, warningComparator),
+            isNoData: Array.isArray(points)
+              ? points.map((point) => point?.value === null || point === null)
+              : [points === null],
+            isError: isNaN(Array.isArray(points) ? last(points)?.value : points),
+          };
+        }
+      );
    })
  );
 };
@ -119,7 +152,7 @@ const getMetric: (
  filterQuery: string | undefined,
  timeframe?: { start?: number; end: number },
  shouldDropPartialBuckets?: boolean
-) => Promise<Record<string, number[]>> = async function (
+) => Promise<Record<string, Array<{ key: string; value: number }>>> = async function (
  esClient,
  params,
  index,
--- a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.test.ts
+++ b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.test.ts
@ -37,10 +37,13 @@ let persistAlertInstances = false; // eslint-disable-line prefer-const

 type TestRuleState = Record<string, unknown> & {
  aRuleStateKey: string;
+  groups: string[];
+  groupBy?: string | string[];
 };

 const initialRuleState: TestRuleState = {
  aRuleStateKey: 'INITIAL_RULE_STATE_VALUE',
+  groups: [],
 };

 const mockOptions = {
@ -90,6 +93,7 @@ const mockOptions = {

 describe('The metric threshold alert type', () => {
  describe('querying the entire infrastructure', () => {
+    afterAll(() => clearInstances());
    const instanceID = '*';
    const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') =>
      executor({
@ -157,20 +161,29 @@ describe('The metric threshold alert type', () => {
  });

  describe('querying with a groupBy parameter', () => {
-    const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') =>
+    afterAll(() => clearInstances());
+    const execute = (
+      comparator: Comparator,
+      threshold: number[],
+      groupBy: string[] = ['something'],
+      metric?: string,
+      state?: any
+    ) =>
      executor({
        ...mockOptions,
        services,
        params: {
-          groupBy: 'something',
+          groupBy,
          criteria: [
            {
              ...baseNonCountCriterion,
              comparator,
              threshold,
+              metric: metric ?? baseNonCountCriterion.metric,
            },
          ],
        },
+        state: state ?? mockOptions.state.wrapped,
      });
    const instanceIdA = 'a';
    const instanceIdB = 'b';
@ -194,9 +207,35 @@ describe('The metric threshold alert type', () => {
      expect(mostRecentAction(instanceIdA).action.group).toBe('a');
      expect(mostRecentAction(instanceIdB).action.group).toBe('b');
    });
+    test('reports previous groups and the groupBy parameter in its state', async () => {
+      const stateResult = await execute(Comparator.GT, [0.75]);
+      expect(stateResult.groups).toEqual(expect.arrayContaining(['a', 'b']));
+      expect(stateResult.groupBy).toEqual(['something']);
+    });
+    test('persists previous groups that go missing, until the groupBy param changes', async () => {
+      const stateResult1 = await execute(Comparator.GT, [0.75], ['something'], 'test.metric.2');
+      expect(stateResult1.groups).toEqual(expect.arrayContaining(['a', 'b', 'c']));
+      const stateResult2 = await execute(
+        Comparator.GT,
+        [0.75],
+        ['something'],
+        'test.metric.1',
+        stateResult1
+      );
+      expect(stateResult2.groups).toEqual(expect.arrayContaining(['a', 'b', 'c']));
+      const stateResult3 = await execute(
+        Comparator.GT,
+        [0.75],
+        ['something', 'something-else'],
+        'test.metric.1',
+        stateResult2
+      );
+      expect(stateResult3.groups).toEqual(expect.arrayContaining(['a', 'b']));
+    });
  });

  describe('querying with multiple criteria', () => {
+    afterAll(() => clearInstances());
    const execute = (
      comparator: Comparator,
      thresholdA: number[],
@ -257,6 +296,7 @@ describe('The metric threshold alert type', () => {
    });
  });
  describe('querying with the count aggregator', () => {
+    afterAll(() => clearInstances());
    const instanceID = '*';
    const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') =>
      executor({
@ -279,8 +319,47 @@ describe('The metric threshold alert type', () => {
      await execute(Comparator.LT, [0.5]);
      expect(mostRecentAction(instanceID)).toBe(undefined);
    });
+    describe('with a groupBy parameter', () => {
+      const executeGroupBy = (
+        comparator: Comparator,
+        threshold: number[],
+        sourceId: string = 'default',
+        state?: any
+      ) =>
+        executor({
+          ...mockOptions,
+          services,
+          params: {
+            sourceId,
+            groupBy: 'something',
+            criteria: [
+              {
+                ...baseCountCriterion,
+                comparator,
+                threshold,
+              },
+            ],
+          },
+          state: state ?? mockOptions.state.wrapped,
+        });
+      const instanceIdA = 'a';
+      const instanceIdB = 'b';
+
+      test('successfully detects and alerts on a document count of 0', async () => {
+        const resultState = await executeGroupBy(Comparator.LT_OR_EQ, [0]);
+        expect(mostRecentAction(instanceIdA)).toBe(undefined);
+        expect(mostRecentAction(instanceIdB)).toBe(undefined);
+        await executeGroupBy(Comparator.LT_OR_EQ, [0], 'empty-response', resultState);
+        expect(mostRecentAction(instanceIdA).id).toBe(FIRED_ACTIONS.id);
+        expect(mostRecentAction(instanceIdB).id).toBe(FIRED_ACTIONS.id);
+        await executeGroupBy(Comparator.LT_OR_EQ, [0]);
+        expect(mostRecentAction(instanceIdA)).toBe(undefined);
+        expect(mostRecentAction(instanceIdB)).toBe(undefined);
+      });
+    });
  });
  describe('querying with the p99 aggregator', () => {
+    afterAll(() => clearInstances());
    const instanceID = '*';
    const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') =>
      executor({
@ -306,6 +385,7 @@ describe('The metric threshold alert type', () => {
    });
  });
  describe('querying with the p95 aggregator', () => {
+    afterAll(() => clearInstances());
    const instanceID = '*';
    const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') =>
      executor({
@ -332,6 +412,7 @@ describe('The metric threshold alert type', () => {
    });
  });
  describe("querying a metric that hasn't reported data", () => {
+    afterAll(() => clearInstances());
    const instanceID = '*';
    const execute = (alertOnNoData: boolean, sourceId: string = 'default') =>
      executor({
@ -360,7 +441,51 @@ describe('The metric threshold alert type', () => {
    });
  });

+  describe('querying a groupBy alert that starts reporting no data, and then later reports data', () => {
+    afterAll(() => clearInstances());
+    const instanceID = '*';
+    const instanceIdA = 'a';
+    const instanceIdB = 'b';
+    const execute = (metric: string, state?: any) =>
+      executor({
+        ...mockOptions,
+        services,
+        params: {
+          groupBy: 'something',
+          sourceId: 'default',
+          criteria: [
+            {
+              ...baseNonCountCriterion,
+              comparator: Comparator.GT,
+              threshold: [0],
+              metric,
+            },
+          ],
+          alertOnNoData: true,
+        },
+        state: state ?? mockOptions.state.wrapped,
+      });
+    const resultState: any[] = [];
+    test('first sends a No Data alert with the * group, but then reports groups when data is available', async () => {
+      resultState.push(await execute('test.metric.3'));
+      expect(mostRecentAction(instanceID).id).toBe(FIRED_ACTIONS.id);
+      resultState.push(await execute('test.metric.3', resultState.pop()));
+      expect(mostRecentAction(instanceID).id).toBe(FIRED_ACTIONS.id);
+      resultState.push(await execute('test.metric.1', resultState.pop()));
+      expect(mostRecentAction(instanceID)).toBe(undefined);
+      expect(mostRecentAction(instanceIdA).id).toBe(FIRED_ACTIONS.id);
+      expect(mostRecentAction(instanceIdB).id).toBe(FIRED_ACTIONS.id);
+    });
+    test('sends No Data alerts for the previously detected groups when they stop reporting data, but not the * group', async () => {
+      await execute('test.metric.3', resultState.pop());
+      expect(mostRecentAction(instanceID)).toBe(undefined);
+      expect(mostRecentAction(instanceIdA).id).toBe(FIRED_ACTIONS.id);
+      expect(mostRecentAction(instanceIdB).id).toBe(FIRED_ACTIONS.id);
+    });
+  });
+
  describe("querying a rate-aggregated metric that hasn't reported data", () => {
+    afterAll(() => clearInstances());
    const instanceID = '*';
    const execute = (sourceId: string = 'default') =>
      executor({
@ -439,6 +564,7 @@ describe('The metric threshold alert type', () => {
  */

  describe('querying a metric with a percentage metric', () => {
+    afterAll(() => clearInstances());
    const instanceID = '*';
    const execute = () =>
      executor({
@ -497,7 +623,15 @@ const services: AlertServicesMock &
 services.scopedClusterClient.asCurrentUser.search.mockImplementation((params?: any): any => {
  const from = params?.body.query.bool.filter[0]?.range['@timestamp'].gte;
  if (params.index === 'alternatebeat-*') return mocks.changedSourceIdResponse(from);
+  if (params.index === 'empty-response') return mocks.emptyMetricResponse;
  const metric = params?.body.query.bool.filter[1]?.exists.field;
+  if (metric === 'test.metric.3') {
+    return elasticsearchClientMock.createSuccessTransportRequestPromise(
+      params?.body.aggs.aggregatedIntervals?.aggregations.aggregatedValueMax
+        ? mocks.emptyRateResponse
+        : mocks.emptyMetricResponse
+    );
+  }
  if (params?.body.aggs.groupings) {
    if (params?.body.aggs.groupings.composite.after) {
      return elasticsearchClientMock.createSuccessTransportRequestPromise(
@ -517,12 +651,6 @@ services.scopedClusterClient.asCurrentUser.search.mockImplementation((params?: a
    return elasticsearchClientMock.createSuccessTransportRequestPromise(
      mocks.alternateMetricResponse()
    );
-  } else if (metric === 'test.metric.3') {
-    return elasticsearchClientMock.createSuccessTransportRequestPromise(
-      params?.body.aggs.aggregatedIntervals.aggregations.aggregatedValueMax
-        ? mocks.emptyRateResponse
-        : mocks.emptyMetricResponse
-    );
  }
  return elasticsearchClientMock.createSuccessTransportRequestPromise(mocks.basicMetricResponse());
 });
@ -534,6 +662,13 @@ services.savedObjectsClient.get.mockImplementation(async (type: string, sourceId
      type,
      references: [],
    };
+  if (sourceId === 'empty-response')
+    return {
+      id: 'empty',
+      attributes: { metricAlias: 'empty-response' },
+      type,
+      references: [],
+    };
  return { id: 'default', attributes: { metricAlias: 'metricbeat-*' }, type, references: [] };
 });

@ -561,7 +696,13 @@ services.alertInstanceFactory.mockImplementation((instanceID: string) => {
 });

 function mostRecentAction(id: string) {
-  return alertInstances.get(id)!.actionQueue.pop();
+  const instance = alertInstances.get(id);
+  if (!instance) return undefined;
+  return instance.actionQueue.pop();
+}
+
+function clearInstances() {
+  alertInstances.clear();
 }

 const baseNonCountCriterion: Pick<
--- a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.ts
+++ b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.ts
@ -5,7 +5,7 @@
 * 2.0.
 */

-import { first, last } from 'lodash';
+import { first, last, isEqual } from 'lodash';
 import { i18n } from '@kbn/i18n';
 import moment from 'moment';
 import { ALERT_REASON } from '@kbn/rule-data-utils';
@ -24,12 +24,16 @@ import {
  // buildRecoveredAlertReason,
  stateToAlertMessage,
 } from '../common/messages';
+import { UNGROUPED_FACTORY_KEY } from '../common/utils';
 import { createFormatter } from '../../../../common/formatters';
 import { AlertStates, Comparator } from './types';
 import { evaluateAlert, EvaluatedAlertParams } from './lib/evaluate_alert';

 export type MetricThresholdAlertTypeParams = Record<string, any>;
-export type MetricThresholdAlertTypeState = AlertTypeState; // no specific state used
+export type MetricThresholdAlertTypeState = AlertTypeState & {
+  groups: string[];
+  groupBy?: string | string[];
+};
 export type MetricThresholdAlertInstanceState = AlertInstanceState; // no specific instace state used
 export type MetricThresholdAlertInstanceContext = AlertInstanceContext; // no specific instace state used

@ -58,7 +62,7 @@ export const createMetricThresholdExecutor = (libs: InfraBackendLibs) =>
    MetricThresholdAlertInstanceContext,
    MetricThresholdAllowedActionGroups
  >(async function (options) {
-    const { services, params } = options;
+    const { services, params, state } = options;
    const { criteria } = params;
    if (criteria.length === 0) throw new Error('Cannot execute an alert with 0 conditions');
    const { alertWithLifecycle, savedObjectsClient } = services;
@ -80,14 +84,28 @@ export const createMetricThresholdExecutor = (libs: InfraBackendLibs) =>
      sourceId || 'default'
    );
    const config = source.configuration;
+
+    const previousGroupBy = state.groupBy;
+    const prevGroups = isEqual(previousGroupBy, params.groupBy)
+      ? // Filter out the * key from the previous groups, only include it if it's one of
+        // the current groups. In case of a groupBy alert that starts out with no data and no
+        // groups, we don't want to persist the existence of the * alert instance
+        state.groups?.filter((g) => g !== UNGROUPED_FACTORY_KEY) ?? []
+      : [];
+
    const alertResults = await evaluateAlert(
      services.scopedClusterClient.asCurrentUser,
      params as EvaluatedAlertParams,
-      config
+      config,
+      prevGroups
    );

    // Because each alert result has the same group definitions, just grab the groups from the first one.
-    const groups = Object.keys(first(alertResults)!);
+    const resultGroups = Object.keys(first(alertResults)!);
+    // Merge the list of currently fetched groups and previous groups, and uniquify them. This is necessary for reporting
+    // no data results on groups that get removed
+    const groups = [...new Set([...prevGroups, ...resultGroups])];
+
    for (const group of groups) {
      // AND logic; all criteria must be across the threshold
      const shouldAlertFire = alertResults.every((result) =>
@ -169,6 +187,8 @@ export const createMetricThresholdExecutor = (libs: InfraBackendLibs) =>
        });
      }
    }
+
+    return { groups, groupBy: params.groupBy };
  });

 export const FIRED_ACTIONS = {
--- a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/test_mocks.ts
+++ b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/test_mocks.ts
@ -199,12 +199,20 @@ export const alternateCompositeResponse = (from: number) => ({
            buckets: bucketsA(from),
          },
        },
+        {
+          key: {
+            groupBy0: 'c',
+          },
+          aggregatedIntervals: {
+            buckets: bucketsC(from),
+          },
+        },
      ],
    },
  },
  hits: {
    total: {
-      value: 2,
+      value: 3,
    },
  },
 });
--- a/x-pack/plugins/infra/server/utils/get_all_composite_data.ts
+++ b/x-pack/plugins/infra/server/utils/get_all_composite_data.ts
@ -24,7 +24,7 @@ export const getAllCompositeData = async <
  const { body: response } = await esClientSearch(options);

  // Nothing available, return the previous buckets.
-  if (response.hits.total.value === 0) {
+  if (response.hits?.total.value === 0) {
    return previousBuckets;
  }

--- a/x-pack/test/api_integration/apis/metrics_ui/constants.ts
+++ b/x-pack/test/api_integration/apis/metrics_ui/constants.ts
@ -31,7 +31,8 @@ export const DATES = {
  'alert-test-data': {
    gauge: {
      min: 1609459200000, // '2022-01-01T00:00:00Z'
-      max: 1609462800000, // '2021-01-01T01:00:00Z'
+      max: 1609462800000, // '2021-01-01T01:00:00Z',
+      midpoint: 1609461000000, // '2021-01-01T00:30:00Z'
    },
    rate: {
      min: 1609545600000, // '2021-01-02T00:00:00Z'
--- a/x-pack/test/api_integration/apis/metrics_ui/metric_threshold_alert.ts
+++ b/x-pack/test/api_integration/apis/metrics_ui/metric_threshold_alert.ts
@ -100,7 +100,7 @@ export default function ({ getService }: FtrProviderContext) {
            ],
          };
          const timeFrame = { end: gauge.max };
-          const results = await evaluateAlert(esClient, params, configuration, timeFrame);
+          const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
          expect(results).to.eql([
            {
              '*': {
@ -123,7 +123,7 @@ export default function ({ getService }: FtrProviderContext) {
        it('should alert on the last value when the end date is the same as the last event', async () => {
          const params = { ...baseParams };
          const timeFrame = { end: gauge.max };
-          const results = await evaluateAlert(esClient, params, configuration, timeFrame);
+          const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
          expect(results).to.eql([
            {
              '*': {
@ -160,7 +160,7 @@ export default function ({ getService }: FtrProviderContext) {
            ],
          };
          const timeFrame = { end: gauge.max };
-          const results = await evaluateAlert(esClient, params, configuration, timeFrame);
+          const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
          expect(results).to.eql([
            {
              dev: {
@ -200,7 +200,7 @@ export default function ({ getService }: FtrProviderContext) {
            groupBy: ['env'],
          };
          const timeFrame = { end: gauge.max };
-          const results = await evaluateAlert(esClient, params, configuration, timeFrame);
+          const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
          expect(results).to.eql([
            {
              dev: {
@ -234,6 +234,53 @@ export default function ({ getService }: FtrProviderContext) {
            },
          ]);
        });
+
+        it('should report no data when one of the groups has a data gap', async () => {
+          const params = {
+            ...baseParams,
+            groupBy: ['env'],
+          };
+          const timeFrame = { end: gauge.midpoint };
+          const results = await evaluateAlert(
+            esClient,
+            params,
+            configuration,
+            ['dev', 'prod'],
+            timeFrame
+          );
+          expect(results).to.eql([
+            {
+              dev: {
+                timeSize: 5,
+                timeUnit: 'm',
+                threshold: [1],
+                comparator: '>=',
+                aggType: 'sum',
+                metric: 'value',
+                currentValue: null,
+                timestamp: '2021-01-01T00:25:00.000Z',
+                shouldFire: [false],
+                shouldWarn: [false],
+                isNoData: [true],
+                isError: false,
+              },
+              prod: {
+                timeSize: 5,
+                timeUnit: 'm',
+                threshold: [1],
+                comparator: '>=',
+                aggType: 'sum',
+                metric: 'value',
+                currentValue: 0,
+                timestamp: '2021-01-01T00:25:00.000Z',
+                shouldFire: [false],
+                shouldWarn: [false],
+                isNoData: [false],
+                isError: false,
+              },
+            },
+          ]);
+        });
      });
    });

@ -254,7 +301,7 @@ export default function ({ getService }: FtrProviderContext) {
            ],
          };
          const timeFrame = { end: rate.max };
-          const results = await evaluateAlert(esClient, params, configuration, timeFrame);
+          const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
          expect(results).to.eql([
            {
              '*': {
@ -294,7 +341,7 @@ export default function ({ getService }: FtrProviderContext) {
            ],
          };
          const timeFrame = { end: rate.max };
-          const results = await evaluateAlert(esClient, params, configuration, timeFrame);
+          const results = await evaluateAlert(esClient, params, configuration, [], timeFrame);
          expect(results).to.eql([
            {
              dev: {
--- a/x-pack/test/functional/es_archives/infra/alerts_test_data/data.json.gz
+++ b/x-pack/test/functional/es_archives/infra/alerts_test_data/data.json.gz