[ML] Transforms/Data Frame Analytics: Fix freezing wizard for indices with massive amounts of fields. (#98259)

The transform wizard can become very slow when used with indices that have a very large number of fields (e.g. 1000+). This PR fixes it by prefetching 500 random documents to build a list of populated/used fields and passing only those on to the data grid component, instead of all available fields derived via Kibana index patterns. For example, for an out-of-the-box metricbeat index, this reduces the list of passed-on fields from 3000+ to ~120 fields. Previously, the page would freeze on load for tens of seconds and would freeze again on every rerender. With the applied update, the page loads almost instantly again and remains responsive. Note that this reduction of available fields is only applied to the data grid preview component. All fields are still available to create the configuration in the UI for groups and aggregations. These UI components, e.g. the virtualized dropdowns, can handle large lists of fields.
2021-04-28 08:23:07 +02:00 · 2021-04-28 08:23:07 +02:00 · bfb363f050
parent c24c0d38f8
commit bfb363f050
6 changed files with 205 additions and 37 deletions
--- a/x-pack/plugins/ml/public/application/data_frame_analytics/pages/analytics_creation/hooks/use_index_data.ts
+++ b/x-pack/plugins/ml/public/application/data_frame_analytics/pages/analytics_creation/hooks/use_index_data.ts
@ -50,19 +50,21 @@ function getRuntimeFieldColumns(runtimeMappings: RuntimeMappings) {
  });
 }

-function getInitialColumns(indexPattern: IndexPattern) {
+function getInitialColumns(indexPattern: IndexPattern, fieldsFilter: string[]) {
  const { fields } = newJobCapsServiceAnalytics;
-  const columns = fields.map((field: any) => {
-    const schema =
-      getDataGridSchemaFromESFieldType(field.type) || getDataGridSchemaFromKibanaFieldType(field);
+  const columns = fields
+    .filter((field) => fieldsFilter.includes(field.name))
+    .map((field) => {
+      const schema =
+        getDataGridSchemaFromESFieldType(field.type) || getDataGridSchemaFromKibanaFieldType(field);

-    return {
-      id: field.name,
-      schema,
-      isExpandable: schema !== 'boolean',
-      isRuntimeFieldColumn: false,
-    };
-  });
+      return {
+        id: field.name,
+        schema,
+        isExpandable: schema !== 'boolean',
+        isRuntimeFieldColumn: false,
+      };
+    });

  // Add runtime fields defined in index pattern to columns
  if (indexPattern) {
@ -91,10 +93,57 @@ export const useIndexData = (
  toastNotifications: CoreSetup['notifications']['toasts'],
  runtimeMappings?: RuntimeMappings
 ): UseIndexDataReturnType => {
-  const indexPatternFields = useMemo(() => getFieldsFromKibanaIndexPattern(indexPattern), [
-    indexPattern,
-  ]);
-  const [columns, setColumns] = useState<MLEuiDataGridColumn[]>(getInitialColumns(indexPattern));
+  const [indexPatternFields, setIndexPatternFields] = useState<string[]>();
+
+  // Fetch 500 random documents to determine populated fields.
+  // This is a workaround to avoid passing potentially thousands of unpopulated fields
+  // (for example, as part of filebeat/metricbeat/ECS based indices)
+  // to the data grid component which would significantly slow down the page.
+  const fetchDataGridSampleDocuments = async function () {
+    setErrorMessage('');
+    setStatus(INDEX_STATUS.LOADING);
+
+    const esSearchRequest = {
+      index: indexPattern.title,
+      body: {
+        fields: ['*'],
+        _source: false,
+        query: {
+          function_score: {
+            query: { match_all: {} },
+            random_score: {},
+          },
+        },
+        size: 500,
+      },
+    };
+
+    try {
+      const resp: IndexSearchResponse = await ml.esSearch(esSearchRequest);
+      const docs = resp.hits.hits.map((d) => getProcessedFields(d.fields ?? {}));
+
+      // Get all field names for each returned doc and flatten it
+      // to a list of unique field names used across all docs.
+      const allKibanaIndexPatternFields = getFieldsFromKibanaIndexPattern(indexPattern);
+      const populatedFields = [...new Set(docs.map(Object.keys).flat(1))].filter((d) =>
+        allKibanaIndexPatternFields.includes(d)
+      );
+
+      setStatus(INDEX_STATUS.LOADED);
+      setIndexPatternFields(populatedFields);
+    } catch (e) {
+      setErrorMessage(extractErrorMessage(e));
+      setStatus(INDEX_STATUS.ERROR);
+    }
+  };
+
+  useEffect(() => {
+    fetchDataGridSampleDocuments();
+  }, []);
+
+  const [columns, setColumns] = useState<MLEuiDataGridColumn[]>(
+    getInitialColumns(indexPattern, indexPatternFields ?? [])
+  );
  const dataGrid = useDataGrid(columns);

  const {
@ -151,7 +200,7 @@ export const useIndexData = (
          ...(combinedRuntimeMappings ? getRuntimeFieldColumns(combinedRuntimeMappings) : []),
        ]);
      } else {
-        setColumns(getInitialColumns(indexPattern));
+        setColumns(getInitialColumns(indexPattern, indexPatternFields ?? []));
      }
      setRowCount(typeof resp.hits.total === 'number' ? resp.hits.total : resp.hits.total.value);
      setRowCountRelation(
--- a/x-pack/plugins/transform/public/app/hooks/use_index_data.ts
+++ b/x-pack/plugins/transform/public/app/hooks/use_index_data.ts
@ -5,7 +5,7 @@
 * 2.0.
 */

-import { useEffect, useMemo } from 'react';
+import { useEffect, useMemo, useState } from 'react';

 import type { estypes } from '@elastic/elasticsearch';
 import type { EuiDataGridColumn } from '@elastic/eui';
@ -46,9 +46,66 @@ export const useIndexData = (
    },
  } = useAppDependencies();

-  const indexPatternFields = getFieldsFromKibanaIndexPattern(indexPattern);
+  const [indexPatternFields, setIndexPatternFields] = useState<string[]>();
+
+  // Fetch 500 random documents to determine populated fields.
+  // This is a workaround to avoid passing potentially thousands of unpopulated fields
+  // (for example, as part of filebeat/metricbeat/ECS based indices)
+  // to the data grid component which would significantly slow down the page.
+  const fetchDataGridSampleDocuments = async function () {
+    setErrorMessage('');
+    setStatus(INDEX_STATUS.LOADING);
+
+    const esSearchRequest = {
+      index: indexPattern.title,
+      body: {
+        fields: ['*'],
+        _source: false,
+        query: {
+          function_score: {
+            query: { match_all: {} },
+            random_score: {},
+          },
+        },
+        size: 500,
+      },
+    };
+
+    const resp = await api.esSearch(esSearchRequest);
+
+    if (!isEsSearchResponse(resp)) {
+      setErrorMessage(getErrorMessage(resp));
+      setStatus(INDEX_STATUS.ERROR);
+      return;
+    }
+
+    const isCrossClusterSearch = indexPattern.title.includes(':');
+    const isMissingFields = resp.hits.hits.every((d) => typeof d.fields === 'undefined');
+
+    const docs = resp.hits.hits.map((d) => getProcessedFields(d.fields ?? {}));
+
+    // Get all field names for each returned doc and flatten it
+    // to a list of unique field names used across all docs.
+    const allKibanaIndexPatternFields = getFieldsFromKibanaIndexPattern(indexPattern);
+    const populatedFields = [...new Set(docs.map(Object.keys).flat(1))].filter((d) =>
+      allKibanaIndexPatternFields.includes(d)
+    );
+
+    setCcsWarning(isCrossClusterSearch && isMissingFields);
+    setStatus(INDEX_STATUS.LOADED);
+    setIndexPatternFields(populatedFields);
+  };
+
+  useEffect(() => {
+    fetchDataGridSampleDocuments();
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, []);

  const columns: EuiDataGridColumn[] = useMemo(() => {
+    if (typeof indexPatternFields === 'undefined') {
+      return [];
+    }
+
    let result: Array<{ id: string; schema: string | undefined }> = [];

    // Get the the runtime fields that are defined from API field and index patterns
--- a/x-pack/plugins/transform/public/app/hooks/use_pivot_data.ts
+++ b/x-pack/plugins/transform/public/app/hooks/use_pivot_data.ts
@ -11,12 +11,12 @@ import { useEffect, useMemo, useState } from 'react';
 import { EuiDataGridColumn } from '@elastic/eui';

 import { i18n } from '@kbn/i18n';
+import { getFlattenedObject } from '@kbn/std';

 import { ES_FIELD_TYPES } from '../../../../../../src/plugins/data/common';

 import type { PreviewMappingsProperties } from '../../../common/api_schemas/transforms';
 import { isPostTransformsPreviewResponseSchema } from '../../../common/api_schemas/type_guards';
-import { getNestedProperty } from '../../../common/utils/object_utils';

 import {
  RenderCellValue,
@ -159,13 +159,36 @@ export const usePivotData = (
      return;
    }

-    setTableItems(resp.preview);
-    setRowCount(resp.preview.length);
+    // To improve UI performance of a `latest` transform configuration on indices with a large
+    // number of fields, we reduce the available columns to those populated with values.
+
+    // 1. Flatten the nested object structure of the returned documents to match mapping properties
+    const docs = resp.preview.map(getFlattenedObject);
+
+    // 2. Get all field names for each returned doc and flatten it
+    //    to a list of unique field names used across all docs.
+    const populatedFields = [...new Set(docs.map(Object.keys).flat(1))];
+
+    // 3. Filter mapping properties by populated fields
+    const populatedProperties: PreviewMappingsProperties = Object.entries(
+      resp.generated_dest_index.mappings.properties
+    )
+      .filter(([key]) => populatedFields.includes(key))
+      .reduce(
+        (p, [key, value]) => ({
+          ...p,
+          [key]: value,
+        }),
+        {}
+      );
+
+    setTableItems(docs);
+    setRowCount(docs.length);
    setRowCountRelation(ES_CLIENT_TOTAL_HITS_RELATION.EQ);
-    setPreviewMappingsProperties(resp.generated_dest_index.mappings.properties);
+    setPreviewMappingsProperties(populatedProperties);
    setStatus(INDEX_STATUS.LOADED);

-    if (resp.preview.length === 0) {
+    if (docs.length === 0) {
      setNoDataMessage(
        i18n.translate('xpack.transform.pivotPreview.PivotPreviewNoDataCalloutBody', {
          defaultMessage:
@ -201,7 +224,7 @@ export const usePivotData = (
      const adjustedRowIndex = rowIndex - pagination.pageIndex * pagination.pageSize;

      const cellValue = pageData.hasOwnProperty(adjustedRowIndex)
-        ? getNestedProperty(pageData[adjustedRowIndex], columnId, null)
+        ? pageData[adjustedRowIndex][columnId] ?? null
        : null;

      if (typeof cellValue === 'object' && cellValue !== null) {
--- a/x-pack/test/functional/apps/transform/creation_index_pattern.ts
+++ b/x-pack/test/functional/apps/transform/creation_index_pattern.ts
@ -166,11 +166,6 @@ export default function ({ getService }: FtrProviderContext) {
                { color: '#54B399', percentage: 90 },
              ],
            },
-            {
-              chartAvailable: false,
-              id: 'customer_birth_date',
-              legend: '0 documents contain field.',
-            },
            { chartAvailable: false, id: 'customer_first_name', legend: 'Chart not supported.' },
            { chartAvailable: false, id: 'customer_full_name', legend: 'Chart not supported.' },
            {
@ -210,6 +205,15 @@ export default function ({ getService }: FtrProviderContext) {
                { color: '#54B399', percentage: 75 },
              ],
            },
+            {
+              chartAvailable: true,
+              id: 'day_of_week_i',
+              legend: '0 - 6',
+              colorStats: [
+                { color: '#000000', percentage: 20 },
+                { color: '#54B399', percentage: 75 },
+              ],
+            },
          ],
          discoverQueryHits: '7,270',
        },
@ -296,7 +300,6 @@ export default function ({ getService }: FtrProviderContext) {
            columns: 10,
            rows: 5,
          },
-          histogramCharts: [],
          discoverQueryHits: '10',
        },
      } as PivotTransformTestData,
@ -336,7 +339,6 @@ export default function ({ getService }: FtrProviderContext) {
            columns: 10,
            rows: 5,
          },
-          histogramCharts: [],
          transformPreview: {
            column: 0,
            values: [
@ -404,10 +406,14 @@ export default function ({ getService }: FtrProviderContext) {
          await transform.testExecution.logTestStep('enables the index preview histogram charts');
          await transform.wizard.enableIndexPreviewHistogramCharts(true);

-          await transform.testExecution.logTestStep('displays the index preview histogram charts');
-          await transform.wizard.assertIndexPreviewHistogramCharts(
-            testData.expected.histogramCharts
-          );
+          if (Array.isArray(testData.expected.histogramCharts)) {
+            await transform.testExecution.logTestStep(
+              'displays the index preview histogram charts'
+            );
+            await transform.wizard.assertIndexPreviewHistogramCharts(
+              testData.expected.histogramCharts
+            );
+          }

          if (isPivotTransformTestData(testData)) {
            await transform.testExecution.logTestStep('adds the group by entries');
--- a/x-pack/test/functional/apps/transform/creation_runtime_mappings.ts
+++ b/x-pack/test/functional/apps/transform/creation_runtime_mappings.ts
@ -46,14 +46,37 @@ export default function ({ getService }: FtrProviderContext) {
      await transform.api.cleanTransformIndices();
    });

-    // Only testing that histogram charts are available for runtime fields here
    const histogramCharts: HistogramCharts = [
+      {
+        // Skipping colorStats assertion for this chart,
+        // results can be quite different on each run because of sampling.
+        chartAvailable: true,
+        id: '@timestamp',
+      },
+      { chartAvailable: false, id: '@version', legend: 'Chart not supported.' },
+      {
+        chartAvailable: true,
+        id: 'airline',
+        legend: '19 categories',
+        colorStats: [
+          { color: '#000000', percentage: 49 },
+          { color: '#54B399', percentage: 41 },
+        ],
+      },
+      {
+        chartAvailable: true,
+        id: 'responsetime',
+        colorStats: [
+          { color: '#54B399', percentage: 5 },
+          { color: '#000000', percentage: 95 },
+        ],
+      },
      {
        chartAvailable: true,
        id: 'rt_airline_lower',
        legend: '19 categories',
        colorStats: [
-          { color: '#000000', percentage: 48 },
+          { color: '#000000', percentage: 49 },
          { color: '#54B399', percentage: 41 },
        ],
      },
@ -65,6 +88,7 @@ export default function ({ getService }: FtrProviderContext) {
          { color: '#000000', percentage: 95 },
        ],
      },
+      { chartAvailable: false, id: 'type', legend: 'Chart not supported.' },
    ];

    const testDataList: Array<PivotTransformTestData | LatestTransformTestData> = [
--- a/x-pack/test/functional/services/transform/wizard.ts
+++ b/x-pack/test/functional/services/transform/wizard.ts
@ -237,6 +237,15 @@ export function TransformWizardProvider({ getService, getPageObjects }: FtrProvi
      // For each chart, get the content of each header cell and assert
      // the legend text and column id and if the chart should be present or not.
      await retry.tryForTime(5000, async () => {
+        const table = await testSubjects.find(`~transformIndexPreview`);
+        const $ = await table.parseDomContent();
+        const actualColumnLength = $('.euiDataGridHeaderCell__content').toArray().length;
+
+        expect(actualColumnLength).to.eql(
+          expectedHistogramCharts.length,
+          `Number of index preview column charts should be '${expectedHistogramCharts.length}' (got '${actualColumnLength}')`
+        );
+
        for (const expected of expectedHistogramCharts.values()) {
          const id = expected.id;
          await testSubjects.existOrFail(`mlDataGridChart-${id}`);