[Task Manager] Fixed the task claiming function failing when inline scripts are disabled. (#94870)

* [Task Manager] Fixed the behavior of the claiming tasks function failing when inline scripts are disabled. * added docs * fixed test * added tests * fixed due to comments * Fixed docs due to comments * extended TM configuration changes message with the possible errors description
2021-03-26 10:46:33 -07:00 · 2021-03-26 10:46:33 -07:00 · e31ef5235e
parent 8101419cb9
commit e31ef5235e
8 changed files with 185 additions and 53 deletions
--- a/docs/user/production-considerations/task-manager-troubleshooting.asciidoc
+++ b/docs/user/production-considerations/task-manager-troubleshooting.asciidoc
@ -706,3 +706,21 @@ These rough calculations give you a lower bound to the required throughput, whic
 Given these inferred attributes, it would be safe to assume that a single {kib} instance with default settings **would not** provide the required throughput. It is possible that scaling horizontally by adding a couple more {kib} instances will.

 For details on scaling Task Manager, see <<task-manager-scaling-guidance>>.
+
+[float]
+[[task-manager-cannot-operate-when-inline-scripts-are-disabled]]
+==== Inline scripts are disabled in {es}
+
+*Problem*:
+
+Tasks are not running, and the server logs contain the following warning message:
+
+[source, txt]
+--------------------------------------------------
+[warning][plugins][taskManager] Task Manager cannot operate when inline scripts are disabled in Elasticsearch
+--------------------------------------------------
+
+*Solution*:
+
+Inline scripts are a hard requirement for Task Manager to function.
+To enable inline scripting, see the {es} documentation for {ref}/modules-scripting-security.html#allowed-script-types-setting[configuring the allowed script types setting].
--- a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts
+++ b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts
@ -6,7 +6,10 @@
 */

 import sinon from 'sinon';
-import { savedObjectsRepositoryMock } from '../../../../../src/core/server/mocks';
+import {
+  elasticsearchServiceMock,
+  savedObjectsRepositoryMock,
+} from '../../../../../src/core/server/mocks';
 import { SavedObjectsErrorHelpers, Logger } from '../../../../../src/core/server';
 import { ADJUST_THROUGHPUT_INTERVAL } from '../lib/create_managed_configuration';
 import { TaskManagerPlugin, TaskManagerStartContract } from '../plugin';
@ -19,6 +22,7 @@ describe('managed configuration', () => {

  let clock: sinon.SinonFakeTimers;
  const savedObjectsClient = savedObjectsRepositoryMock.create();
+  const esStart = elasticsearchServiceMock.createStart();

  beforeEach(async () => {
    jest.resetAllMocks();
@ -55,6 +59,7 @@ describe('managed configuration', () => {
    });

    const coreStart = coreMock.createStart();
+    coreStart.elasticsearch = esStart;
    coreStart.savedObjects.createInternalRepository.mockReturnValue(savedObjectsClient);
    taskManagerStart = await taskManager.start(coreStart);

@ -81,10 +86,10 @@ describe('managed configuration', () => {
    clock.tick(ADJUST_THROUGHPUT_INTERVAL);

    expect(logger.warn).toHaveBeenCalledWith(
-      'Max workers configuration is temporarily reduced after Elasticsearch returned 1 "too many request" error(s).'
+      'Max workers configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).'
    );
    expect(logger.debug).toHaveBeenCalledWith(
-      'Max workers configuration changing from 10 to 8 after seeing 1 error(s)'
+      'Max workers configuration changing from 10 to 8 after seeing 1 "too many request" and/or "execute [inline] script" error(s)'
    );
    expect(logger.debug).toHaveBeenCalledWith('Task pool now using 10 as the max worker value');
  });
@ -105,10 +110,57 @@ describe('managed configuration', () => {
    clock.tick(ADJUST_THROUGHPUT_INTERVAL);

    expect(logger.warn).toHaveBeenCalledWith(
-      'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" error(s).'
+      'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).'
    );
    expect(logger.debug).toHaveBeenCalledWith(
-      'Poll interval configuration changing from 3000 to 3600 after seeing 1 error(s)'
+      'Poll interval configuration changing from 3000 to 3600 after seeing 1 "too many request" and/or "execute [inline] script" error(s)'
+    );
+    expect(logger.debug).toHaveBeenCalledWith('Task poller now using interval of 3600ms');
+  });
+
+  test('should lower max workers when Elasticsearch returns "cannot execute [inline] scripts" error', async () => {
+    esStart
+      .createClient('taskManager')
+      .asInternalUser.search.mockRejectedValueOnce(
+        elasticsearchServiceMock.createErrorTransportRequestPromise(
+          new Error('cannot execute [inline] scripts" error')
+        )
+      );
+
+    await expect(taskManagerStart.fetch({})).rejects.toThrowErrorMatchingInlineSnapshot(
+      `"cannot execute [inline] scripts" error"`
+    );
+    clock.tick(ADJUST_THROUGHPUT_INTERVAL);
+
+    expect(logger.warn).toHaveBeenCalledWith(
+      'Max workers configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).'
+    );
+    expect(logger.debug).toHaveBeenCalledWith(
+      'Max workers configuration changing from 10 to 8 after seeing 1 "too many request" and/or "execute [inline] script" error(s)'
+    );
+    expect(logger.debug).toHaveBeenCalledWith('Task pool now using 10 as the max worker value');
+  });
+
+  test('should increase poll interval when Elasticsearch returns "cannot execute [inline] scripts" error', async () => {
+    esStart
+      .createClient('taskManager')
+      .asInternalUser.search.mockRejectedValueOnce(
+        elasticsearchServiceMock.createErrorTransportRequestPromise(
+          new Error('cannot execute [inline] scripts" error')
+        )
+      );
+
+    await expect(taskManagerStart.fetch({})).rejects.toThrowErrorMatchingInlineSnapshot(
+      `"cannot execute [inline] scripts" error"`
+    );
+
+    clock.tick(ADJUST_THROUGHPUT_INTERVAL);
+
+    expect(logger.warn).toHaveBeenCalledWith(
+      'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).'
+    );
+    expect(logger.debug).toHaveBeenCalledWith(
+      'Poll interval configuration changing from 3000 to 3600 after seeing 1 "too many request" and/or "execute [inline] script" error(s)'
    );
    expect(logger.debug).toHaveBeenCalledWith('Task poller now using interval of 3600ms');
  });
--- a/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts
+++ b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts
@ -96,7 +96,7 @@ describe('createManagedConfiguration()', () => {
      errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b'));
      clock.tick(ADJUST_THROUGHPUT_INTERVAL);
      expect(logger.warn).toHaveBeenCalledWith(
-        'Max workers configuration is temporarily reduced after Elasticsearch returned 1 "too many request" error(s).'
+        'Max workers configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).'
      );
    });

@ -180,7 +180,7 @@ describe('createManagedConfiguration()', () => {
      errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b'));
      clock.tick(ADJUST_THROUGHPUT_INTERVAL);
      expect(logger.warn).toHaveBeenCalledWith(
-        'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" error(s).'
+        'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).'
      );
    });

--- a/x-pack/plugins/task_manager/server/lib/create_managed_configuration.ts
+++ b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.ts
@ -9,6 +9,7 @@ import { interval, merge, of, Observable } from 'rxjs';
 import { filter, mergeScan, map, scan, distinctUntilChanged, startWith } from 'rxjs/operators';
 import { SavedObjectsErrorHelpers } from '../../../../../src/core/server';
 import { Logger } from '../../../../../src/core/server';
+import { isEsCannotExecuteScriptError } from './identify_es_error';

 const FLUSH_MARKER = Symbol('flush');
 export const ADJUST_THROUGHPUT_INTERVAL = 10 * 1000;
@ -76,11 +77,11 @@ function createMaxWorkersScan(logger: Logger, startingMaxWorkers: number) {
    }
    if (newMaxWorkers !== previousMaxWorkers) {
      logger.debug(
-        `Max workers configuration changing from ${previousMaxWorkers} to ${newMaxWorkers} after seeing ${errorCount} error(s)`
+        `Max workers configuration changing from ${previousMaxWorkers} to ${newMaxWorkers} after seeing ${errorCount} "too many request" and/or "execute [inline] script" error(s)`
      );
      if (previousMaxWorkers === startingMaxWorkers) {
        logger.warn(
-          `Max workers configuration is temporarily reduced after Elasticsearch returned ${errorCount} "too many request" error(s).`
+          `Max workers configuration is temporarily reduced after Elasticsearch returned ${errorCount} "too many request" and/or "execute [inline] script" error(s).`
        );
      }
    }
@ -105,11 +106,11 @@ function createPollIntervalScan(logger: Logger, startingPollInterval: number) {
    }
    if (newPollInterval !== previousPollInterval) {
      logger.debug(
-        `Poll interval configuration changing from ${previousPollInterval} to ${newPollInterval} after seeing ${errorCount} error(s)`
+        `Poll interval configuration changing from ${previousPollInterval} to ${newPollInterval} after seeing ${errorCount} "too many request" and/or "execute [inline] script" error(s)`
      );
      if (previousPollInterval === startingPollInterval) {
        logger.warn(
-          `Poll interval configuration is temporarily increased after Elasticsearch returned ${errorCount} "too many request" error(s).`
+          `Poll interval configuration is temporarily increased after Elasticsearch returned ${errorCount} "too many request" and/or "execute [inline] script" error(s).`
        );
      }
    }
@ -121,7 +122,11 @@ function countErrors(errors$: Observable<Error>, countInterval: number): Observa
  return merge(
    // Flush error count at fixed interval
    interval(countInterval).pipe(map(() => FLUSH_MARKER)),
-    errors$.pipe(filter((e) => SavedObjectsErrorHelpers.isTooManyRequestsError(e)))
+    errors$.pipe(
+      filter(
+        (e) => SavedObjectsErrorHelpers.isTooManyRequestsError(e) || isEsCannotExecuteScriptError(e)
+      )
+    )
  ).pipe(
    // When tag is "flush", reset the error counter
    // Otherwise increment the error counter
--- a/x-pack/plugins/task_manager/server/lib/identify_es_error.test.ts
+++ b/x-pack/plugins/task_manager/server/lib/identify_es_error.test.ts
@ -137,33 +137,32 @@ function generateESErrorWithResponse(
  rootCause: ESErrorCausedBy[] = [],
  causeBy: ESErrorCausedBy = {}
 ) {
-  return Object.assign(new Error(), {
-    msg: '[illegal_argument_exception] cannot execute [inline] scripts',
-    path: '/.kibana_task_manager/_update_by_query',
-    query: {},
-    body: '{"query":{}}',
-    statusCode: 400,
-    response: JSON.stringify({
-      error: {
-        root_cause: rootCause,
-        type: 'search_phase_execution_exception',
-        reason: 'all shards failed',
-        phase: 'query',
-        grouped: true,
-        failed_shards: [
-          {
-            shard: 0,
-            index: '.kibana_task_manager_1',
-            node: '24A4QbjHSK6prvtopAKLKw',
-            reason: {
-              type: 'illegal_argument_exception',
-              reason: 'cannot execute [inline] scripts',
+  return {
+    name: 'ResponseError',
+    meta: {
+      body: {
+        error: {
+          root_cause: rootCause,
+          type: 'search_phase_execution_exception',
+          reason: 'all shards failed',
+          phase: 'query',
+          grouped: true,
+          failed_shards: [
+            {
+              shard: 0,
+              index: '.kibana_task_manager_8.0.0_001',
+              node: 'GJ7ekIWTT56-h-aC6Y89Gw',
+              reason: {
+                type: 'illegal_argument_exception',
+                reason: 'cannot execute [inline] scripts',
+              },
            },
-          },
-        ],
-        caused_by: causeBy,
+          ],
+          caused_by: causeBy,
+        },
+        status: 400,
      },
-      status: 400,
-    }),
-  });
+      statusCode: 400,
+    },
+  };
 }
--- a/x-pack/plugins/task_manager/server/lib/identify_es_error.ts
+++ b/x-pack/plugins/task_manager/server/lib/identify_es_error.ts
@ -16,13 +16,27 @@ export interface ESError {
  caused_by?: ESErrorCausedBy;
 }

+export interface ESErrorBody {
+  error?: ESError;
+  status?: number;
+}
+
+export interface ESErrorMeta {
+  body?: ESErrorBody;
+  statusCode?: number;
+}
+export interface ElasticsearchResponseError {
+  name?: string;
+  meta?: ESErrorMeta;
+}
+
 function extractCausedByChain(
  causedBy: ESErrorCausedBy = {},
  accumulator: string[] = []
 ): string[] {
  const { reason, caused_by: innerCausedBy } = causedBy;

-  if (reason) {
+  if (reason && !accumulator.includes(reason)) {
    accumulator.push(reason);
  }

@ -39,11 +53,15 @@ function extractCausedByChain(
 * @param err Object Error thrown by ES JS client
 * @return ES error cause
 */
-export function identifyEsError(err: { response: string }) {
-  const { response } = err;
-
+export function identifyEsError(err: ElasticsearchResponseError) {
+  if (!err.meta) {
+    return [];
+  }
+  const {
+    meta: { body: response },
+  } = err;
  if (response) {
-    const { error } = JSON.parse(response) as { error?: ESError };
+    const { error } = response;
    if (error) {
      const { root_cause: rootCause = [], caused_by: causedBy } = error;

@ -58,3 +76,7 @@ export function identifyEsError(err: { response: string }) {
  }
  return [];
 }
+
+export function isEsCannotExecuteScriptError(err: ElasticsearchResponseError): boolean {
+  return identifyEsError(err).includes('cannot execute [inline] scripts');
+}
--- a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts
+++ b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts
@ -19,6 +19,7 @@ import { TaskClaiming, ClaimOwnershipResult } from './queries/task_claiming';
 import type { TaskClaiming as TaskClaimingClass } from './queries/task_claiming';
 import { asOk, Err, isErr, isOk, Result } from './lib/result_type';
 import { FillPoolResult } from './lib/fill_pool';
+import { ElasticsearchResponseError } from './lib/identify_es_error';

 let mockTaskClaiming = taskClaimingMock.create({});
 jest.mock('./queries/task_claiming', () => {
@ -204,12 +205,46 @@ describe('TaskPollingLifecycle', () => {
      taskClaiming.claimAvailableTasksIfCapacityIsAvailable.mockImplementation(
        () =>
          new Observable<Result<ClaimOwnershipResult, FillPoolResult>>((observer) => {
-            observer.error(
-              Object.assign(new Error(), {
-                response:
-                  '{"error":{"root_cause":[{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts"}],"type":"search_phase_execution_exception","reason":"all shards failed","phase":"query","grouped":true,"failed_shards":[{"shard":0,"index":".kibana_task_manager_1","node":"24A4QbjHSK6prvtopAKLKw","reason":{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts"}}],"caused_by":{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts","caused_by":{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts"}}},"status":400}',
-              })
-            );
+            observer.error({
+              name: 'ResponseError',
+              meta: {
+                body: {
+                  error: {
+                    root_cause: [
+                      {
+                        type: 'illegal_argument_exception',
+                        reason: 'cannot execute [inline] scripts',
+                      },
+                    ],
+                    type: 'search_phase_execution_exception',
+                    reason: 'all shards failed',
+                    phase: 'query',
+                    grouped: true,
+                    failed_shards: [
+                      {
+                        shard: 0,
+                        index: '.kibana_task_manager_1',
+                        node: '24A4QbjHSK6prvtopAKLKw',
+                        reason: {
+                          type: 'illegal_argument_exception',
+                          reason: 'cannot execute [inline] scripts',
+                        },
+                      },
+                    ],
+                    caused_by: {
+                      type: 'illegal_argument_exception',
+                      reason: 'cannot execute [inline] scripts',
+                      caused_by: {
+                        type: 'illegal_argument_exception',
+                        reason: 'cannot execute [inline] scripts',
+                      },
+                    },
+                  },
+                  status: 400,
+                },
+              },
+              statusCode: 400,
+            } as ElasticsearchResponseError);
          })
      );

--- a/x-pack/plugins/task_manager/server/polling_lifecycle.ts
+++ b/x-pack/plugins/task_manager/server/polling_lifecycle.ts
@ -39,7 +39,7 @@ import {
 import { TaskPool } from './task_pool';
 import { TaskManagerRunner, TaskRunner } from './task_running';
 import { TaskStore } from './task_store';
-import { identifyEsError } from './lib/identify_es_error';
+import { identifyEsError, isEsCannotExecuteScriptError } from './lib/identify_es_error';
 import { BufferedTaskStore } from './buffered_task_store';
 import { TaskTypeDictionary } from './task_type_dictionary';
 import { delayOnClaimConflicts } from './polling';
@ -299,15 +299,16 @@ export function claimAvailableTasks(
          // we can identify the reason
          // if we can - we emit an FillPoolResult error rather than erroring out the wrapping Observable
          // returned by `claimAvailableTasks`
-          if (identifyEsError(ex).includes('cannot execute [inline] scripts')) {
+          if (isEsCannotExecuteScriptError(ex)) {
            logger.warn(
              `Task Manager cannot operate when inline scripts are disabled in Elasticsearch`
            );
            observer.next(asErr(FillPoolResult.Failed));
            observer.complete();
          } else {
+            const esError = identifyEsError(ex);
            // as we could't identify the reason - we'll error out the wrapping Observable too
-            observer.error(ex);
+            observer.error(esError.length > 0 ? esError : ex);
          }
        },
        () => {