[ML] Telemetry for the Anomaly detection jobs health rule type (#110052)

* [ML] add mappings for the new rule type
* [ML] add telemetry for enabled health checks
* [ML] update xpack_plugins.json
2021-08-25 17:52:06 +02:00 · 2021-08-25 17:52:06 +02:00 · d66397cfe4
parent 41f7b429d1
commit d66397cfe4
3 changed files with 135 additions and 0 deletions
--- a/x-pack/plugins/alerting/server/usage/alerts_usage_collector.ts
+++ b/x-pack/plugins/alerting/server/usage/alerts_usage_collector.ts
@ -46,6 +46,7 @@ const byTypeSchema: MakeSchemaFrom<AlertsUsage>['count_by_type'] = {
  '__geo-containment': { type: 'long' },
  // ML
  xpack_ml_anomaly_detection_alert: { type: 'long' },
+  xpack_ml_anomaly_detection_jobs_health: { type: 'long' },
 };

 export function createAlertsUsageCollector(
--- a/x-pack/plugins/ml/server/usage/collector.ts
+++ b/x-pack/plugins/ml/server/usage/collector.ts
@ -8,6 +8,8 @@
 import type { UsageCollectionSetup } from '../../../../../src/plugins/usage_collection/server';
 import { ML_ALERT_TYPES } from '../../common/constants/alerts';
 import { AnomalyResultType } from '../../common/types/anomalies';
+import { MlAnomalyDetectionJobsHealthRuleParams } from '../../common/types/alerts';
+import { getResultJobsHealthRuleConfig } from '../../common/util/alerts';

 export interface MlUsageData {
  alertRules: {
@ -18,6 +20,14 @@ export interface MlUsageData {
        influencer: number;
      };
    };
+    'xpack.ml.anomaly_detection_jobs_health': {
+      count_by_check_type: {
+        datafeed: number;
+        mml: number;
+        delayedData: number;
+        errorMessages: number;
+      };
+    };
  };
 }

@ -42,6 +52,38 @@ export function registerCollector(usageCollection: UsageCollectionSetup, kibanaI
            },
          },
        },
+        'xpack.ml.anomaly_detection_jobs_health': {
+          count_by_check_type: {
+            datafeed: {
+              type: 'long',
+              _meta: {
+                description:
+                  'total number of alerting rules performing the not started datafeed health check',
+              },
+            },
+            mml: {
+              type: 'long',
+              _meta: {
+                description:
+                  'total number of alerting rules performing the model memory limit health check',
+              },
+            },
+            delayedData: {
+              type: 'long',
+              _meta: {
+                description:
+                  'total number of alerting rules performing the delayed data health check',
+              },
+            },
+            errorMessages: {
+              type: 'long',
+              _meta: {
+                description:
+                  'total number of alerting rules performing the error messages health check',
+              },
+            },
+          },
+        },
      },
    },
    isReady: () => !!kibanaIndex,
@ -86,11 +128,65 @@ export function registerCollector(usageCollection: UsageCollectionSetup, kibanaI
        return acc;
      }, {} as MlUsageData['alertRules'][typeof ML_ALERT_TYPES.ANOMALY_DETECTION]['count_by_result_type']);

+      const jobsHealthRuleInstances = await esClient.search<{
+        alert: {
+          params: MlAnomalyDetectionJobsHealthRuleParams;
+        };
+      }>({
+        index: kibanaIndex,
+        size: 10000,
+        body: {
+          query: {
+            bool: {
+              filter: [
+                { term: { type: 'alert' } },
+                {
+                  term: {
+                    'alert.alertTypeId': ML_ALERT_TYPES.AD_JOBS_HEALTH,
+                  },
+                },
+              ],
+            },
+          },
+        },
+      });
+
+      const resultsByCheckType = jobsHealthRuleInstances.body.hits.hits.reduce(
+        (acc, curr) => {
+          const doc = curr._source;
+          if (!doc) return acc;
+
+          const {
+            alert: {
+              params: { testsConfig },
+            },
+          } = doc;
+
+          const resultConfig = getResultJobsHealthRuleConfig(testsConfig);
+
+          acc.datafeed += resultConfig.datafeed.enabled ? 1 : 0;
+          acc.mml += resultConfig.mml.enabled ? 1 : 0;
+          acc.delayedData += resultConfig.delayedData.enabled ? 1 : 0;
+          acc.errorMessages += resultConfig.errorMessages.enabled ? 1 : 0;
+
+          return acc;
+        },
+        {
+          datafeed: 0,
+          mml: 0,
+          delayedData: 0,
+          errorMessages: 0,
+        }
+      );
+
      return {
        alertRules: {
          [ML_ALERT_TYPES.ANOMALY_DETECTION]: {
            count_by_result_type: countByResultType,
          },
+          [ML_ALERT_TYPES.AD_JOBS_HEALTH]: {
+            count_by_check_type: resultsByCheckType,
+          },
        },
      };
    },
--- a/x-pack/plugins/telemetry_collection_xpack/schema/xpack_plugins.json
+++ b/x-pack/plugins/telemetry_collection_xpack/schema/xpack_plugins.json
@ -228,6 +228,9 @@
            },
            "xpack_ml_anomaly_detection_alert": {
              "type": "long"
+            },
+            "xpack_ml_anomaly_detection_jobs_health": {
+              "type": "long"
            }
          }
        },
@ -307,6 +310,9 @@
            },
            "xpack_ml_anomaly_detection_alert": {
              "type": "long"
+            },
+            "xpack_ml_anomaly_detection_jobs_health": {
+              "type": "long"
            }
          }
        }
@ -3804,6 +3810,38 @@
                  }
                }
              }
+            },
+            "xpack.ml.anomaly_detection_jobs_health": {
+              "properties": {
+                "count_by_check_type": {
+                  "properties": {
+                    "datafeed": {
+                      "type": "long",
+                      "_meta": {
+                        "description": "total number of alerting rules performing the not started datafeed health check"
+                      }
+                    },
+                    "mml": {
+                      "type": "long",
+                      "_meta": {
+                        "description": "total number of alerting rules performing the model memory limit health check"
+                      }
+                    },
+                    "delayedData": {
+                      "type": "long",
+                      "_meta": {
+                        "description": "total number of alerting rules performing the delayed data health check"
+                      }
+                    },
+                    "errorMessages": {
+                      "type": "long",
+                      "_meta": {
+                        "description": "total number of alerting rules performing the error messages health check"
+                      }
+                    }
+                  }
+                }
+              }
            }
          }
        }