[Monitoring] Only do a single date_histogram agg for get_nodes calls (#43481)

* I think this is working now

* Add a way to uncovert, and then fix tests

* Remove unnecessary export
This commit is contained in:
Chris Roberson 2019-08-27 14:13:17 -04:00 committed by GitHub
parent a5c63537a2
commit 3489274ce6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 3082 additions and 2491 deletions

View file

@ -0,0 +1,76 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
import { cloneDeep } from 'lodash';
import { LISTING_METRICS_NAMES } from './nodes/get_nodes/nodes_listing_metrics';
// Explicit marker prepended to every converted aggregation name. Having a fixed, recognizable
// prefix lets us find the converted aggregations in the response and strip the prefix again later
// (see `convertMetricNames` and `uncovertMetricNames`).
const CONVERTED_TOKEN = `odh_`;
/**
 * Background: https://github.com/elastic/kibana/issues/43477
 *
 * The `get_nodes` function used to wrap every metric aggregation in its own `date_histogram`,
 * so the overall request looked like:
 *   `terms` agg -> [`date_histogram` -> metric agg]x6
 * Each additional `date_histogram` creates another full set of search buckets, which is wasteful
 * and pushes users into the `search.max_buckets` ceiling sooner than necessary.
 *
 * The cheaper shape is one `date_histogram` with all of the metric aggregations as its direct
 * sub aggregations. The code that builds the metric aggregations is shared with the rest of the
 * monitoring code base and cannot easily be changed, so instead the aggregation names it produces
 * are rewritten here to be unique per metric: every key becomes `odh_<prefix>__<key>`, and the
 * `buckets_path` of a derivative aggregation is rewritten the same way so it keeps pointing at
 * its (renamed) sibling.
 *
 * @param {String} prefix: Aggregation name prefix (the metric name); the original aggregation name follows it
 * @param {Object} metricObj: The metric aggregations, keyed by aggregation name
 * @return {Object} New object with renamed keys and deep-cloned values; `metricObj` is not modified
 */
export function convertMetricNames(prefix, metricObj) {
  const namespace = `${CONVERTED_TOKEN}${prefix}__`;
  const converted = {};

  for (const [aggName, aggBody] of Object.entries(metricObj)) {
    // Work on a copy so the shared metric definitions are never mutated
    const aggCopy = cloneDeep(aggBody);

    // A derivative references its sibling by name, so the reference has to be renamed as well
    if (aggName.includes('_deriv') && aggCopy.derivative) {
      aggCopy.derivative.buckets_path = `${namespace}${aggCopy.derivative.buckets_path}`;
    }

    converted[`${namespace}${aggName}`] = aggCopy;
  }

  return converted;
}
/**
 * Counterpart of `convertMetricNames`: the aggregation names were changed so that a single
 * `date_histogram` can carry every metric, but the code that reads the response is shared with
 * the rest of the monitoring code base and still expects the original names and the original
 * "one set of date buckets per metric" layout. This function rebuilds that layout from the
 * single `date_histogram` response. That is possible because the converted aggregation name
 * still contains the original name, after the `odh_<metricName>__` prefix.
 *
 * For every name in `LISTING_METRICS_NAMES` the result holds a `buckets` array with one entry
 * per date bucket, containing `key_as_string`, `key`, `doc_count` and only the aggregations
 * that belong to that metric, under their original (unprefixed) names.
 *
 * @param {Object} byDateBucketResponse: The response object from the single `date_histogram` bucket
 * @return {Object} Object keyed by metric name, each value shaped like `{ buckets: [...] }`.
 *  If the response or its `buckets` is missing, every metric gets an empty `buckets` array.
 */
export function uncovertMetricNames(byDateBucketResponse) {
  // Tolerate a missing `by_date` aggregation instead of throwing on `.buckets`
  const dateBuckets = (byDateBucketResponse && byDateBucketResponse.buckets) || [];
  const unconverted = {};

  for (const metricName of LISTING_METRICS_NAMES) {
    // Include the `__` separator in the match; otherwise a metric whose name is a prefix of
    // another metric's name would also pick up the other metric's aggregations.
    const convertedPrefix = `${CONVERTED_TOKEN}${metricName}__`;

    unconverted[metricName] = {
      buckets: dateBuckets.map(bucket => {
        const { key_as_string, key, doc_count, ...convertedAggs } = bucket; /* eslint-disable-line camelcase */

        const metrics = {};
        for (const [aggName, aggValue] of Object.entries(convertedAggs)) {
          if (aggName.startsWith(convertedPrefix)) {
            // Strip exactly the prefix so original names that contain `__` survive intact
            metrics[aggName.slice(convertedPrefix.length)] = aggValue;
          }
        }

        return {
          key_as_string, /* eslint-disable-line camelcase */
          key,
          doc_count, /* eslint-disable-line camelcase */
          ...metrics,
        };
      }),
    };
  }

  return unconverted;
}

View file

@ -2,44 +2,26 @@
exports[`get metric aggs should create aggregations for "basic" metrics 1`] = `
Object {
"node_cpu_utilization": Object {
"aggs": Object {
"metric": Object {
"max": Object {
"field": "node_stats.process.cpu.percent",
},
},
"metric_deriv": Object {
"derivative": Object {
"buckets_path": "metric",
"unit": "1s",
},
},
},
"date_histogram": Object {
"field": "timestamp",
"fixed_interval": "30s",
"min_doc_count": 1,
"odh_node_cpu_utilization__metric": Object {
"max": Object {
"field": "node_stats.process.cpu.percent",
},
},
"node_jvm_mem_percent": Object {
"aggs": Object {
"metric": Object {
"max": Object {
"field": "node_stats.jvm.mem.heap_used_percent",
},
},
"metric_deriv": Object {
"derivative": Object {
"buckets_path": "metric",
"unit": "1s",
},
},
"odh_node_cpu_utilization__metric_deriv": Object {
"derivative": Object {
"buckets_path": "odh_node_cpu_utilization__metric",
"unit": "1s",
},
"date_histogram": Object {
"field": "timestamp",
"fixed_interval": "30s",
"min_doc_count": 1,
},
"odh_node_jvm_mem_percent__metric": Object {
"max": Object {
"field": "node_stats.jvm.mem.heap_used_percent",
},
},
"odh_node_jvm_mem_percent__metric_deriv": Object {
"derivative": Object {
"buckets_path": "odh_node_jvm_mem_percent__metric",
"unit": "1s",
},
},
}
@ -47,70 +29,52 @@ Object {
exports[`get metric aggs should incorporate a metric custom aggs 1`] = `
Object {
"node_index_latency": Object {
"aggs": Object {
"event_time_in_millis": Object {
"max": Object {
"field": "node_stats.indices.indexing.index_time_in_millis",
},
},
"event_time_in_millis_deriv": Object {
"derivative": Object {
"buckets_path": "event_time_in_millis",
"gap_policy": "skip",
"unit": "1s",
},
},
"event_total": Object {
"max": Object {
"field": "node_stats.indices.indexing.index_total",
},
},
"event_total_deriv": Object {
"derivative": Object {
"buckets_path": "event_total",
"gap_policy": "skip",
"unit": "1s",
},
},
},
"date_histogram": Object {
"field": "timestamp",
"fixed_interval": "30s",
"min_doc_count": 1,
"odh_node_index_latency__event_time_in_millis": Object {
"max": Object {
"field": "node_stats.indices.indexing.index_time_in_millis",
},
},
"node_query_latency": Object {
"aggs": Object {
"event_time_in_millis": Object {
"max": Object {
"field": "node_stats.indices.search.query_time_in_millis",
},
},
"event_time_in_millis_deriv": Object {
"derivative": Object {
"buckets_path": "event_time_in_millis",
"gap_policy": "skip",
"unit": "1s",
},
},
"event_total": Object {
"max": Object {
"field": "node_stats.indices.search.query_total",
},
},
"event_total_deriv": Object {
"derivative": Object {
"buckets_path": "event_total",
"gap_policy": "skip",
"unit": "1s",
},
},
"odh_node_index_latency__event_time_in_millis_deriv": Object {
"derivative": Object {
"buckets_path": "odh_node_index_latency__event_time_in_millis",
"gap_policy": "skip",
"unit": "1s",
},
"date_histogram": Object {
"field": "timestamp",
"fixed_interval": "30s",
"min_doc_count": 1,
},
"odh_node_index_latency__event_total": Object {
"max": Object {
"field": "node_stats.indices.indexing.index_total",
},
},
"odh_node_index_latency__event_total_deriv": Object {
"derivative": Object {
"buckets_path": "odh_node_index_latency__event_total",
"gap_policy": "skip",
"unit": "1s",
},
},
"odh_node_query_latency__event_time_in_millis": Object {
"max": Object {
"field": "node_stats.indices.search.query_time_in_millis",
},
},
"odh_node_query_latency__event_time_in_millis_deriv": Object {
"derivative": Object {
"buckets_path": "odh_node_query_latency__event_time_in_millis",
"gap_policy": "skip",
"unit": "1s",
},
},
"odh_node_query_latency__event_total": Object {
"max": Object {
"field": "node_stats.indices.search.query_total",
},
},
"odh_node_query_latency__event_total_deriv": Object {
"derivative": Object {
"buckets_path": "odh_node_query_latency__event_total",
"gap_policy": "skip",
"unit": "1s",
},
},
}

View file

@ -46,10 +46,10 @@ Array [
"units": "%",
},
"summary": Object {
"lastVal": 37.3891333,
"maxVal": 41.948963750000004,
"minVal": 11.02918065,
"slope": 1,
"lastVal": 14.58673435,
"maxVal": 80.3369142,
"minVal": 11.0291808,
"slope": -1,
},
},
"node_cgroup_throttled": Object {
@ -66,8 +66,8 @@ Array [
"units": "ns",
},
"summary": Object {
"lastVal": 123012140,
"maxVal": 30063709491,
"lastVal": 0,
"maxVal": 23311083802,
"minVal": 0,
"slope": -1,
},
@ -85,10 +85,10 @@ Array [
"units": "%",
},
"summary": Object {
"lastVal": 3,
"maxVal": 4,
"lastVal": 1,
"maxVal": 7,
"minVal": 1,
"slope": 1,
"slope": -1,
},
},
"node_free_space": Object {
@ -104,9 +104,9 @@ Array [
"units": "",
},
"summary": Object {
"lastVal": 3141324800,
"lastVal": 3140956160,
"maxVal": 3195629568,
"minVal": 3141324800,
"minVal": 3140956160,
"slope": -1,
},
},
@ -124,10 +124,10 @@ Array [
"units": "%",
},
"summary": Object {
"lastVal": 40,
"lastVal": 42,
"maxVal": 52,
"minVal": 25,
"slope": -1,
"minVal": 24,
"slope": 1,
},
},
"node_load_average": Object {
@ -144,9 +144,9 @@ Array [
"units": "",
},
"summary": Object {
"lastVal": 1.0400390625,
"maxVal": 2.439453125,
"minVal": 1.0400390625,
"lastVal": 1.0302734375,
"maxVal": 3.0703125,
"minVal": 1.0302734375,
"slope": -1,
},
},
@ -194,8 +194,8 @@ Array [
"units": "%",
},
"summary": Object {
"lastVal": 0,
"maxVal": 8,
"lastVal": 1,
"maxVal": 39,
"minVal": 0,
"slope": -1,
},
@ -213,9 +213,9 @@ Array [
"units": "",
},
"summary": Object {
"lastVal": 3141402624,
"maxVal": 3148406784,
"minVal": 3141402624,
"lastVal": 3141033984,
"maxVal": 3162230784,
"minVal": 3141033984,
"slope": -1,
},
},
@ -253,9 +253,9 @@ Array [
"units": "",
},
"summary": Object {
"lastVal": 1.0400390625,
"maxVal": 2.439453125,
"minVal": 1.0400390625,
"lastVal": 1.0302734375,
"maxVal": 3.0703125,
"minVal": 1.0302734375,
"slope": -1,
},
},
@ -325,10 +325,10 @@ Array [
"units": "%",
},
"summary": Object {
"lastVal": 37.3891333,
"maxVal": 41.948963750000004,
"minVal": 11.02918065,
"slope": 1,
"lastVal": 14.58673435,
"maxVal": 80.3369142,
"minVal": 11.0291808,
"slope": -1,
},
},
"node_cgroup_throttled": Object {
@ -345,8 +345,8 @@ Array [
"units": "ns",
},
"summary": Object {
"lastVal": 123012140,
"maxVal": 30063709491,
"lastVal": 0,
"maxVal": 23311083802,
"minVal": 0,
"slope": -1,
},
@ -364,10 +364,10 @@ Array [
"units": "%",
},
"summary": Object {
"lastVal": 3,
"maxVal": 4,
"lastVal": 1,
"maxVal": 7,
"minVal": 1,
"slope": 1,
"slope": -1,
},
},
"node_free_space": Object {
@ -383,9 +383,9 @@ Array [
"units": "",
},
"summary": Object {
"lastVal": 3141324800,
"lastVal": 3140956160,
"maxVal": 3195629568,
"minVal": 3141324800,
"minVal": 3140956160,
"slope": -1,
},
},
@ -403,10 +403,10 @@ Array [
"units": "%",
},
"summary": Object {
"lastVal": 40,
"lastVal": 42,
"maxVal": 52,
"minVal": 25,
"slope": -1,
"minVal": 24,
"slope": 1,
},
},
"node_load_average": Object {
@ -423,9 +423,9 @@ Array [
"units": "",
},
"summary": Object {
"lastVal": 1.0400390625,
"maxVal": 2.439453125,
"minVal": 1.0400390625,
"lastVal": 1.0302734375,
"maxVal": 3.0703125,
"minVal": 1.0302734375,
"slope": -1,
},
},
@ -473,8 +473,8 @@ Array [
"units": "%",
},
"summary": Object {
"lastVal": 0,
"maxVal": 8,
"lastVal": 1,
"maxVal": 39,
"minVal": 0,
"slope": -1,
},
@ -492,9 +492,9 @@ Array [
"units": "",
},
"summary": Object {
"lastVal": 3141402624,
"maxVal": 3148406784,
"minVal": 3141402624,
"lastVal": 3141033984,
"maxVal": 3162230784,
"minVal": 3141033984,
"slope": -1,
},
},
@ -532,9 +532,9 @@ Array [
"units": "",
},
"summary": Object {
"lastVal": 1.0400390625,
"maxVal": 2.439453125,
"minVal": 1.0400390625,
"lastVal": 1.0302734375,
"maxVal": 3.0703125,
"minVal": 1.0302734375,
"slope": -1,
},
},

View file

@ -6,6 +6,7 @@
import { metrics } from '../../../metrics';
import { NORMALIZED_DERIVATIVE_UNIT } from '../../../../../common/constants';
import { convertMetricNames } from '../../convert_metric_names';
/*
* Create the DSL for date histogram aggregations based on an array of metric names
@ -16,8 +17,8 @@ import { NORMALIZED_DERIVATIVE_UNIT } from '../../../../../common/constants';
* @param {Number} bucketSize: Bucket size in seconds for date histogram interval
* @return {Object} Aggregation DSL
*/
export function getMetricAggs(listingMetrics, bucketSize) {
const aggItems = {};
export function getMetricAggs(listingMetrics) {
let aggItems = {};
listingMetrics.forEach(metricName => {
const metric = metrics[metricName];
@ -43,13 +44,9 @@ export function getMetricAggs(listingMetrics, bucketSize) {
};
}
aggItems[metricName] = {
date_histogram: {
field: 'timestamp',
min_doc_count: 1,
fixed_interval: bucketSize + 's'
},
aggs: metric.aggs || metricAgg
aggItems = {
...aggItems,
...convertMetricNames(metricName, metric.aggs || metricAgg)
};
});

View file

@ -72,7 +72,16 @@ export async function getNodes(req, esIndexPattern, clusterStats, shardStats) {
field: `source_node.uuid`,
size: config.get('xpack.monitoring.max_bucket_size')
},
aggs: getMetricAggs(LISTING_METRICS_NAMES, bucketSize)
aggs: {
by_date: {
date_histogram: {
field: 'timestamp',
min_doc_count: 1,
fixed_interval: bucketSize + 's'
},
aggs: getMetricAggs(LISTING_METRICS_NAMES, bucketSize)
}
}
}
},
sort: [ { timestamp: { order: 'desc' } } ]

View file

@ -4,10 +4,10 @@
* you may not use this file except in compliance with the Elastic License.
*/
import { get, pick } from 'lodash';
import { get } from 'lodash';
import { mapNodesInfo } from './map_nodes_info';
import { mapNodesMetrics } from './map_nodes_metrics';
import { LISTING_METRICS_NAMES } from './nodes_listing_metrics';
import { uncovertMetricNames } from '../../convert_metric_names';
/*
* Process the response from the get_nodes query
@ -31,10 +31,10 @@ export function handleResponse(response, clusterStats, shardStats, timeOptions =
* with a sub-object for all the metrics buckets
*/
const nodeBuckets = get(response, 'aggregations.nodes.buckets', []);
const metricsForNodes = nodeBuckets.reduce((accum, { key: nodeId, ...allAggBuckets }) => {
const metricsForNodes = nodeBuckets.reduce((accum, { key: nodeId, by_date: byDate }) => {
return {
...accum,
[nodeId]: pick(allAggBuckets, LISTING_METRICS_NAMES) // "metrics" are just the date histogram aggs
[nodeId]: uncovertMetricNames(byDate),
};
}, {});
const nodesMetrics = mapNodesMetrics(metricsForNodes, nodesInfo, timeOptions); // summarize the metrics of online nodes

View file

@ -16,10 +16,5 @@ export const LISTING_METRICS_NAMES = [
];
export const LISTING_METRICS_PATHS = [
'aggregations.nodes.buckets.node_cgroup_quota.buckets',
'aggregations.nodes.buckets.node_cgroup_throttled.buckets',
'aggregations.nodes.buckets.node_cpu_utilization.buckets',
'aggregations.nodes.buckets.node_load_average.buckets',
'aggregations.nodes.buckets.node_jvm_mem_percent.buckets',
'aggregations.nodes.buckets.node_free_space.buckets',
`aggregations.nodes.buckets.by_date.buckets`,
];