[ML] Categorization jobs improvements (#54579)

* chunking token examples

* disabling bucket span estimator

* passing sample size to client

* better handling of token errors

* changes based on review
This commit is contained in:
James Gowdy 2020-01-14 17:53:52 +00:00 committed by GitHub
parent 14be0ee8f4
commit b598c9dc7f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 106 additions and 46 deletions

View file

@ -27,6 +27,6 @@ export const DEFAULT_QUERY_DELAY = '60s';
export const SHARED_RESULTS_INDEX_NAME = 'shared';
export const NUMBER_OF_CATEGORY_EXAMPLES = 5;
export const CATEGORY_EXAMPLES_MULTIPLIER = 20;
export const CATEGORY_EXAMPLES_SAMPLE_SIZE = 1000;
export const CATEGORY_EXAMPLES_WARNING_LIMIT = 0.75;
export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.2;
export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.02;

View file

@ -102,10 +102,10 @@ export class CategorizationJobCreator extends JobCreator {
}
public async loadCategorizationFieldExamples() {
const { valid, examples } = await this._examplesLoader.loadExamples();
const { valid, examples, sampleSize } = await this._examplesLoader.loadExamples();
this._categoryFieldExamples = examples;
this._categoryFieldValid = valid;
return { valid, examples };
return { valid, examples, sampleSize };
}
public get categoryFieldExamples() {

View file

@ -36,7 +36,7 @@ export class CategorizationExamplesLoader {
const analyzer = this._jobCreator.categorizationAnalyzer;
const categorizationFieldName = this._jobCreator.categorizationFieldName;
if (categorizationFieldName === null) {
return { valid: 0, examples: [] };
return { valid: 0, examples: [], sampleSize: 0 };
}
const start = Math.floor(

View file

@ -7,7 +7,9 @@
import React, { FC, useState, useEffect, useContext } from 'react';
import { FormattedMessage } from '@kbn/i18n/react';
import { EuiButton } from '@elastic/eui';
import { isAdvancedJobCreator } from '../../../../../common/job_creator';
import { JobCreatorContext } from '../../../job_creator_context';
import { MLCATEGORY } from '../../../../../../../../../common/constants/field_types';
import { useEstimateBucketSpan, ESTIMATE_STATUS } from './estimate_bucket_span';
@ -19,6 +21,7 @@ export const BucketSpanEstimator: FC<Props> = ({ setEstimating }) => {
const { jobCreator, jobCreatorUpdate } = useContext(JobCreatorContext);
const { status, estimateBucketSpan } = useEstimateBucketSpan();
const [noDetectors, setNoDetectors] = useState(jobCreator.detectors.length === 0);
const [isUsingMlCategory, setIsUsingMlCategory] = useState(checkIsUsingMlCategory());
useEffect(() => {
setEstimating(status === ESTIMATE_STATUS.RUNNING);
@ -26,11 +29,29 @@ export const BucketSpanEstimator: FC<Props> = ({ setEstimating }) => {
useEffect(() => {
setNoDetectors(jobCreator.detectors.length === 0);
setIsUsingMlCategory(checkIsUsingMlCategory());
}, [jobCreatorUpdate]);
/**
 * Reports whether the current job creator is an advanced job creator with at
 * least one detector whose partition, over or by field is MLCATEGORY.
 */
function checkIsUsingMlCategory() {
  // Only advanced job creators are checked; everything else reports false.
  if (!isAdvancedJobCreator(jobCreator)) {
    return false;
  }
  return jobCreator.detectors.some(
    detector =>
      detector.partition_field_name === MLCATEGORY ||
      detector.over_field_name === MLCATEGORY ||
      detector.by_field_name === MLCATEGORY
  );
}
return (
<EuiButton
disabled={status === ESTIMATE_STATUS.RUNNING || noDetectors === true}
disabled={
status === ESTIMATE_STATUS.RUNNING || noDetectors === true || isUsingMlCategory === true
}
onClick={estimateBucketSpan}
>
<FormattedMessage

View file

@ -30,7 +30,7 @@ export const Description: FC<Props> = memo(({ children, isOptional }) => {
) : (
<FormattedMessage
id="xpack.ml.newJob.wizard.pickFieldsStep.categorizationField.description"
defaultMessage="Specifies which field will be categorized. Using text data types is recommended."
defaultMessage="Specifies which field will be categorized. Using text data types is recommended. Categorization works best on machine written log messages, typically logging written by a developer for the purpose of system troubleshooting."
/>
)}
</>

View file

@ -12,8 +12,6 @@ import { FormattedMessage } from '@kbn/i18n/react';
import { CategorizationAnalyzer } from '../../../../../../../services/ml_server_info';
import { EditCategorizationAnalyzerFlyout } from '../../../common/edit_categorization_analyzer_flyout';
import {
NUMBER_OF_CATEGORY_EXAMPLES,
CATEGORY_EXAMPLES_MULTIPLIER,
CATEGORY_EXAMPLES_ERROR_LIMIT,
CATEGORY_EXAMPLES_WARNING_LIMIT,
} from '../../../../../../../../../common/constants/new_job';
@ -22,11 +20,16 @@ type CategorizationAnalyzerType = CategorizationAnalyzer | null;
interface Props {
examplesValid: number;
sampleSize: number;
categorizationAnalyzer: CategorizationAnalyzerType;
}
export const ExamplesValidCallout: FC<Props> = ({ examplesValid, categorizationAnalyzer }) => {
const percentageText = <PercentageText examplesValid={examplesValid} />;
export const ExamplesValidCallout: FC<Props> = ({
examplesValid,
categorizationAnalyzer,
sampleSize,
}) => {
const percentageText = <PercentageText examplesValid={examplesValid} sampleSize={sampleSize} />;
const analyzerUsed = <AnalyzerUsed categorizationAnalyzer={categorizationAnalyzer} />;
let color: EuiCallOutProps['color'] = 'success';
@ -64,13 +67,16 @@ export const ExamplesValidCallout: FC<Props> = ({ examplesValid, categorizationA
);
};
const PercentageText: FC<{ examplesValid: number }> = ({ examplesValid }) => (
const PercentageText: FC<{ examplesValid: number; sampleSize: number }> = ({
examplesValid,
sampleSize,
}) => (
<div>
<FormattedMessage
id="xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldPercentage"
defaultMessage="{number} field values analyzed, {percentage}% contain valid tokens."
defaultMessage="{number} field {number, plural, zero {value} one {value} other {values}} analyzed, {percentage}% contain valid tokens."
values={{
number: NUMBER_OF_CATEGORY_EXAMPLES * CATEGORY_EXAMPLES_MULTIPLIER,
number: sampleSize,
percentage: Math.floor(examplesValid * 100),
}}
/>

View file

@ -6,6 +6,7 @@
import React, { FC, useContext, useEffect, useState } from 'react';
import { EuiHorizontalRule } from '@elastic/eui';
import { mlMessageBarService } from '../../../../../../../components/messagebar';
import { JobCreatorContext } from '../../../job_creator_context';
import { CategorizationJobCreator } from '../../../../../common/job_creator';
@ -32,6 +33,7 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
);
const [fieldExamples, setFieldExamples] = useState<CategoryExample[] | null>(null);
const [examplesValid, setExamplesValid] = useState(0);
const [sampleSize, setSampleSize] = useState(0);
const [categorizationFieldName, setCategorizationFieldName] = useState(
jobCreator.categorizationFieldName
@ -69,10 +71,20 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
async function loadFieldExamples() {
if (categorizationFieldName !== null) {
setLoadingData(true);
const { valid, examples } = await jobCreator.loadCategorizationFieldExamples();
setFieldExamples(examples);
setExamplesValid(valid);
setLoadingData(false);
try {
const {
valid,
examples,
sampleSize: tempSampleSize,
} = await jobCreator.loadCategorizationFieldExamples();
setFieldExamples(examples);
setExamplesValid(valid);
setLoadingData(false);
setSampleSize(tempSampleSize);
} catch (error) {
setLoadingData(false);
mlMessageBarService.notify.error(error);
}
} else {
setFieldExamples(null);
setExamplesValid(0);
@ -97,6 +109,7 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
{fieldExamples !== null && loadingData === false && (
<>
<ExamplesValidCallout
sampleSize={sampleSize}
examplesValid={examplesValid}
categorizationAnalyzer={jobCreator.categorizationAnalyzer}
/>

View file

@ -185,7 +185,7 @@ declare interface Ml {
start: number,
end: number,
analyzer: any
): Promise<{ valid: number; examples: any[] }>;
): Promise<{ valid: number; examples: any[]; sampleSize: number }>;
topCategories(
jobId: string,
count: number

View file

@ -4,11 +4,15 @@
* you may not use this file except in compliance with the Elastic License.
*/
import { chunk } from 'lodash';
import { ML_RESULTS_INDEX_PATTERN } from '../../../../common/constants/index_patterns';
import { CATEGORY_EXAMPLES_MULTIPLIER } from '../../../../common/constants/new_job';
import { CATEGORY_EXAMPLES_SAMPLE_SIZE } from '../../../../common/constants/new_job';
import { CategoryId, Category, Token } from '../../../../common/types/categories';
import { callWithRequestType } from '../../../../common/types/kibana';
const VALID_TOKEN_COUNT = 3;
const CHUNK_SIZE = 100;
export function categorizationExamplesProvider(callWithRequest: callWithRequestType) {
async function categorizationExamples(
indexPatternTitle: string,
@ -54,21 +58,31 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
});
const examples: string[] = results.hits?.hits
?.map((doc: any) => doc._source[categorizationFieldName])
.filter((example: string | undefined) => example !== undefined);
.filter((example: string | null | undefined) => example !== undefined && example !== null);
let tokens: Token[] = [];
try {
const { tokens: tempTokens } = await callWithRequest('indices.analyze', {
body: {
...getAnalyzer(analyzer),
text: examples,
},
});
tokens = tempTokens;
} catch (error) {
// fail silently, the tokens could not be loaded
// an empty list of tokens will be returned for each example
/**
 * Tokenizes all examples by splitting them into chunks of at most `chunkSize`
 * items, requesting the tokens for every chunk concurrently, and pairing each
 * example with the tokens found at the same position in the flattened result.
 *
 * @param chunkSize maximum number of examples sent per tokenization request
 * @returns one `{ text, tokens }` entry per example, in the original order
 */
async function loadTokens(chunkSize: number) {
  const exampleChunks = chunk(examples, chunkSize);
  const tokensPerChunks = await Promise.all(exampleChunks.map(c => getTokens(c, analyzer)));
  // Flatten one level so the per-chunk results line up with `examples` by index.
  const tokensPerExample = tokensPerChunks.flat();
  return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] }));
}
try {
  // `await` is required: returning the bare promise would exit the try block before
  // the requests settle, so a rejection would bypass the catch and never be retried.
  return await loadTokens(CHUNK_SIZE);
} catch (error) {
  // if an error is thrown when loading the tokens, lower the chunk size by half and try again
  // the error may have been caused by too many tokens being found.
  // the _analyze endpoint has a maximum of 10000 tokens.
  return await loadTokens(CHUNK_SIZE / 2);
}
}
async function getTokens(examples: string[], analyzer?: any) {
const { tokens }: { tokens: Token[] } = await callWithRequest('indices.analyze', {
body: {
...getAnalyzer(analyzer),
text: examples,
},
});
const lengths = examples.map(e => e.length);
const sumLengths = lengths.map((s => (a: number) => (s += a))(0));
@ -88,8 +102,7 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
}
}
});
return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] }));
return tokensPerExample;
}
function getAnalyzer(analyzer: any) {
@ -110,10 +123,10 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
end: number,
analyzer?: any
) {
const examples = await categorizationExamples(
const resp = await categorizationExamples(
indexPatternTitle,
query,
size * CATEGORY_EXAMPLES_MULTIPLIER,
CATEGORY_EXAMPLES_SAMPLE_SIZE,
categorizationFieldName,
timeField,
start,
@ -121,20 +134,27 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
analyzer
);
const sortedExamples = examples
const sortedExamples = resp
.map((e, i) => ({ ...e, origIndex: i }))
.sort((a, b) => b.tokens.length - a.tokens.length);
const validExamples = sortedExamples.filter(e => e.tokens.length > 1);
const validExamples = sortedExamples.filter(e => e.tokens.length >= VALID_TOKEN_COUNT);
const sampleSize = sortedExamples.length;
const multiple = Math.floor(sampleSize / size) || sampleSize;
const filteredExamples = [];
let i = 0;
while (filteredExamples.length < size && i < sortedExamples.length) {
filteredExamples.push(sortedExamples[i]);
i += multiple;
}
const examples = filteredExamples
.sort((a, b) => a.origIndex - b.origIndex)
.map(e => ({ text: e.text, tokens: e.tokens }));
return {
sampleSize,
valid: sortedExamples.length === 0 ? 0 : validExamples.length / sortedExamples.length,
examples: sortedExamples
.filter(
(e, i) =>
i / CATEGORY_EXAMPLES_MULTIPLIER - Math.floor(i / CATEGORY_EXAMPLES_MULTIPLIER) === 0
)
.sort((a, b) => a.origIndex - b.origIndex)
.map(e => ({ text: e.text, tokens: e.tokens })),
examples,
};
}