[ML] Fixing categorization tokens for multi-line messages (#103007)

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
2021-06-29 10:28:51 +01:00 · 2021-06-29 10:28:51 +01:00 · 824463ace5
parent b774e37ea1
commit 824463ace5
1 changed files with 3 additions and 2 deletions
--- a/x-pack/plugins/ml/server/models/job_service/new_job/categorization/examples.ts
+++ b/x-pack/plugins/ml/server/models/job_service/new_job/categorization/examples.ts
@@ -145,10 +145,11 @@ export function categorizationExamplesProvider({
        for (let g = 0; g < sumLengths.length; g++) {
          if (t.start_offset <= sumLengths[g] + g) {
            const offset = g > 0 ? sumLengths[g - 1] + g : 0;
+            const start = t.start_offset - offset;
            tokensPerExample[g].push({
              ...t,
-              start_offset: t.start_offset - offset,
-              end_offset: t.end_offset - offset,
+              start_offset: start,
+              end_offset: start + t.token.length,
            });
            break;
          }