Improves performance of regex by only breaking on invisible or ambiguous characters.

This commit is contained in:
Henning Dieterichs 2021-11-22 17:48:20 +01:00
parent df082d6c2d
commit b5b33d1c05
No known key found for this signature in database
GPG key ID: 771381EFFDB9EC06
3 changed files with 60 additions and 9 deletions

View file

@ -1086,6 +1086,10 @@ export class AmbiguousCharacters {
/**
 * Looks up `codePoint` in the confusables map of the current locale.
 *
 * @param codePoint The code point to look up.
 * @returns The entry stored for `codePoint`, or `undefined` if the map has none.
 */
public static getPrimaryConfusable(codePoint: number): number | undefined {
	const confusables = AmbiguousCharacters.getConfusablesForCurrentLocale();
	return confusables.get(codePoint);
}
/**
 * Returns the keys of the current locale's confusables map as a set.
 *
 * A fresh `Set` is built on every call, so the result is a snapshot that is
 * independent of the underlying map.
 */
public static getPrimaryConfusableCodePoints(): ReadonlySet<number> {
	const confusables = AmbiguousCharacters.getConfusablesForCurrentLocale();
	return new Set(confusables.keys());
}
}
export class InvisibleCharacters {
@ -1105,4 +1109,8 @@ export class InvisibleCharacters {
/**
 * Tests whether `codePoint` is contained in the invisible-character data set.
 *
 * @param codePoint The code point to test.
 * @returns `true` if the data set returned by `getData()` contains `codePoint`.
 */
public static isInvisibleCharacter(codePoint: number): boolean {
	const invisibleCodePoints = InvisibleCharacters.getData();
	return invisibleCodePoints.has(codePoint);
}
/**
 * Read-only view of the invisible-character data set.
 *
 * This hands out the very object returned by `getData()` (no copy is made);
 * only the static type prevents callers from mutating it.
 */
public static get codePoints(): ReadonlySet<number> {
	const invisibleCodePoints = InvisibleCharacters.getData();
	return invisibleCodePoints;
}
}

View file

@ -8,16 +8,20 @@ import { Searcher } from 'vs/editor/common/model/textModelSearch';
import * as strings from 'vs/base/common/strings';
export class UnicodeTextModelHighlighter {
public static NON_BASIC_ASCII_REGEX = '[^\\t\\n\\r\\x20-\\x7E]';
public static computeUnicodeHighlights(model: IUnicodeCharacterSearcherTarget, options: UnicodeHighlighterOptions, range?: IRange): Range[] {
const startLine = range ? range.startLineNumber : 1;
const endLine = range ? range.endLineNumber : model.getLineCount();
const codePointHighlighter = new CodePointHighlighter(options);
// Only check for non-basic ASCII characters
const regex = new RegExp(UnicodeTextModelHighlighter.NON_BASIC_ASCII_REGEX, 'g');
const candidates = codePointHighlighter.getCandidateCodePoints();
let regex: RegExp;
if (candidates === 'allNonBasicAscii') {
regex = new RegExp('[^\\t\\n\\r\\x20-\\x7E]', 'g');
} else {
regex = new RegExp(`${buildRegExpCharClassExpr(Array.from(candidates))}`, 'g');
}
const searcher = new Searcher(null, regex);
const result: Range[] = [];
let m: RegExpExecArray | null;
@ -49,6 +53,12 @@ export class UnicodeTextModelHighlighter {
const str = lineContent.substring(startIndex, endIndex);
if (codePointHighlighter.shouldHighlightNonBasicASCII(str) !== SimpleHighlightReason.None) {
result.push(new Range(lineNumber, startIndex + 1, lineNumber, endIndex + 1));
const maxResultLength = 1000;
if (result.length > maxResultLength) {
// TODO@hediet a message should be shown in this case
break;
}
}
}
} while (m);
@ -76,6 +86,13 @@ export class UnicodeTextModelHighlighter {
}
}
/**
 * Builds the source text of a regular-expression character class (`[...]`)
 * that matches any single one of the given code points.
 *
 * Escaping is done here rather than with a general-purpose pattern escaper
 * because the rules inside a character class differ from those outside it:
 * an unescaped `-` between two members would form a range (e.g. the members
 * `a`, `-`, `z` would turn into `[a-z]`) and match unrelated characters.
 *
 * NOTE: code points above U+FFFF are emitted as surrogate pairs. A `RegExp`
 * created without the `u` flag treats each surrogate as a separate class
 * member, so such a regex matches the individual halves.
 *
 * @param codePoints Code points to include. An empty array yields `[]`,
 *                   a class that never matches.
 * @param flags Unused; retained so existing call sites keep compiling.
 * @returns The character class source, suitable for `new RegExp(...)`.
 */
function buildRegExpCharClassExpr(codePoints: number[], flags?: string): string {
	const members = codePoints.map((codePoint) => String.fromCodePoint(codePoint)).join('');
	// Within a class only `\`, `]`, `^` (negation) and `-` (range) are syntax;
	// everything else, including `.`, `[` and `(`, is already literal.
	const escapedMembers = members.replace(/[\\\]\^\-]/g, '\\$&');
	return `[${escapedMembers}]`;
}
/**
 * Kinds of reasons for a unicode highlight.
 *
 * The numeric values are spelled out here; they are the same ones the
 * compiler assigns implicitly in declaration order.
 */
export const enum UnicodeHighlighterReasonKind {
	Ambiguous = 0,
	Invisible = 1,
	NonBasicAscii = 2,
}
@ -95,6 +112,32 @@ class CodePointHighlighter {
this.allowedCodePoints = new Set(options.allowedCodePoints);
}
/**
 * Determines which code points are candidates for highlighting under the
 * current options.
 *
 * @returns `'allNonBasicAscii'` when the `nonBasicASCII` option is set.
 *          Otherwise a new set holding the invisible code points (if
 *          `invisibleCharacters` is set) and the primary confusable code
 *          points (if `ambiguousCharacters` is set), with every entry of
 *          `allowedCodePoints` removed.
 */
public getCandidateCodePoints(): Set<number> | 'allNonBasicAscii' {
	if (this.options.nonBasicASCII) {
		return 'allNonBasicAscii';
	}

	const candidates = new Set<number>();
	const includeAll = (codePoints: ReadonlySet<number>): void => {
		codePoints.forEach((codePoint) => candidates.add(codePoint));
	};

	if (this.options.invisibleCharacters) {
		includeAll(strings.InvisibleCharacters.codePoints);
	}
	if (this.options.ambiguousCharacters) {
		includeAll(strings.AmbiguousCharacters.getPrimaryConfusableCodePoints());
	}

	// Allowed code points are removed last so they win over both sources above.
	this.allowedCodePoints.forEach((allowed) => candidates.delete(allowed));
	return candidates;
}
public shouldHighlightNonBasicASCII(character: string): SimpleHighlightReason {
const codePoint = character.codePointAt(0)!;

View file

@ -450,14 +450,14 @@ export class ShowExcludeOptions extends EditorAction {
label: nls.localize('unicodeHighlight.excludeCharFromBeingHighlighted', 'Exclude {0} from being highlighted', `U+${codePoint.toString(16)} "${char}"`),
run: async () => {
const existingValue = configurationService.getValue(unicodeHighlightConfigKeys.allowedCharacters);
let value: string[];
if (Array.isArray(existingValue)) {
value = [...existingValue as string[]];
let value: string;
if (typeof existingValue === 'string') {
value = existingValue;
} else {
value = [];
value = '';
}
value.push(char);
value += char;
await configurationService.updateValue(unicodeHighlightConfigKeys.allowedCharacters, value, ConfigurationTarget.USER);
}
},