Simplify or optimize regexes with polynomial time worst cases (#44197)

* Simplify or optimize regexes with polynomial time worst cases

* PR feedback & cleanup

Co-authored-by: David Michon <dmichon-msft@users.noreply.github.com>

* Use builtin scanner function for checking whitespace in fallback method (it's faster)

Co-authored-by: David Michon <dmichon-msft@users.noreply.github.com>
This commit is contained in:
Wesley Wigham 2021-05-24 15:28:52 -07:00 committed by GitHub
parent 2203228b62
commit fcabb5c0cc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 158 additions and 85 deletions

View file

@ -3034,10 +3034,6 @@ namespace ts {
return filter(map(values, v => convertJsonOption(option.element, v, basePath, errors)), v => !!v);
}
/**
 * Strips leading and trailing whitespace from `s`.
 *
 * Uses the native `trim` method when the string exposes one; otherwise falls
 * back to a regex that removes whitespace runs at either end.
 */
function trimString(s: string) {
    if (typeof s.trim === "function") {
        return s.trim();
    }
    // Fallback for hosts without a native `trim`.
    return s.replace(/^[\s]+|[\s]+$/g, "");
}
/**
* Tests for a path that ends in a recursive directory wildcard.
* Matches **, \**, **\, and \**\, but not a**b.
@ -3051,36 +3047,6 @@ namespace ts {
*/
const invalidTrailingRecursionPattern = /(^|\/)\*\*\/?$/;
/**
* Tests for a path where .. appears after a recursive directory wildcard.
* Matches **\..\*, **\a\..\*, and **\.., but not ..\**\*
*
* NOTE: used \ in place of / above to avoid issues with multiline comments.
*
* Breakdown:
* (^|\/) # matches either the beginning of the string or a directory separator.
* \*\*\/ # matches a recursive directory wildcard "**" followed by a directory separator.
* (.*\/)? # optionally matches any number of characters followed by a directory separator.
* \.\. # matches a parent directory path component ".."
* ($|\/) # matches either the end of the string or a directory separator.
*/
const invalidDotDotAfterRecursiveWildcardPattern = /(^|\/)\*\*\/(.*\/)?\.\.($|\/)/;
/**
* Tests for a path containing a wildcard character in a directory component of the path.
* Matches \*\, \?\, and \a*b\, but not \a\ or \a\*.
*
* NOTE: used \ in place of / above to avoid issues with multiline comments.
*
* Breakdown:
* \/ # matches a directory separator.
* [^/]*? # matches any number of characters excluding directory separators (non-greedy).
* [*?] # matches either a wildcard character (* or ?)
* [^/]* # matches any number of characters excluding directory separators (greedy).
* \/ # matches a directory separator.
*/
const watchRecursivePattern = /\/[^/]*?[*?][^/]*\//;
/**
* Matches the portion of a wildcard path that does not contain wildcards.
* Matches \a of \a\*, or \a\b\c of \a\b\c\?\d.
@ -3217,6 +3183,20 @@ namespace ts {
return matchesExcludeWorker(pathToCheck, validatedExcludeSpecs, useCaseSensitiveFileNames, currentDirectory, basePath);
}
/**
 * Reports whether a parent-directory segment ("/.." at the end, or "/../" anywhere)
 * appears after a recursive directory wildcard ("**" + "/" at the start, or "/**" + "/" anywhere).
 *
 * @param s The file specification to inspect.
 * @returns `true` when the last ".." segment lies after the first recursive wildcard.
 */
function invalidDotDotAfterRecursiveWildcard(s: string) {
    // We used to use the regex /(^|\/)\*\*\/(.*\/)?\.\.($|\/)/ to check for this case, but
    // in v8, that has polynomial performance because the recursive wildcard match - **/ -
    // can be matched in many arbitrary positions when multiple are present, resulting
    // in bad backtracking (and we don't care which is matched - just that some /.. segment
    // comes after some **/ segment).

    // Position of the earliest recursive wildcard; a leading "**/" counts as index 0.
    let firstWildcard = 0;
    if (!startsWith(s, "**/")) {
        firstWildcard = s.indexOf("/**/");
        if (firstWildcard === -1) {
            // No recursive wildcard at all, so nothing can follow it.
            return false;
        }
    }

    // Position of the latest ".." segment; a trailing "/.." is treated as sitting at the very end.
    const lastParentSegment = endsWith(s, "/..") ? s.length : s.lastIndexOf("/../");
    return firstWildcard < lastParentSegment;
}
/* @internal */
export function matchesExclude(
pathToCheck: string,
@ -3226,7 +3206,7 @@ namespace ts {
) {
return matchesExcludeWorker(
pathToCheck,
filter(excludeSpecs, spec => !invalidDotDotAfterRecursiveWildcardPattern.test(spec)),
filter(excludeSpecs, spec => !invalidDotDotAfterRecursiveWildcard(spec)),
useCaseSensitiveFileNames,
currentDirectory
);
@ -3268,7 +3248,7 @@ namespace ts {
if (disallowTrailingRecursion && invalidTrailingRecursionPattern.test(spec)) {
return [Diagnostics.File_specification_cannot_end_in_a_recursive_directory_wildcard_Asterisk_Asterisk_Colon_0, spec];
}
else if (invalidDotDotAfterRecursiveWildcardPattern.test(spec)) {
else if (invalidDotDotAfterRecursiveWildcard(spec)) {
return [Diagnostics.File_specification_cannot_contain_a_parent_directory_that_appears_after_a_recursive_directory_wildcard_Asterisk_Asterisk_Colon_0, spec];
}
}
@ -3331,9 +3311,18 @@ namespace ts {
function getWildcardDirectoryFromSpec(spec: string, useCaseSensitiveFileNames: boolean): { key: string, flags: WatchDirectoryFlags } | undefined {
const match = wildcardDirectoryPattern.exec(spec);
if (match) {
// We check this with a few `indexOf` calls because 3 `indexOf`/`lastIndexOf` calls are
// less algorithmically complex (roughly O(3n) worst-case) than the regex we used to use,
// \/[^/]*?[*?][^/]*\/ which was polynomial in v8, since arbitrary sequences of wildcard
// characters could match any of the central patterns, resulting in bad backtracking.
const questionWildcardIndex = spec.indexOf("?");
const starWildcardIndex = spec.indexOf("*");
const lastDirectorySeperatorIndex = spec.lastIndexOf(directorySeparator);
return {
key: useCaseSensitiveFileNames ? match[0] : toFileNameLowerCase(match[0]),
flags: watchRecursivePattern.test(spec) ? WatchDirectoryFlags.Recursive : WatchDirectoryFlags.None
flags: (questionWildcardIndex !== -1 && questionWildcardIndex < lastDirectorySeperatorIndex)
|| (starWildcardIndex !== -1 && starWildcardIndex < lastDirectorySeperatorIndex)
? WatchDirectoryFlags.Recursive : WatchDirectoryFlags.None
};
}
if (isImplicitGlob(spec)) {

View file

@ -2035,11 +2035,51 @@ namespace ts {
* Takes a string like "jquery-min.4.2.3" and returns "jquery"
*/
export function removeMinAndVersionNumbers(fileName: string) {
// Match a "." or "-" followed by a version number or 'min' at the end of the name
const trailingMinOrVersion = /[.-]((min)|(\d+(\.\d+)*))$/;
// We used to use the regex /[.-]((min)|(\d+(\.\d+)*))$/ and would just .replace it twice.
// Unfortunately, that regex has O(n^2) performance because v8 doesn't match from the end of the string.
// Instead, we now essentially scan the filename (backwards) ourselves.
// The "min" or version may both be present, in either order, so try applying the above twice.
return fileName.replace(trailingMinOrVersion, "").replace(trailingMinOrVersion, "");
// NOTE(review): as listed here, the regex-based `return` above exits the function, so the
// backward scan below is never reached. The two bodies look like the before/after of the
// same change - confirm which one is meant to remain.
// `end` is the exclusive end index of the part of the name that is kept.
let end: number = fileName.length;
// Walk backwards from the last character; index 0 is never examined as the start of a suffix.
for (let pos = end - 1; pos > 0; pos--) {
let ch: number = fileName.charCodeAt(pos);
if (ch >= CharacterCodes._0 && ch <= CharacterCodes._9) {
// Match a \d+ segment
// On exit, `ch` is the character just before the digit run (or the character at index 0).
do {
--pos;
ch = fileName.charCodeAt(pos);
} while (pos > 0 && ch >= CharacterCodes._0 && ch <= CharacterCodes._9);
}
else if (pos > 4 && (ch === CharacterCodes.n || ch === CharacterCodes.N)) {
// Looking for "min" in any letter case (each letter is compared against both cases)
// Already matched the 'n'
--pos;
ch = fileName.charCodeAt(pos);
if (ch !== CharacterCodes.i && ch !== CharacterCodes.I) {
break;
}
--pos;
ch = fileName.charCodeAt(pos);
if (ch !== CharacterCodes.m && ch !== CharacterCodes.M) {
break;
}
// Step once more so `ch` is the character just before "min".
--pos;
ch = fileName.charCodeAt(pos);
}
else {
// This character is not part of either suffix pattern
break;
}
// A suffix only counts when it is introduced by "-" or "."; otherwise keep the previous `end`.
if (ch !== CharacterCodes.minus && ch !== CharacterCodes.dot) {
break;
}
// Cut at the separator; the loop's `pos--` then moves to the character before it.
end = pos;
}
// end might be fileName.length, in which case this should internally no-op
return end === fileName.length ? fileName : fileName.slice(0, end);
}
/** Remove an item from an array, moving everything to its right one space left. */

View file

@ -471,6 +471,9 @@ namespace ts {
// An `Array` with extra properties is rendered as `[A, B, prop1: 1, prop2: 2]`. Most of
// these aren't immediately useful so we trim off the `prop1: ..., prop2: ...` part from the
// formatted string.
// This regex can trigger slow backtracking because of overlapping potential captures.
// We don't care, this is debug code that's only enabled with a debugger attached -
// we're just taking note of it for anyone checking regex performance in the future.
defaultValue = String(defaultValue).replace(/(?:,[\s\w\d_]+:[^,]+)+\]$/, "]");
return `NodeArray ${defaultValue}`;
}

View file

@ -9094,7 +9094,7 @@ namespace ts {
if (namedArgRegExCache.has(name)) {
return namedArgRegExCache.get(name)!;
}
const result = new RegExp(`(\\s${name}\\s*=\\s*)('|")(.+?)\\2`, "im");
const result = new RegExp(`(\\s${name}\\s*=\\s*)(?:(?:'([^']*)')|(?:"([^"]*)"))`, "im");
namedArgRegExCache.set(name, result);
return result;
}
@ -9118,16 +9118,17 @@ namespace ts {
return; // Missing required argument, don't parse
}
else if (matchResult) {
const value = matchResult[2] || matchResult[3];
if (arg.captureSpan) {
const startPos = range.pos + matchResult.index + matchResult[1].length + matchResult[2].length;
const startPos = range.pos + matchResult.index + matchResult[1].length + 1;
argument[arg.name] = {
value: matchResult[3],
value,
pos: startPos,
end: startPos + matchResult[3].length
end: startPos + value.length
};
}
else {
argument[arg.name] = matchResult[3];
argument[arg.name] = value;
}
}
}
@ -9145,7 +9146,7 @@ namespace ts {
}
if (range.kind === SyntaxKind.MultiLineCommentTrivia) {
const multiLinePragmaRegEx = /\s*@(\S+)\s*(.*)\s*$/gim; // Defined inline since it uses the "g" flag, which keeps a persistent index (for iterating)
const multiLinePragmaRegEx = /@(\S+)(\s+.*)?$/gim; // Defined inline since it uses the "g" flag, which keeps a persistent index (for iterating)
let multiLineMatch: RegExpExecArray | null;
while (multiLineMatch = multiLinePragmaRegEx.exec(text)) {
addPragmaForMatch(pragmas, range, PragmaKindFlags.MultiLine, multiLineMatch);
@ -9170,7 +9171,7 @@ namespace ts {
function getNamedPragmaArguments(pragma: PragmaDefinition, text: string | undefined): {[index: string]: string} | "fail" {
if (!text) return {};
if (!pragma.args) return {};
const args = text.split(/\s+/);
const args = trimString(text).split(/\s+/);
const argMap: {[index: string]: string} = {};
for (let i = 0; i < pragma.args.length; i++) {
const argument = pragma.args[i];

View file

@ -406,7 +406,7 @@ namespace ts {
const lineStart = getPositionOfLineAndCharacter(file, i, 0);
const lineEnd = i < lastLineInFile ? getPositionOfLineAndCharacter(file, i + 1, 0) : file.text.length;
let lineContent = file.text.slice(lineStart, lineEnd);
lineContent = lineContent.replace(/\s+$/g, ""); // trim from end
lineContent = trimStringEnd(lineContent); // trim from end
lineContent = lineContent.replace(/\t/g, " "); // convert tabs to single spaces
// Output the gutter and the actual contents of the line.

File diff suppressed because one or more lines are too long

View file

@ -204,7 +204,7 @@ namespace ts {
// range-set ::= range ( logical-or range ) *
// range ::= hyphen | simple ( ' ' simple ) * | ''
// logical-or ::= ( ' ' ) * '||' ( ' ' ) *
const logicalOrRegExp = /\s*\|\|\s*/g;
const logicalOrRegExp = /\|\|/g;
const whitespaceRegExp = /\s+/g;
// https://github.com/npm/node-semver#range-grammar
@ -230,20 +230,21 @@ namespace ts {
// primitive ::= ( '<' | '>' | '>=' | '<=' | '=' ) partial
// tilde ::= '~' partial
// caret ::= '^' partial
const rangeRegExp = /^\s*(~|\^|<|<=|>|>=|=)?\s*([a-z0-9-+.*]+)$/i;
const rangeRegExp = /^(~|\^|<|<=|>|>=|=)?\s*([a-z0-9-+.*]+)$/i;
function parseRange(text: string) {
const alternatives: Comparator[][] = [];
for (const range of text.trim().split(logicalOrRegExp)) {
for (let range of trimString(text).split(logicalOrRegExp)) {
if (!range) continue;
const comparators: Comparator[] = [];
range = trimString(range);
const match = hyphenRegExp.exec(range);
if (match) {
if (!parseHyphen(match[1], match[2], comparators)) return undefined;
}
else {
for (const simple of range.split(whitespaceRegExp)) {
const match = rangeRegExp.exec(simple);
const match = rangeRegExp.exec(trimString(simple));
if (!match || !parseComparator(match[1], match[2], comparators)) return undefined;
}
}

View file

@ -322,7 +322,7 @@ namespace ts {
}
// Sometimes tools can see the following line as a source mapping url comment, so we mangle it a bit (the [M])
const sourceMapCommentRegExp = /^\/\/[@#] source[M]appingURL=(.+)\s*$/;
const sourceMapCommentRegExp = /^\/\/[@#] source[M]appingURL=(.+)$/;
const whitespaceOrMapCommentRegExp = /^\s*(\/\/[@#] .*)?$/;
export interface LineInfo {
@ -345,7 +345,7 @@ namespace ts {
const line = lineInfo.getLineText(index);
const comment = sourceMapCommentRegExp.exec(line);
if (comment) {
return comment[1];
return trimStringEnd(comment[1]);
}
// If we see a non-whitespace/map comment-like line, break, to avoid scanning up the entire file
else if (!line.match(whitespaceOrMapCommentRegExp)) {

View file

@ -414,10 +414,10 @@ namespace ts {
commentPos + 2 < commentEnd &&
text.charCodeAt(commentPos + 2) === CharacterCodes.slash) {
const textSubStr = text.substring(commentPos, commentEnd);
return textSubStr.match(fullTripleSlashReferencePathRegEx) ||
textSubStr.match(fullTripleSlashAMDReferencePathRegEx) ||
textSubStr.match(fullTripleSlashReferenceTypeReferenceDirectiveRegEx) ||
textSubStr.match(defaultLibReferenceRegEx) ?
return fullTripleSlashReferencePathRegEx.test(textSubStr) ||
fullTripleSlashAMDReferencePathRegEx.test(textSubStr) ||
fullTripleSlashReferenceTypeReferenceDirectiveRegEx.test(textSubStr) ||
defaultLibReferenceRegEx.test(textSubStr) ?
true : false;
}
return false;
@ -517,12 +517,43 @@ namespace ts {
if (isJSDocTypeExpressionOrChild(node)) {
// strip space + asterisk at line start
text = text.replace(/(^|\r?\n|\r)\s*\*\s*/g, "$1");
text = text.split(/\r\n|\n|\r/).map(line => trimStringStart(line.replace(/^\s*\*/, ""))).join("\n");
}
return text;
}
/**
 * Strips white space and line terminator characters from both ends of a string.
 *
 * Uses the native `String.prototype.trim` when the runtime provides it; otherwise
 * trims the start and then the end using `trimStringStart` and `trimStringEnd`.
 */
export const trimString = !!String.prototype.trim
    ? (s: string) => s.trim()
    : (s: string) => trimStringEnd(trimStringStart(s));
/**
 * Produces a copy of the string with trailing whitespace removed.
 *
 * Prefers the native `String.prototype.trimEnd`; when the runtime lacks it,
 * `trimEndImpl` is used instead.
 */
export const trimStringEnd = !!String.prototype.trimEnd
    ? (s: string) => s.trimEnd()
    : trimEndImpl;
/**
 * Produces a copy of the string with leading whitespace removed.
 *
 * Prefers the native `String.prototype.trimStart`; when the runtime lacks it,
 * a start-anchored regex replacement is used instead.
 */
export const trimStringStart = !!String.prototype.trimStart
    ? (s: string) => s.trimStart()
    : (s: string) => s.replace(/^\s+/g, "");
/**
 * Fallback used to trim trailing whitespace when `String.prototype.trimEnd` is unavailable.
 *
 * https://jsbench.me/gjkoxld4au/1
 * The simple regex for this, /\s+$/g is O(n^2) in v8.
 * The native .trimEnd method is by far best, but since that's technically ES2019,
 * we provide a (still much faster than the simple regex) fallback.
 *
 * Scans backwards from the end of `s`, classifying each char code with `isWhiteSpaceLike`,
 * and returns the prefix that ends at the last non-whitespace character.
 */
function trimEndImpl(s: string) {
    // `keep` is the number of leading characters to retain.
    let keep = s.length;
    while (keep > 0 && isWhiteSpaceLike(s.charCodeAt(keep - 1))) {
        keep--;
    }
    return s.slice(0, keep);
}
export function getTextOfNode(node: Node, includeTrivia = false): string {
return getSourceTextOfNodeFromSourceFile(getSourceFileOfNode(node), node, includeTrivia);
}
@ -1226,10 +1257,10 @@ namespace ts {
text.charCodeAt(comment.pos + 3) !== CharacterCodes.slash);
}
export const fullTripleSlashReferencePathRegEx = /^(\/\/\/\s*<reference\s+path\s*=\s*)('|")(.+?)\2.*?\/>/;
const fullTripleSlashReferenceTypeReferenceDirectiveRegEx = /^(\/\/\/\s*<reference\s+types\s*=\s*)('|")(.+?)\2.*?\/>/;
export const fullTripleSlashAMDReferencePathRegEx = /^(\/\/\/\s*<amd-dependency\s+path\s*=\s*)('|")(.+?)\2.*?\/>/;
const defaultLibReferenceRegEx = /^(\/\/\/\s*<reference\s+no-default-lib\s*=\s*)('|")(.+?)\2\s*\/>/;
export const fullTripleSlashReferencePathRegEx = /^(\/\/\/\s*<reference\s+path\s*=\s*)(('[^']*')|("[^"]*")).*?\/>/;
const fullTripleSlashReferenceTypeReferenceDirectiveRegEx = /^(\/\/\/\s*<reference\s+types\s*=\s*)(('[^']*')|("[^"]*")).*?\/>/;
export const fullTripleSlashAMDReferencePathRegEx = /^(\/\/\/\s*<amd-dependency\s+path\s*=\s*)(('[^']*')|("[^"]*")).*?\/>/;
const defaultLibReferenceRegEx = /^(\/\/\/\s*<reference\s+no-default-lib\s*=\s*)(('[^']*')|("[^"]*"))\s*\/>/;
export function isPartOfTypeNode(node: Node): boolean {
if (SyntaxKind.FirstTypeNode <= node.kind && node.kind <= SyntaxKind.LastTypeNode) {
@ -4630,7 +4661,7 @@ namespace ts {
function writeTrimmedCurrentLine(text: string, commentEnd: number, writer: EmitTextWriter, newLine: string, pos: number, nextLineStart: number) {
const end = Math.min(commentEnd, nextLineStart - 1);
const currentLineText = text.substring(pos, end).replace(/^\s+|\s+$/g, "");
const currentLineText = trimString(text.substring(pos, end));
if (currentLineText) {
// trimmed forward and ending spaces text
writer.writeComment(currentLineText);

View file

@ -806,7 +806,8 @@ namespace ts {
function tryClassifyTripleSlashComment(start: number, width: number): boolean {
const tripleSlashXMLCommentRegEx = /^(\/\/\/\s*)(<)(?:(\S+)((?:[^/]|\/[^>])*)(\/>)?)?/im;
const attributeRegex = /(\S+)(\s*)(=)(\s*)('[^']+'|"[^"]+")/img;
// Require a leading whitespace character (the parser already does) to prevent terrible backtracking performance
const attributeRegex = /(\s)(\S+)(\s*)(=)(\s*)('[^']+'|"[^"]+")/img;
const text = sourceFile.text.substr(start, width);
const match = tripleSlashXMLCommentRegEx.exec(text);
@ -842,30 +843,30 @@ namespace ts {
break;
}
const newAttrPos = pos + attrMatch.index;
const newAttrPos = pos + attrMatch.index + attrMatch[1].length; // whitespace
if (newAttrPos > attrPos) {
pushCommentRange(attrPos, newAttrPos - attrPos);
attrPos = newAttrPos;
}
pushClassification(attrPos, attrMatch[1].length, ClassificationType.jsxAttribute); // attribute name
attrPos += attrMatch[1].length;
pushClassification(attrPos, attrMatch[2].length, ClassificationType.jsxAttribute); // attribute name
attrPos += attrMatch[2].length;
if (attrMatch[2].length) {
pushCommentRange(attrPos, attrMatch[2].length); // whitespace
attrPos += attrMatch[2].length;
if (attrMatch[3].length) {
pushCommentRange(attrPos, attrMatch[3].length); // whitespace
attrPos += attrMatch[3].length;
}
pushClassification(attrPos, attrMatch[3].length, ClassificationType.operator); // =
attrPos += attrMatch[3].length;
pushClassification(attrPos, attrMatch[4].length, ClassificationType.operator); // =
attrPos += attrMatch[4].length;
if (attrMatch[4].length) {
pushCommentRange(attrPos, attrMatch[4].length); // whitespace
attrPos += attrMatch[4].length;
if (attrMatch[5].length) {
pushCommentRange(attrPos, attrMatch[5].length); // whitespace
attrPos += attrMatch[5].length;
}
pushClassification(attrPos, attrMatch[5].length, ClassificationType.jsxAttributeStringLiteralValue); // attribute value
attrPos += attrMatch[5].length;
pushClassification(attrPos, attrMatch[6].length, ClassificationType.jsxAttributeStringLiteralValue); // attribute value
attrPos += attrMatch[6].length;
}
pos += match[4].length;

View file

@ -94,8 +94,15 @@ namespace ts.OutliningElementsCollector {
}
}
const regionDelimiterRegExp = /^\s*\/\/\s*#(end)?region(?:\s+(.*))?(?:\r)?$/;
const regionDelimiterRegExp = /^#(end)?region(?:\s+(.*))?(?:\r)?$/;
/**
 * Tests whether a line of text is a `// #region` / `// #endregion` style delimiter.
 *
 * @param lineText The text of a single line.
 * @returns The result of running `regionDelimiterRegExp` against the comment body,
 * or `null` when the line (after leading whitespace) does not begin with `//`.
 */
function isRegionDelimiter(lineText: string) {
    // We trim the leading whitespace and // without the regex since the
    // multiple potential whitespace matches can make for some gnarly backtracking behavior
    const withoutIndent = trimStringStart(lineText);
    if (startsWith(withoutIndent, "\/\/")) {
        // Drop the two slashes, then trim both ends before matching the delimiter itself.
        const commentBody = trimString(withoutIndent.slice(2));
        return regionDelimiterRegExp.exec(commentBody);
    }
    return null; // eslint-disable-line no-null/no-null
}