Simplify or optimize regexes with polynomial time worst cases (#44197)

* Simplify or optimize regexes with polynomial time worst cases * PR feedback & cleanup Co-authored-by: David Michon <dmichon-msft@users.noreply.github.com> * Use builtin scanner function for checking whitespace in fallback method (it's faster) Co-authored-by: David Michon <dmichon-msft@users.noreply.github.com>
2021-05-24 15:28:52 -07:00 · 2021-05-24 15:28:52 -07:00 · fcabb5c0cc
parent 2203228b62
commit fcabb5c0cc
11 changed files with 158 additions and 85 deletions
--- a/src/compiler/commandLineParser.ts
+++ b/src/compiler/commandLineParser.ts
@ -3034,10 +3034,6 @@ namespace ts {
        return filter(map(values, v => convertJsonOption(option.element, v, basePath, errors)), v => !!v);
    }

-    function trimString(s: string) {
-        return typeof s.trim === "function" ? s.trim() : s.replace(/^[\s]+|[\s]+$/g, "");
-    }
-
    /**
     * Tests for a path that ends in a recursive directory wildcard.
     * Matches **, \**, **\, and \**\, but not a**b.
@ -3051,36 +3047,6 @@ namespace ts {
     */
    const invalidTrailingRecursionPattern = /(^|\/)\*\*\/?$/;

-    /**
-     * Tests for a path where .. appears after a recursive directory wildcard.
-     * Matches **\..\*, **\a\..\*, and **\.., but not ..\**\*
-     *
-     * NOTE: used \ in place of / above to avoid issues with multiline comments.
-     *
-     * Breakdown:
-     *  (^|\/)      # matches either the beginning of the string or a directory separator.
-     *  \*\*\/      # matches a recursive directory wildcard "**" followed by a directory separator.
-     *  (.*\/)?     # optionally matches any number of characters followed by a directory separator.
-     *  \.\.        # matches a parent directory path component ".."
-     *  ($|\/)      # matches either the end of the string or a directory separator.
-     */
-    const invalidDotDotAfterRecursiveWildcardPattern = /(^|\/)\*\*\/(.*\/)?\.\.($|\/)/;
-
-    /**
-     * Tests for a path containing a wildcard character in a directory component of the path.
-     * Matches \*\, \?\, and \a*b\, but not \a\ or \a\*.
-     *
-     * NOTE: used \ in place of / above to avoid issues with multiline comments.
-     *
-     * Breakdown:
-     *  \/          # matches a directory separator.
-     *  [^/]*?      # matches any number of characters excluding directory separators (non-greedy).
-     *  [*?]        # matches either a wildcard character (* or ?)
-     *  [^/]*       # matches any number of characters excluding directory separators (greedy).
-     *  \/          # matches a directory separator.
-     */
-    const watchRecursivePattern = /\/[^/]*?[*?][^/]*\//;
-
    /**
     * Matches the portion of a wildcard path that does not contain wildcards.
     * Matches \a of \a\*, or \a\b\c of \a\b\c\?\d.
@ -3217,6 +3183,20 @@ namespace ts {
        return matchesExcludeWorker(pathToCheck, validatedExcludeSpecs, useCaseSensitiveFileNames, currentDirectory, basePath);
    }

+    function invalidDotDotAfterRecursiveWildcard(s: string) {
+        // Reports whether some ".." segment comes after some recursive wildcard ("**/") segment.
+        // We used to use the regex /(^|\/)\*\*\/(.*\/)?\.\.($|\/)/ to check for this case, but
+        // in v8 that has polynomial performance: when several recursive wildcards are present,
+        // **/ can be matched at many positions, resulting in bad backtracking. We don't care
+        // which ones are matched, so we compare the first wildcard against the last "..".
+        const wildcardIndex = startsWith(s, "**/") ? 0 : s.indexOf("/**/"); // first "**/" segment, or -1
+        if (wildcardIndex === -1) {
+            return false; // no recursive wildcard segment at all
+        }
+        const lastDotIndex = endsWith(s, "/..") ? s.length : s.lastIndexOf("/../"); // last ".." segment, or -1
+        return lastDotIndex > wildcardIndex;
+    }
+
    /* @internal */
    export function matchesExclude(
        pathToCheck: string,
@ -3226,7 +3206,7 @@ namespace ts {
    ) {
        return matchesExcludeWorker(
            pathToCheck,
-            filter(excludeSpecs, spec => !invalidDotDotAfterRecursiveWildcardPattern.test(spec)),
+            filter(excludeSpecs, spec => !invalidDotDotAfterRecursiveWildcard(spec)),
            useCaseSensitiveFileNames,
            currentDirectory
        );
@ -3268,7 +3248,7 @@ namespace ts {
        if (disallowTrailingRecursion && invalidTrailingRecursionPattern.test(spec)) {
            return [Diagnostics.File_specification_cannot_end_in_a_recursive_directory_wildcard_Asterisk_Asterisk_Colon_0, spec];
        }
-        else if (invalidDotDotAfterRecursiveWildcardPattern.test(spec)) {
+        else if (invalidDotDotAfterRecursiveWildcard(spec)) {
            return [Diagnostics.File_specification_cannot_contain_a_parent_directory_that_appears_after_a_recursive_directory_wildcard_Asterisk_Asterisk_Colon_0, spec];
        }
    }
@ -3331,9 +3311,18 @@ namespace ts {
    function getWildcardDirectoryFromSpec(spec: string, useCaseSensitiveFileNames: boolean): { key: string, flags: WatchDirectoryFlags } | undefined {
        const match = wildcardDirectoryPattern.exec(spec);
        if (match) {
+            // We check this with a few `indexOf` calls because 3 `indexOf`/`lastIndexOf` calls are
+            // less algorithmically complex (roughly O(3n) worst-case) than the regex we used to use,
+            // \/[^/]*?[*?][^/]*\/ which was polynomial in v8, since arbitrary sequences of wildcard
+            // characters could match any of the central patterns, resulting in bad backtracking.
+            const questionWildcardIndex = spec.indexOf("?");
+            const starWildcardIndex = spec.indexOf("*");
+            const lastDirectorySeperatorIndex = spec.lastIndexOf(directorySeparator);
            return {
                key: useCaseSensitiveFileNames ? match[0] : toFileNameLowerCase(match[0]),
-                flags: watchRecursivePattern.test(spec) ? WatchDirectoryFlags.Recursive : WatchDirectoryFlags.None
+                flags: (questionWildcardIndex !== -1 && questionWildcardIndex < lastDirectorySeperatorIndex)
+                    || (starWildcardIndex !== -1 && starWildcardIndex < lastDirectorySeperatorIndex)
+                    ? WatchDirectoryFlags.Recursive : WatchDirectoryFlags.None
            };
        }
        if (isImplicitGlob(spec)) {
--- a/src/compiler/core.ts
+++ b/src/compiler/core.ts
@ -2035,11 +2035,51 @@ namespace ts {
     * Takes a string like "jquery-min.4.2.3" and returns "jquery"
     */
    export function removeMinAndVersionNumbers(fileName: string) {
-        // Match a "." or "-" followed by a version number or 'min' at the end of the name
-        const trailingMinOrVersion = /[.-]((min)|(\d+(\.\d+)*))$/;
+        // We used to use the regex /[.-]((min)|(\d+(\.\d+)*))$/ and would just .replace it twice.
+        // Unfortunately, that regex has O(n^2) performance because v8 doesn't match from the end of the string.
+        // Instead, we now scan the filename backwards ourselves, stripping each trailing [.-]min or [.-]<digits> suffix.

-        // The "min" or version may both be present, in either order, so try applying the above twice.
-        return fileName.replace(trailingMinOrVersion, "").replace(trailingMinOrVersion, "");
+        let end: number = fileName.length; // exclusive end of the part of fileName that is kept
+
+        for (let pos = end - 1; pos > 0; pos--) {
+            let ch: number = fileName.charCodeAt(pos);
+            if (ch >= CharacterCodes._0 && ch <= CharacterCodes._9) {
+                // Match a \d+ segment
+                do {
+                    --pos;
+                    ch = fileName.charCodeAt(pos);
+                } while (pos > 0 && ch >= CharacterCodes._0 && ch <= CharacterCodes._9);
+            }
+            else if (pos > 4 && (ch === CharacterCodes.n || ch === CharacterCodes.N)) { // NOTE(review): with pos > 4, "a.min" is left untouched (the old regex stripped it) - confirm intended
+                // Looking for "min" in any letter case ("min", "MIN", "Min", ...)
+                // Already matched the 'n'
+                --pos;
+                ch = fileName.charCodeAt(pos);
+                if (ch !== CharacterCodes.i && ch !== CharacterCodes.I) {
+                    break;
+                }
+                --pos;
+                ch = fileName.charCodeAt(pos);
+                if (ch !== CharacterCodes.m && ch !== CharacterCodes.M) {
+                    break;
+                }
+                --pos;
+                ch = fileName.charCodeAt(pos);
+            }
+            else {
+                // This character is not part of either suffix pattern
+                break;
+            }
+
+            if (ch !== CharacterCodes.minus && ch !== CharacterCodes.dot) { // a suffix only counts when preceded by "-" or "."
+                break;
+            }
+
+            end = pos; // cut at the separator; the loop's pos-- then steps to the character before it
+        }
+
+        // end might be fileName.length, in which case this should internally no-op
+        return end === fileName.length ? fileName : fileName.slice(0, end);
    }

    /** Remove an item from an array, moving everything to its right one space left. */
--- a/src/compiler/debug.ts
+++ b/src/compiler/debug.ts
@ -471,6 +471,9 @@ namespace ts {
                            // An `Array` with extra properties is rendered as `[A, B, prop1: 1, prop2: 2]`. Most of
                            // these aren't immediately useful so we trim off the `prop1: ..., prop2: ...` part from the
                            // formatted string.
+                            // This regex can trigger slow backtracking because of overlapping potential captures.
+                            // We don't care, this is debug code that's only enabled with a debugger attached -
+                            // we're just taking note of it for anyone checking regex performance in the future.
                            defaultValue = String(defaultValue).replace(/(?:,[\s\w\d_]+:[^,]+)+\]$/, "]");
                            return `NodeArray ${defaultValue}`;
                        }
--- a/src/compiler/parser.ts
+++ b/src/compiler/parser.ts
@ -9094,7 +9094,7 @@ namespace ts {
        if (namedArgRegExCache.has(name)) {
            return namedArgRegExCache.get(name)!;
        }
-        const result = new RegExp(`(\\s${name}\\s*=\\s*)('|")(.+?)\\2`, "im");
+        const result = new RegExp(`(\\s${name}\\s*=\\s*)(?:(?:'([^']*)')|(?:"([^"]*)"))`, "im");
        namedArgRegExCache.set(name, result);
        return result;
    }
@ -9118,16 +9118,17 @@ namespace ts {
                        return; // Missing required argument, don't parse
                    }
                    else if (matchResult) {
+                        const value = matchResult[2] || matchResult[3];
                        if (arg.captureSpan) {
-                            const startPos = range.pos + matchResult.index + matchResult[1].length + matchResult[2].length;
+                            const startPos = range.pos + matchResult.index + matchResult[1].length + 1;
                            argument[arg.name] = {
-                                value: matchResult[3],
+                                value,
                                pos: startPos,
-                                end: startPos + matchResult[3].length
+                                end: startPos + value.length
                            };
                        }
                        else {
-                            argument[arg.name] = matchResult[3];
+                            argument[arg.name] = value;
                        }
                    }
                }
@ -9145,7 +9146,7 @@ namespace ts {
        }

        if (range.kind === SyntaxKind.MultiLineCommentTrivia) {
-            const multiLinePragmaRegEx = /\s*@(\S+)\s*(.*)\s*$/gim; // Defined inline since it uses the "g" flag, which keeps a persistent index (for iterating)
+            const multiLinePragmaRegEx = /@(\S+)(\s+.*)?$/gim; // Defined inline since it uses the "g" flag, which keeps a persistent index (for iterating)
            let multiLineMatch: RegExpExecArray | null;
            while (multiLineMatch = multiLinePragmaRegEx.exec(text)) {
                addPragmaForMatch(pragmas, range, PragmaKindFlags.MultiLine, multiLineMatch);
@ -9170,7 +9171,7 @@ namespace ts {
    function getNamedPragmaArguments(pragma: PragmaDefinition, text: string | undefined): {[index: string]: string} | "fail" {
        if (!text) return {};
        if (!pragma.args) return {};
-        const args = text.split(/\s+/);
+        const args = trimString(text).split(/\s+/);
        const argMap: {[index: string]: string} = {};
        for (let i = 0; i < pragma.args.length; i++) {
            const argument = pragma.args[i];
--- a/src/compiler/program.ts
+++ b/src/compiler/program.ts
@ -406,7 +406,7 @@ namespace ts {
            const lineStart = getPositionOfLineAndCharacter(file, i, 0);
            const lineEnd = i < lastLineInFile ? getPositionOfLineAndCharacter(file, i + 1, 0) : file.text.length;
            let lineContent = file.text.slice(lineStart, lineEnd);
-            lineContent = lineContent.replace(/\s+$/g, "");  // trim from end
+            lineContent = trimStringEnd(lineContent);  // trim from end
            lineContent = lineContent.replace(/\t/g, " ");   // convert tabs to single spaces

            // Output the gutter and the actual contents of the line.
--- a/src/compiler/scanner.ts
+++ b/src/compiler/scanner.ts
--- a/src/compiler/semver.ts
+++ b/src/compiler/semver.ts
@ -204,7 +204,7 @@ namespace ts {
    // range-set    ::= range ( logical-or range ) *
    // range        ::= hyphen | simple ( ' ' simple ) * | ''
    // logical-or   ::= ( ' ' ) * '||' ( ' ' ) *
-    const logicalOrRegExp = /\s*\|\|\s*/g;
+    const logicalOrRegExp = /\|\|/g;
    const whitespaceRegExp = /\s+/g;

    // https://github.com/npm/node-semver#range-grammar
@ -230,20 +230,21 @@ namespace ts {
    // primitive    ::= ( '<' | '>' | '>=' | '<=' | '=' ) partial
    // tilde        ::= '~' partial
    // caret        ::= '^' partial
-    const rangeRegExp = /^\s*(~|\^|<|<=|>|>=|=)?\s*([a-z0-9-+.*]+)$/i;
+    const rangeRegExp = /^(~|\^|<|<=|>|>=|=)?\s*([a-z0-9-+.*]+)$/i;

    function parseRange(text: string) {
        const alternatives: Comparator[][] = [];
-        for (const range of text.trim().split(logicalOrRegExp)) {
+        for (let range of trimString(text).split(logicalOrRegExp)) {
            if (!range) continue;
            const comparators: Comparator[] = [];
+            range = trimString(range);
            const match = hyphenRegExp.exec(range);
            if (match) {
                if (!parseHyphen(match[1], match[2], comparators)) return undefined;
            }
            else {
                for (const simple of range.split(whitespaceRegExp)) {
-                    const match = rangeRegExp.exec(simple);
+                    const match = rangeRegExp.exec(trimString(simple));
                    if (!match || !parseComparator(match[1], match[2], comparators)) return undefined;
                }
            }
--- a/src/compiler/sourcemap.ts
+++ b/src/compiler/sourcemap.ts
@ -322,7 +322,7 @@ namespace ts {
    }

    // Sometimes tools can see the following line as a source mapping url comment, so we mangle it a bit (the [M])
-    const sourceMapCommentRegExp = /^\/\/[@#] source[M]appingURL=(.+)\s*$/;
+    const sourceMapCommentRegExp = /^\/\/[@#] source[M]appingURL=(.+)$/;
    const whitespaceOrMapCommentRegExp = /^\s*(\/\/[@#] .*)?$/;

    export interface LineInfo {
@ -345,7 +345,7 @@ namespace ts {
            const line = lineInfo.getLineText(index);
            const comment = sourceMapCommentRegExp.exec(line);
            if (comment) {
-                return comment[1];
+                return trimStringEnd(comment[1]);
            }
            // If we see a non-whitespace/map comment-like line, break, to avoid scanning up the entire file
            else if (!line.match(whitespaceOrMapCommentRegExp)) {
--- a/src/compiler/utilities.ts
+++ b/src/compiler/utilities.ts
@ -414,10 +414,10 @@ namespace ts {
            commentPos + 2 < commentEnd &&
            text.charCodeAt(commentPos + 2) === CharacterCodes.slash) {
            const textSubStr = text.substring(commentPos, commentEnd);
-            return textSubStr.match(fullTripleSlashReferencePathRegEx) ||
-                textSubStr.match(fullTripleSlashAMDReferencePathRegEx) ||
-                textSubStr.match(fullTripleSlashReferenceTypeReferenceDirectiveRegEx) ||
-                textSubStr.match(defaultLibReferenceRegEx) ?
+            return fullTripleSlashReferencePathRegEx.test(textSubStr) ||
+                fullTripleSlashAMDReferencePathRegEx.test(textSubStr) ||
+                fullTripleSlashReferenceTypeReferenceDirectiveRegEx.test(textSubStr) ||
+                defaultLibReferenceRegEx.test(textSubStr) ?
                true : false;
        }
        return false;
@ -517,12 +517,43 @@ namespace ts {

        if (isJSDocTypeExpressionOrChild(node)) {
            // strip space + asterisk at line start
-            text = text.replace(/(^|\r?\n|\r)\s*\*\s*/g, "$1");
+            text = text.split(/\r\n|\n|\r/).map(line => trimStringStart(line.replace(/^\s*\*/, ""))).join("\n");
        }

        return text;
    }

+    /**
+     * Removes the leading and trailing white space and line terminator characters from a string. Uses the native String.prototype.trim when available; otherwise trimStringEnd(trimStringStart(s)).
+     */
+    export const trimString = !!String.prototype.trim ? ((s: string) => s.trim()) : (s: string) => trimStringEnd(trimStringStart(s));
+
+    /**
+     * Returns a copy with trailing whitespace removed. Uses the native trimEnd when available; otherwise falls back to trimEndImpl.
+     */
+    export const trimStringEnd = !!String.prototype.trimEnd ? ((s: string) => s.trimEnd()) : trimEndImpl;
+
+
+    /**
+     * Returns a copy with leading whitespace removed. Uses the native trimStart when available; otherwise a /^\s+/ replace.
+     */
+     export const trimStringStart = !!String.prototype.trimStart ? ((s: string) => s.trimStart()) : (s: string) => s.replace(/^\s+/g, "");
+
+    /**
+     * https://jsbench.me/gjkoxld4au/1
+     * The simple regex for this, /\s+$/g, is O(n^2) in v8.
+     * The native .trimEnd method is by far the best, but since that's technically ES2019,
+     * we provide a (still much faster than the simple regex) fallback that scans from the end.
+     */
+    function trimEndImpl(s: string) {
+        let end = s.length - 1; // index of the last character still kept
+        while (end >= 0) {
+            if (!isWhiteSpaceLike(s.charCodeAt(end))) break; // stop at the last non-whitespace character
+            end--;
+        }
+        return s.slice(0, end + 1); // end is -1 when s is empty or all whitespace, giving ""
+    }
+
    export function getTextOfNode(node: Node, includeTrivia = false): string {
        return getSourceTextOfNodeFromSourceFile(getSourceFileOfNode(node), node, includeTrivia); // delegates, using the source file looked up from the node itself
    }
@ -1226,10 +1257,10 @@ namespace ts {
            text.charCodeAt(comment.pos + 3) !== CharacterCodes.slash);
    }

-    export const fullTripleSlashReferencePathRegEx = /^(\/\/\/\s*<reference\s+path\s*=\s*)('|")(.+?)\2.*?\/>/;
-    const fullTripleSlashReferenceTypeReferenceDirectiveRegEx = /^(\/\/\/\s*<reference\s+types\s*=\s*)('|")(.+?)\2.*?\/>/;
-    export const fullTripleSlashAMDReferencePathRegEx = /^(\/\/\/\s*<amd-dependency\s+path\s*=\s*)('|")(.+?)\2.*?\/>/;
-    const defaultLibReferenceRegEx = /^(\/\/\/\s*<reference\s+no-default-lib\s*=\s*)('|")(.+?)\2\s*\/>/;
+    export const fullTripleSlashReferencePathRegEx = /^(\/\/\/\s*<reference\s+path\s*=\s*)(('[^']*')|("[^"]*")).*?\/>/; // group 2 is the quoted value including its quotes; no backreference, and the value may be empty
+    const fullTripleSlashReferenceTypeReferenceDirectiveRegEx = /^(\/\/\/\s*<reference\s+types\s*=\s*)(('[^']*')|("[^"]*")).*?\/>/;
+    export const fullTripleSlashAMDReferencePathRegEx = /^(\/\/\/\s*<amd-dependency\s+path\s*=\s*)(('[^']*')|("[^"]*")).*?\/>/;
+    const defaultLibReferenceRegEx = /^(\/\/\/\s*<reference\s+no-default-lib\s*=\s*)(('[^']*')|("[^"]*"))\s*\/>/; // unlike the three above, only whitespace may follow the quoted value

    export function isPartOfTypeNode(node: Node): boolean {
        if (SyntaxKind.FirstTypeNode <= node.kind && node.kind <= SyntaxKind.LastTypeNode) {
@ -4630,7 +4661,7 @@ namespace ts {

    function writeTrimmedCurrentLine(text: string, commentEnd: number, writer: EmitTextWriter, newLine: string, pos: number, nextLineStart: number) {
        const end = Math.min(commentEnd, nextLineStart - 1);
-        const currentLineText = text.substring(pos, end).replace(/^\s+|\s+$/g, "");
+        const currentLineText = trimString(text.substring(pos, end));
        if (currentLineText) {
            // trimmed forward and ending spaces text
            writer.writeComment(currentLineText);
--- a/src/services/classifier.ts
+++ b/src/services/classifier.ts
@ -806,7 +806,8 @@ namespace ts {

        function tryClassifyTripleSlashComment(start: number, width: number): boolean {
            const tripleSlashXMLCommentRegEx = /^(\/\/\/\s*)(<)(?:(\S+)((?:[^/]|\/[^>])*)(\/>)?)?/im;
-            const attributeRegex = /(\S+)(\s*)(=)(\s*)('[^']+'|"[^"]+")/img;
+            // Require a leading whitespace character (the parser already does) to prevent terrible backtracking performance
+            const attributeRegex = /(\s)(\S+)(\s*)(=)(\s*)('[^']+'|"[^"]+")/img;

            const text = sourceFile.text.substr(start, width);
            const match = tripleSlashXMLCommentRegEx.exec(text);
@ -842,30 +843,30 @@ namespace ts {
                    break;
                }

-                const newAttrPos = pos + attrMatch.index;
+                const newAttrPos = pos + attrMatch.index + attrMatch[1].length; // whitespace
                if (newAttrPos > attrPos) {
                    pushCommentRange(attrPos, newAttrPos - attrPos);
                    attrPos = newAttrPos;
                }

-                pushClassification(attrPos, attrMatch[1].length, ClassificationType.jsxAttribute); // attribute name
-                attrPos += attrMatch[1].length;
+                pushClassification(attrPos, attrMatch[2].length, ClassificationType.jsxAttribute); // attribute name
+                attrPos += attrMatch[2].length;

-                if (attrMatch[2].length) {
-                    pushCommentRange(attrPos, attrMatch[2].length); // whitespace
-                    attrPos += attrMatch[2].length;
+                if (attrMatch[3].length) {
+                    pushCommentRange(attrPos, attrMatch[3].length); // whitespace
+                    attrPos += attrMatch[3].length;
                }

-                pushClassification(attrPos, attrMatch[3].length, ClassificationType.operator); // =
-                attrPos += attrMatch[3].length;
+                pushClassification(attrPos, attrMatch[4].length, ClassificationType.operator); // =
+                attrPos += attrMatch[4].length;

-                if (attrMatch[4].length) {
-                    pushCommentRange(attrPos, attrMatch[4].length); // whitespace
-                    attrPos += attrMatch[4].length;
+                if (attrMatch[5].length) {
+                    pushCommentRange(attrPos, attrMatch[5].length); // whitespace
+                    attrPos += attrMatch[5].length;
                }

-                pushClassification(attrPos, attrMatch[5].length, ClassificationType.jsxAttributeStringLiteralValue); // attribute value
-                attrPos += attrMatch[5].length;
+                pushClassification(attrPos, attrMatch[6].length, ClassificationType.jsxAttributeStringLiteralValue); // attribute value
+                attrPos += attrMatch[6].length;
            }

            pos += match[4].length;
--- a/src/services/outliningElementsCollector.ts
+++ b/src/services/outliningElementsCollector.ts
@ -94,8 +94,15 @@ namespace ts.OutliningElementsCollector {
        }
    }

-    const regionDelimiterRegExp = /^\s*\/\/\s*#(end)?region(?:\s+(.*))?(?:\r)?$/;
+    const regionDelimiterRegExp = /^#(end)?region(?:\s+(.*))?(?:\r)?$/; // applied to the text that follows the leading "//"
    function isRegionDelimiter(lineText: string) {
+        // We strip the leading whitespace and the "//" by hand instead of in the regex, since the
+        // multiple potential whitespace matches can make for some gnarly backtracking behavior.
+        lineText = trimStringStart(lineText);
+        if (!startsWith(lineText, "\/\/")) {
+            return null; // eslint-disable-line no-null/no-null
+        }
+        lineText = trimString(lineText.slice(2)); // drop the "//", then trim both ends before matching
        return regionDelimiterRegExp.exec(lineText);
    }