From 5afc5cd1601dced3413ffa9a420daa12c80a30aa Mon Sep 17 00:00:00 2001 From: Connor Peet Date: Fri, 13 Nov 2020 15:11:37 -0800 Subject: [PATCH] search: intelligently normalize crlf in regex search Fixes https://github.com/microsoft/vscode/issues/100569 --- package.json | 3 +- .../search/node/ripgrepTextSearchEngine.ts | 75 ++++++++++++++++++- .../test/node/ripgrepTextSearchEngine.test.ts | 23 +++++- yarn.lock | 5 ++ 4 files changed, 100 insertions(+), 6 deletions(-) diff --git a/package.json b/package.json index 8c5cac69565..a893846b90e 100644 --- a/package.json +++ b/package.json @@ -67,6 +67,7 @@ "vscode-nsfw": "1.2.9", "vscode-oniguruma": "1.3.1", "vscode-proxy-agent": "^0.5.2", + "vscode-regexpp": "^3.1.0", "vscode-ripgrep": "^1.11.0", "vscode-sqlite3": "4.0.10", "vscode-textmate": "5.2.0", @@ -194,4 +195,4 @@ "windows-mutex": "0.3.0", "windows-process-tree": "0.2.4" } -} \ No newline at end of file +} diff --git a/src/vs/workbench/services/search/node/ripgrepTextSearchEngine.ts b/src/vs/workbench/services/search/node/ripgrepTextSearchEngine.ts index a22fd8ba3b9..53c6ae8fd67 100644 --- a/src/vs/workbench/services/search/node/ripgrepTextSearchEngine.ts +++ b/src/vs/workbench/services/search/node/ripgrepTextSearchEngine.ts @@ -16,6 +16,7 @@ import { URI } from 'vs/base/common/uri'; import { Progress } from 'vs/platform/progress/common/progress'; import { IExtendedExtensionSearchOptions, SearchError, SearchErrorCode, serializeSearchError } from 'vs/workbench/services/search/common/search'; import { Range, TextSearchComplete, TextSearchContext, TextSearchMatch, TextSearchOptions, TextSearchPreviewOptions, TextSearchQuery, TextSearchResult } from 'vs/workbench/services/search/common/searchExtTypes'; +import { RegExpParser, RegExpVisitor, AST as ReAST } from 'vscode-regexpp'; import { rgPath } from 'vscode-ripgrep'; import { anchorGlob, createTextSearchResult, IOutputChannel, Maybe } from './ripgrepSearchUtils'; @@ -541,10 +542,78 @@ export interface IRgSubmatch { 
export type IRgBytesOrText = { bytes: string } | { text: string }; +const isLookBehind = (node: ReAST.Node) => node.type === 'Assertion' && node.kind === 'lookbehind'; + export function fixRegexNewline(pattern: string): string { - // Replace an unescaped $ at the end of the pattern with \r?$ - // Match $ preceded by none or even number of literal \ - return pattern.replace(/(?<=[^\\]|^)(\\\\)*\\n/g, '$1\\r?\\n'); + // we parse the pattern anew each time + let re: ReAST.Pattern; + try { + re = new RegExpParser().parsePattern(pattern); + } catch { + return pattern; + } + + let output = ''; + let lastEmittedIndex = 0; + const replace = (start: number, end: number, text: string) => { + output += pattern.slice(lastEmittedIndex, start) + text; + lastEmittedIndex = end; + }; + + const context: ReAST.Node[] = []; + const visitor = new RegExpVisitor({ + onCharacterEnter(char) { + if (char.raw !== '\\n') { + return; + } + + const parent = context[0]; + if (!parent) { + // simple char, \n -> \r?\n + replace(char.start, char.end, '\\r?\\n'); + } else if (context.some(isLookBehind)) { + // no-op in a lookbehind, see #100569 + } else if (parent.type === 'CharacterClass') { + // in a bracket expr, [a-z\n] -> (?:[a-z]|\r?\n) + const otherContent = pattern.slice(parent.start + 1, char.start) + pattern.slice(char.end, parent.end - 1); + replace(parent.start, parent.end, otherContent === '' ? 
'\\r?\\n' : `(?:[${otherContent}]|\\r?\\n)`); + } else if (parent.type === 'Quantifier') { + replace(char.start, char.end, '(?:\\r?\\n)'); + } + }, + onQuantifierEnter(node) { + context.unshift(node); + }, + onQuantifierLeave() { + context.shift(); + }, + onCharacterClassRangeEnter(node) { + context.unshift(node); + }, + onCharacterClassRangeLeave() { + context.shift(); + }, + onCharacterClassEnter(node) { + context.unshift(node); + }, + onCharacterClassLeave() { + context.shift(); + }, + onAssertionEnter(node) { + if (isLookBehind(node)) { + context.push(node); + } + }, + onAssertionLeave(node) { + if (context[0] === node) { + context.shift(); + } + }, + }); + + visitor.visit(re); + output += pattern.slice(lastEmittedIndex); + return output; } export function fixNewline(pattern: string): string { diff --git a/src/vs/workbench/services/search/test/node/ripgrepTextSearchEngine.test.ts b/src/vs/workbench/services/search/test/node/ripgrepTextSearchEngine.test.ts index 6c7d8296a2d..9e15c6dc9bb 100644 --- a/src/vs/workbench/services/search/test/node/ripgrepTextSearchEngine.test.ts +++ b/src/vs/workbench/services/search/test/node/ripgrepTextSearchEngine.test.ts @@ -28,7 +28,24 @@ suite('RipgrepTextSearchEngine', () => { assert.equal(unicodeEscapesToPCRE2(''), ''); }); - test('fixRegexNewline', () => { + test('fixRegexNewline - src', () => { + const ttable = [ + ['foo', 'foo'], + ['invalid(', 'invalid('], + ['fo\\no', 'fo\\r?\\no'], + ['f\\no\\no', 'f\\r?\\no\\r?\\no'], + ['f[a-z\\n1]', 'f(?:[a-z1]|\\r?\\n)'], + ['f[\\n-a]', 'f[\\n-a]'], + ['(?<=\\n)\\w', '(?<=\\n)\\w'], + ['fo\\n+o', 'fo(?:\\r?\\n)+o'], + ]; + + for (const [input, expected] of ttable) { + assert.equal(fixRegexNewline(input), expected, `${input} -> ${expected}`); + } + }); + + test('fixRegexNewline - re', () => { function testFixRegexNewline([inputReg, testStr, shouldMatch]: readonly [string, string, boolean]): void { const fixed = fixRegexNewline(inputReg); const reg = new RegExp(fixed); @@ -48,10 +65,12 
@@ suite('RipgrepTextSearchEngine', () => { ['foo\\n+abc', 'foo\r\nabc', true], ['foo\\n+abc', 'foo\n\n\nabc', true], + ['foo\\n+abc', 'foo\r\n\r\n\r\nabc', true], + ['foo[\\n-9]+abc', 'foo1abc', true], ] as const).forEach(testFixRegexNewline); }); - test('fixNewline', () => { + test('fixNewline - matching', () => { function testFixNewline([inputReg, testStr, shouldMatch = true]: readonly [string, string, boolean?]): void { const fixed = fixNewline(inputReg); const reg = new RegExp(fixed); diff --git a/yarn.lock b/yarn.lock index f3c63a68fd6..9e4f8f930c9 100644 --- a/yarn.lock +++ b/yarn.lock @@ -9735,6 +9735,11 @@ vscode-proxy-agent@^0.5.2: https-proxy-agent "^2.2.3" socks-proxy-agent "^4.0.1" +vscode-regexpp@^3.1.0: + version "3.1.0" + resolved "https://registry.yarnpkg.com/vscode-regexpp/-/vscode-regexpp-3.1.0.tgz#42d059b6fffe99bd42939c0d013f632f0cad823f" + integrity sha512-pqtN65VC1jRLawfluX4Y80MMG0DHJydWhe5ZwMHewZD6sys4LbU6lHwFAHxeuaVE6Y6+xZOtAw+9hvq7/0ejkg== + vscode-ripgrep@^1.11.0: version "1.11.0" resolved "https://registry.yarnpkg.com/vscode-ripgrep/-/vscode-ripgrep-1.11.0.tgz#2874adea1753545590a315f02f36bed05b9e2380"