search: intelligently normalize crlf in regex search

Fixes https://github.com/microsoft/vscode/issues/100569
This commit is contained in:
Connor Peet 2020-11-13 15:11:37 -08:00
parent 7dde16206f
commit 5afc5cd160
No known key found for this signature in database
GPG key ID: CF8FD2EA0DBC61BD
4 changed files with 100 additions and 6 deletions

View file

@ -67,6 +67,7 @@
"vscode-nsfw": "1.2.9",
"vscode-oniguruma": "1.3.1",
"vscode-proxy-agent": "^0.5.2",
"vscode-regexpp": "^3.1.0",
"vscode-ripgrep": "^1.11.0",
"vscode-sqlite3": "4.0.10",
"vscode-textmate": "5.2.0",
@ -194,4 +195,4 @@
"windows-mutex": "0.3.0",
"windows-process-tree": "0.2.4"
}
}
}

View file

@ -16,6 +16,7 @@ import { URI } from 'vs/base/common/uri';
import { Progress } from 'vs/platform/progress/common/progress';
import { IExtendedExtensionSearchOptions, SearchError, SearchErrorCode, serializeSearchError } from 'vs/workbench/services/search/common/search';
import { Range, TextSearchComplete, TextSearchContext, TextSearchMatch, TextSearchOptions, TextSearchPreviewOptions, TextSearchQuery, TextSearchResult } from 'vs/workbench/services/search/common/searchExtTypes';
import { RegExpParser, RegExpVisitor, AST as ReAST } from 'vscode-regexpp';
import { rgPath } from 'vscode-ripgrep';
import { anchorGlob, createTextSearchResult, IOutputChannel, Maybe } from './ripgrepSearchUtils';
@ -541,10 +542,78 @@ export interface IRgSubmatch {
/**
 * A value that is carried either in a `bytes` field or in a `text` field, never both.
 * NOTE(review): presumably `bytes` is used when the data is not representable as text — confirm
 * against the producer of these objects, it is not visible here.
 */
export type IRgBytesOrText = { bytes: string } | { text: string };
/** Tells whether the given regex AST node is a lookbehind assertion. */
const isLookBehind = (node: ReAST.Node): boolean => {
	if (node.type !== 'Assertion') {
		return false;
	}

	return node.kind === 'lookbehind';
};
/**
 * Rewrites `\n` escapes in a regex source so that they also accept a preceding `\r`.
 *
 * The pattern is parsed into an AST and every `\n` character is handled according to
 * the construct it sits in:
 *  - plain character:              `\n`       -> `\r?\n`
 *  - directly quantified:          `\n+`      -> `(?:\r?\n)+`
 *  - inside a positive class:      `[a-z\n]`  -> `(?:[a-z]|\r?\n)`
 *  - inside a negated class, a class range or a lookbehind: left untouched
 *
 * @param pattern The regex source to rewrite.
 * @returns The rewritten source, or `pattern` unchanged when it cannot be parsed.
 */
export function fixRegexNewline(pattern: string): string {
	// The pattern is parsed anew on each call; nothing is cached between calls.
	let re: ReAST.Pattern;
	try {
		re = new RegExpParser().parsePattern(pattern);
	} catch {
		// Unparseable input is handed back as-is rather than failing here.
		return pattern;
	}

	let output = '';
	let lastEmittedIndex = 0;
	const replace = (start: number, end: number, text: string) => {
		if (start < lastEmittedIndex) {
			// This span was already rewritten (e.g. a second `\n` in the same character
			// class); emitting it again would duplicate the replacement text.
			return;
		}

		output += pattern.slice(lastEmittedIndex, start) + text;
		lastEmittedIndex = end;
	};

	// Stack of the enclosing nodes we care about, innermost first (index 0).
	const context: ReAST.Node[] = [];
	const visitor = new RegExpVisitor({
		onCharacterEnter(char) {
			if (char.raw !== '\\n') {
				return;
			}

			const parent = context[0];
			if (!parent) {
				// simple char, \n -> \r?\n
				replace(char.start, char.end, '\\r?\\n');
			} else if (context.some(isLookBehind)) {
				// no-op in a lookbehind, see #100569
			} else if (parent.type === 'CharacterClass') {
				if (parent.negate) {
					// [^a-z\n] must keep rejecting newlines; turning it into an alternation
					// with \r?\n would invert its meaning, so leave it untouched.
					return;
				}

				// in a bracket expr, [a-z\n] -> (?:[a-z]|\r?\n)
				const otherContent = pattern.slice(parent.start + 1, char.start) + pattern.slice(char.end, parent.end - 1);
				replace(parent.start, parent.end, otherContent === '' ? '\\r?\\n' : `(?:[${otherContent}]|\\r?\\n)`);
			} else if (parent.type === 'Quantifier') {
				// wrap so the quantifier applies to the whole \r?\n sequence
				replace(char.start, char.end, '(?:\\r?\\n)');
			}
			// Any other parent (e.g. a character class range such as [\n-a]) is left as-is.
		},
		onQuantifierEnter(node) {
			context.unshift(node);
		},
		onQuantifierLeave() {
			context.shift();
		},
		onCharacterClassRangeEnter(node) {
			context.unshift(node);
		},
		onCharacterClassRangeLeave() {
			context.shift();
		},
		onCharacterClassEnter(node) {
			context.unshift(node);
		},
		onCharacterClassLeave() {
			context.shift();
		},
		onAssertionEnter(node) {
			if (isLookBehind(node)) {
				// Put it on the same end of the stack that onAssertionLeave pops from,
				// so a lookbehind nested in a quantifier or in another lookbehind is
				// removed again instead of lingering for the rest of the pattern.
				context.unshift(node);
			}
		},
		onAssertionLeave(node) {
			// Only lookbehinds are ever added, so other assertions never match here.
			if (context[0] === node) {
				context.shift();
			}
		},
	});

	visitor.visit(re);
	output += pattern.slice(lastEmittedIndex);
	return output;
}
export function fixNewline(pattern: string): string {

View file

@ -28,7 +28,24 @@ suite('RipgrepTextSearchEngine', () => {
assert.equal(unicodeEscapesToPCRE2(''), '');
});
test('fixRegexNewline', () => {
test('fixRegexNewline - src', () => {
const ttable = [
['foo', 'foo'],
['invalid(', 'invalid('],
['fo\\no', 'fo\\r?\\no'],
['f\\no\\no', 'f\\r?\\no\\r?\\no'],
['f[a-z\\n1]', 'f(?:[a-z1]|\\r?\\n)'],
['f[\\n-a]', 'f[\\n-a]'],
['(?<=\\n)\\w', '(?<=\\n)\\w'],
['fo\\n+o', 'fo(?:\\r?\\n)+o'],
];
for (const [input, expected] of ttable) {
assert.equal(fixRegexNewline(input), expected, `${input} -> ${expected}`);
}
});
test('fixRegexNewline - re', () => {
function testFixRegexNewline([inputReg, testStr, shouldMatch]: readonly [string, string, boolean]): void {
const fixed = fixRegexNewline(inputReg);
const reg = new RegExp(fixed);
@ -48,10 +65,12 @@ suite('RipgrepTextSearchEngine', () => {
['foo\\n+abc', 'foo\r\nabc', true],
['foo\\n+abc', 'foo\n\n\nabc', true],
['foo\\n+abc', 'foo\r\n\r\n\r\nabc', true],
['foo[\\n-9]+abc', 'foo1abc', true],
] as const).forEach(testFixRegexNewline);
});
test('fixNewline', () => {
test('fixNewline - matching', () => {
function testFixNewline([inputReg, testStr, shouldMatch = true]: readonly [string, string, boolean?]): void {
const fixed = fixNewline(inputReg);
const reg = new RegExp(fixed);

View file

@ -9735,6 +9735,11 @@ vscode-proxy-agent@^0.5.2:
https-proxy-agent "^2.2.3"
socks-proxy-agent "^4.0.1"
vscode-regexpp@^3.1.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/vscode-regexpp/-/vscode-regexpp-3.1.0.tgz#42d059b6fffe99bd42939c0d013f632f0cad823f"
integrity sha512-pqtN65VC1jRLawfluX4Y80MMG0DHJydWhe5ZwMHewZD6sys4LbU6lHwFAHxeuaVE6Y6+xZOtAw+9hvq7/0ejkg==
vscode-ripgrep@^1.11.0:
version "1.11.0"
resolved "https://registry.yarnpkg.com/vscode-ripgrep/-/vscode-ripgrep-1.11.0.tgz#2874adea1753545590a315f02f36bed05b9e2380"