Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Improve Recovery of Unterminated Regular Expressions #58289

Merged
merged 11 commits into from
May 24, 2024
7 changes: 1 addition & 6 deletions src/compiler/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ import {
DeleteExpression,
Diagnostic,
DiagnosticArguments,
DiagnosticCategory,
DiagnosticMessage,
Diagnostics,
DiagnosticWithDetachedLocation,
Expand Down Expand Up @@ -2144,11 +2143,7 @@ namespace Parser {
// Don't report another error if it would just be at the same position as the last error.
const lastError = lastOrUndefined(parseDiagnostics);
let result: DiagnosticWithDetachedLocation | undefined;
if (message.category === DiagnosticCategory.Message && lastError && start === lastError.start && length === lastError.length) {
result = createDetachedDiagnostic(fileName, sourceText, start, length, message, ...args);
addRelatedInfo(lastError, result);
}
else if (!lastError || start !== lastError.start) {
if (!lastError || start !== lastError.start) {
result = createDetachedDiagnostic(fileName, sourceText, start, length, message, ...args);
parseDiagnostics.push(result);
}
Expand Down
143 changes: 88 additions & 55 deletions src/compiler/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import {
LanguageFeatureMinimumTarget,
LanguageVariant,
last,
lastOrUndefined,
LineAndCharacter,
MapLike,
parsePseudoBigInt,
Expand Down Expand Up @@ -2391,7 +2392,8 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
function reScanSlashToken(reportErrors?: boolean): SyntaxKind {
if (token === SyntaxKind.SlashToken || token === SyntaxKind.SlashEqualsToken) {
// Quickly get to the end of regex such that we know the flags
let p = tokenStart + 1;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is eliminated for readability

const startOfRegExpBody = tokenStart + 1;
pos = startOfRegExpBody;
let inEscape = false;
// Although nested character classes are allowed in Unicode Sets mode,
// an unescaped slash is nevertheless invalid even in a character class in Unicode mode.
Expand All @@ -2403,16 +2405,14 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
while (true) {
// If we reach the end of a file, or hit a newline, then this is an unterminated
// regex. Report error and return what we have so far.
if (p >= end) {
if (pos >= end) {
tokenFlags |= TokenFlags.Unterminated;
error(Diagnostics.Unterminated_regular_expression_literal);
break;
}

const ch = text.charCodeAt(p);
const ch = text.charCodeAt(pos);
if (isLineBreak(ch)) {
tokenFlags |= TokenFlags.Unterminated;
error(Diagnostics.Unterminated_regular_expression_literal);
break;
}

Expand All @@ -2424,7 +2424,6 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
else if (ch === CharacterCodes.slash && !inCharacterClass) {
// A slash within a character class is permissible,
// but in general it signals the end of the regexp literal.
p++;
break;
}
else if (ch === CharacterCodes.openBracket) {
Expand All @@ -2436,58 +2435,95 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
else if (ch === CharacterCodes.closeBracket) {
inCharacterClass = false;
}
p++;
pos++;
}
const isUnterminated = !!(tokenFlags & TokenFlags.Unterminated);
const endOfBody = p - (isUnterminated ? 0 : 1);
let regExpFlags = RegularExpressionFlags.None;
while (p < end) {
const ch = text.charCodeAt(p);
if (!isIdentifierPart(ch, languageVersion)) {
break;
}
if (reportErrors) {
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, p, 1);
const endOfRegExpBody = pos;
if (tokenFlags & TokenFlags.Unterminated) {
// Search for the nearest unbalanced bracket for better recovery. Since the expression is
// invalid anyways, we take nested square brackets into consideration for the best guess.
pos = startOfRegExpBody;
inEscape = false;
let characterClassDepth = 0;
const bracketStack: CharacterCodes[] = [];
while (pos < endOfRegExpBody) {
const ch = text.charCodeAt(pos);
if (inEscape) {
inEscape = false;
}
else if (regExpFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, p, 1);
else if (ch === CharacterCodes.backslash) {
inEscape = true;
}
else if (((regExpFlags | flag) & RegularExpressionFlags.UnicodeMode) === RegularExpressionFlags.UnicodeMode) {
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, p, 1);
else if (ch === CharacterCodes.openBracket) {
characterClassDepth++;
}
else {
regExpFlags |= flag;
checkRegularExpressionFlagAvailable(flag, p);
else if (ch === CharacterCodes.closeBracket && characterClassDepth) {
characterClassDepth--;
}
else if (!characterClassDepth) {
if (ch === CharacterCodes.openParen) {
bracketStack.push(CharacterCodes.closeParen);
}
else if (ch === CharacterCodes.openBrace) {
bracketStack.push(CharacterCodes.closeBrace);
}
else if (ch === lastOrUndefined(bracketStack)) {
bracketStack.pop();
}
else if (ch === CharacterCodes.closeParen || ch === CharacterCodes.closeBracket || ch === CharacterCodes.closeBrace) {
// We encountered an unbalanced bracket outside a character class. Treat this position as the end of regex.
break;
}
}
pos++;
}
p++;
}
if (reportErrors) {
pos = tokenStart + 1;
const saveTokenPos = tokenStart;
const saveTokenFlags = tokenFlags;
scanRegularExpressionWorker(text, endOfBody, regExpFlags, isUnterminated, /*annexB*/ true);
if (!isUnterminated) {
pos = p;
}
tokenStart = saveTokenPos;
tokenFlags = saveTokenFlags;
// Whitespaces and semicolons at the end are not likely to be part of the regex
while (isWhiteSpaceLike(text.charCodeAt(pos - 1)) || text.charCodeAt(pos - 1) === CharacterCodes.semicolon) pos--;
error(Diagnostics.Unterminated_regular_expression_literal, tokenStart, pos - tokenStart);
}
else {
pos = p;
// Consume the slash character
pos++;
let regExpFlags = RegularExpressionFlags.None;
while (pos < end) {
const ch = codePointAt(text, pos);
if (!isIdentifierPart(ch, languageVersion)) {
break;
}
const size = charSize(ch);
if (reportErrors) {
const flag = characterToRegularExpressionFlag(utf16EncodeAsString(ch));
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, pos, size);
}
else if (regExpFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
}
else if (((regExpFlags | flag) & RegularExpressionFlags.UnicodeMode) === RegularExpressionFlags.UnicodeMode) {
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, pos, size);
}
else {
regExpFlags |= flag;
checkRegularExpressionFlagAvailability(flag);
}
}
pos += size;
}
if (reportErrors) {
scanRange(startOfRegExpBody, endOfRegExpBody - startOfRegExpBody, () => {
scanRegularExpressionWorker(regExpFlags, /*annexB*/ true);
});
}
}
tokenValue = text.substring(tokenStart, pos);
token = SyntaxKind.RegularExpressionLiteral;
}
return token;

function scanRegularExpressionWorker(text: string, end: number, regExpFlags: RegularExpressionFlags, isUnterminated: boolean, annexB: boolean) {
/** Grammar parameter */
const unicodeSetsMode = !!(regExpFlags & RegularExpressionFlags.UnicodeSets);
function scanRegularExpressionWorker(regExpFlags: RegularExpressionFlags, annexB: boolean) {
/** Grammar parameter */
const unicodeMode = !!(regExpFlags & RegularExpressionFlags.UnicodeMode);
/** Grammar parameter */
const unicodeSetsMode = !!(regExpFlags & RegularExpressionFlags.UnicodeSets);

if (unicodeMode) {
// Annex B treats any unicode mode as the strict syntax.
Expand All @@ -2497,7 +2533,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
/** @see {scanClassSetExpression} */
let mayContainStrings = false;

/** The number of numeric (anonymous) capturing groups defined in the regex. */
/** The number of all (named and unnamed) capturing groups defined in the regex. */
let numberOfCapturingGroups = 0;
/** All named capturing groups defined in the regex. */
const groupSpecifiers = new Set<string>();
Expand Down Expand Up @@ -2698,10 +2734,6 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
// falls through
case CharacterCodes.closeBracket:
case CharacterCodes.closeBrace:
if (isUnterminated && !isInGroup) {
// Assume what starting from the character to be outside of the regex
return;
}
if (unicodeMode || ch === CharacterCodes.closeParen) {
error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, pos, 1, String.fromCharCode(ch));
}
Expand All @@ -2721,25 +2753,26 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean

function scanPatternModifiers(currFlags: RegularExpressionFlags): RegularExpressionFlags {
while (pos < end) {
const ch = text.charCodeAt(pos);
const ch = codePointAt(text, pos);
if (!isIdentifierPart(ch, languageVersion)) {
break;
}
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
const size = charSize(ch);
const flag = characterToRegularExpressionFlag(utf16EncodeAsString(ch));
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, pos, 1);
error(Diagnostics.Unknown_regular_expression_flag, pos, size);
}
else if (currFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, pos, 1);
error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
}
else if (!(flag & RegularExpressionFlags.Modifiers)) {
error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, 1);
error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, size);
}
else {
currFlags |= flag;
checkRegularExpressionFlagAvailable(flag, pos);
checkRegularExpressionFlagAvailability(flag);
}
pos++;
pos += size;
}
return currFlags;
}
Expand Down Expand Up @@ -3439,7 +3472,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
});
}

function checkRegularExpressionFlagAvailable(flag: RegularExpressionFlags, pos: number) {
function checkRegularExpressionFlagAvailability(flag: RegularExpressionFlags) {
const availableFrom = regExpFlagToFirstAvailableLanguageVersion.get(flag) as ScriptTarget | undefined;
if (availableFrom && languageVersion < availableFrom) {
error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, 1, getNameOfScriptTarget(availableFrom));
Expand Down
1 change: 1 addition & 0 deletions src/testRunner/tests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import "./unittests/paths";
import "./unittests/printer";
import "./unittests/programApi";
import "./unittests/publicApi";
import "./unittests/regExpParserRecovery";
import "./unittests/reuseProgramStructure";
import "./unittests/semver";
import "./unittests/transform";
Expand Down
2 changes: 1 addition & 1 deletion src/testRunner/unittests/incrementalParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ describe("unittests:: Incremental Parser", () => {
const oldText = ts.ScriptSnapshot.fromString(source);
const newTextAndChange = withInsert(oldText, semicolonIndex, "/");

compareTrees(oldText, newTextAndChange.text, newTextAndChange.textChangeRange, 0);
compareTrees(oldText, newTextAndChange.text, newTextAndChange.textChangeRange, 4);
});

it("Regular expression 2", () => {
Expand Down
81 changes: 81 additions & 0 deletions src/testRunner/unittests/regExpParserRecovery.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import * as ts from "../_namespaces/ts";

describe("unittests:: regExpParserRecovery", () => {
const testCases = [
"/",
"/[]",
"/{}",
"/()",
"/foo",
"/foo[]",
"/foo{}",
"/foo()",
"/[]foo",
"/{}foo",
"/()foo",
"/{[]}",
"/([])",
"/[)}({]",
"/({[)}]})",
"/\\[",
"/\\{",
"/\\(",
"/[\\[]",
"/(\\[)",
"/{\\[}",
"/[\\(]",
"/(\\()",
"/{\\(}",
"/[\\{]",
"/(\\{)",
"/{\\{}",
"/\\{(\\[\\([{])",
"/\\]",
"/\\}",
"/\\)",
"/[\\]]",
"/(\\])",
"/{\\]}",
"/[\\)]",
"/(\\))",
"/{\\)}",
"/[\\}]",
"/(\\})",
"/{\\}}",
"/({[\\]})]})",
];
const whiteSpaceSequences = [
"",
" ",
"\t\v\r\n",
"\u3000\u2028",
];
it("stops parsing unterminated regexes at correct position", () => {
ts.forEach(testCases, testCase => {
ts.forEach(whiteSpaceSequences, whiteSpaces => {
const testCaseWithWhiteSpaces = testCase + whiteSpaces;
const sources = [
`const regex = ${testCaseWithWhiteSpaces};`,
`(${testCaseWithWhiteSpaces});`,
`([${testCaseWithWhiteSpaces}]);`,
`({prop: ${testCaseWithWhiteSpaces}});`,
`({prop: ([(${testCaseWithWhiteSpaces})])});`,
`({[(${testCaseWithWhiteSpaces}).source]: 42});`,
];
ts.forEach(sources, source => {
const { parseDiagnostics } = ts.createLanguageServiceSourceFile(
/*fileName*/ "",
ts.ScriptSnapshot.fromString(source),
ts.ScriptTarget.Latest,
/*version*/ "0",
/*setNodeParents*/ false,
);
const diagnostic = ts.find(parseDiagnostics, ({ code }) => code === ts.Diagnostics.Unterminated_regular_expression_literal.code);
assert(diagnostic, "There should be an 'Unterminated regular expression literal.' error");
assert.equal(diagnostic.start, source.indexOf("/"), "Diagnostic should start at where the regex starts");
assert.equal(diagnostic.length, testCase.length, "Diagnostic should end at where the regex ends");
});
});
});
});
});
7 changes: 2 additions & 5 deletions tests/baselines/reference/parser645086_1.errors.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
parser645086_1.ts(1,13): error TS1005: ',' expected.
parser645086_1.ts(1,14): error TS1134: Variable declaration expected.
parser645086_1.ts(1,15): error TS1161: Unterminated regular expression literal.


==== parser645086_1.ts (3 errors) ====
==== parser645086_1.ts (2 errors) ====
var v = /[]/]/
~
!!! error TS1005: ',' expected.
~
!!! error TS1134: Variable declaration expected.

!!! error TS1161: Unterminated regular expression literal.
!!! error TS1134: Variable declaration expected.
7 changes: 2 additions & 5 deletions tests/baselines/reference/parser645086_2.errors.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
parser645086_2.ts(1,14): error TS1005: ',' expected.
parser645086_2.ts(1,15): error TS1134: Variable declaration expected.
parser645086_2.ts(1,16): error TS1161: Unterminated regular expression literal.


==== parser645086_2.ts (3 errors) ====
==== parser645086_2.ts (2 errors) ====
var v = /[^]/]/
~
!!! error TS1005: ',' expected.
~
!!! error TS1134: Variable declaration expected.

!!! error TS1161: Unterminated regular expression literal.
!!! error TS1134: Variable declaration expected.
Loading