microsoft · rbuckton · May 24, 2024 · Apr 22, 2024 · Apr 25, 2024 · Apr 25, 2024
diff --git a/src/compiler/parser.ts b/src/compiler/parser.ts
@@ -62,7 +62,6 @@ import {
     DeleteExpression,
     Diagnostic,
     DiagnosticArguments,
-    DiagnosticCategory,
     DiagnosticMessage,
     Diagnostics,
     DiagnosticWithDetachedLocation,
@@ -2144,11 +2143,7 @@ namespace Parser {
         // Don't report another error if it would just be at the same position as the last error.
         const lastError = lastOrUndefined(parseDiagnostics);
         let result: DiagnosticWithDetachedLocation | undefined;
-        if (message.category === DiagnosticCategory.Message && lastError && start === lastError.start && length === lastError.length) {
-            result = createDetachedDiagnostic(fileName, sourceText, start, length, message, ...args);
-            addRelatedInfo(lastError, result);
-        }
-        else if (!lastError || start !== lastError.start) {
+        if (!lastError || start !== lastError.start) {
             result = createDetachedDiagnostic(fileName, sourceText, start, length, message, ...args);
             parseDiagnostics.push(result);
         }

diff --git a/src/compiler/scanner.ts b/src/compiler/scanner.ts
@@ -22,6 +22,7 @@ import {
     LanguageFeatureMinimumTarget,
     LanguageVariant,
     last,
+    lastOrUndefined,
     LineAndCharacter,
     MapLike,
     parsePseudoBigInt,
@@ -2391,7 +2392,8 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
     function reScanSlashToken(reportErrors?: boolean): SyntaxKind {
         if (token === SyntaxKind.SlashToken || token === SyntaxKind.SlashEqualsToken) {
             // Quickly get to the end of regex such that we know the flags
-            let p = tokenStart + 1;
+            const startOfRegExpBody = tokenStart + 1;
+            pos = startOfRegExpBody;
             let inEscape = false;
             // Although nested character classes are allowed in Unicode Sets mode,
             // an unescaped slash is nevertheless invalid even in a character class in Unicode mode.
@@ -2403,16 +2405,14 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
             while (true) {
                 // If we reach the end of a file, or hit a newline, then this is an unterminated
                 // regex.  Report error and return what we have so far.
-                if (p >= end) {
+                if (pos >= end) {
                     tokenFlags |= TokenFlags.Unterminated;
-                    error(Diagnostics.Unterminated_regular_expression_literal);
                     break;
                 }
 
-                const ch = text.charCodeAt(p);
+                const ch = text.charCodeAt(pos);
                 if (isLineBreak(ch)) {
                     tokenFlags |= TokenFlags.Unterminated;
-                    error(Diagnostics.Unterminated_regular_expression_literal);
                     break;
                 }
 
@@ -2424,7 +2424,6 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
                 else if (ch === CharacterCodes.slash && !inCharacterClass) {
                     // A slash within a character class is permissible,
                     // but in general it signals the end of the regexp literal.
-                    p++;
                     break;
                 }
                 else if (ch === CharacterCodes.openBracket) {
@@ -2436,58 +2435,95 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
                 else if (ch === CharacterCodes.closeBracket) {
                     inCharacterClass = false;
                 }
-                p++;
+                pos++;
             }
-            const isUnterminated = !!(tokenFlags & TokenFlags.Unterminated);
-            const endOfBody = p - (isUnterminated ? 0 : 1);
-            let regExpFlags = RegularExpressionFlags.None;
-            while (p < end) {
-                const ch = text.charCodeAt(p);
-                if (!isIdentifierPart(ch, languageVersion)) {
-                    break;
-                }
-                if (reportErrors) {
-                    const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
-                    if (flag === undefined) {
-                        error(Diagnostics.Unknown_regular_expression_flag, p, 1);
+            const endOfRegExpBody = pos;
+            if (tokenFlags & TokenFlags.Unterminated) {
+                // Search for the nearest unbalanced bracket for better recovery. Since the expression is
+                // invalid anyways, we take nested square brackets into consideration for the best guess.
+                pos = startOfRegExpBody;
+                inEscape = false;
+                let characterClassDepth = 0;
+                const bracketStack: CharacterCodes[] = [];
+                while (pos < endOfRegExpBody) {
+                    const ch = text.charCodeAt(pos);
+                    if (inEscape) {
+                        inEscape = false;
                     }
-                    else if (regExpFlags & flag) {
-                        error(Diagnostics.Duplicate_regular_expression_flag, p, 1);
+                    else if (ch === CharacterCodes.backslash) {
+                        inEscape = true;
                     }
-                    else if (((regExpFlags | flag) & RegularExpressionFlags.UnicodeMode) === RegularExpressionFlags.UnicodeMode) {
-                        error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, p, 1);
+                    else if (ch === CharacterCodes.openBracket) {
+                        characterClassDepth++;
                     }
-                    else {
-                        regExpFlags |= flag;
-                        checkRegularExpressionFlagAvailable(flag, p);
+                    else if (ch === CharacterCodes.closeBracket && characterClassDepth) {
+                        characterClassDepth--;
                     }
+                    else if (!characterClassDepth) {
+                        if (ch === CharacterCodes.openParen) {
+                            bracketStack.push(CharacterCodes.closeParen);
+                        }
+                        else if (ch === CharacterCodes.openBrace) {
+                            bracketStack.push(CharacterCodes.closeBrace);
+                        }
+                        else if (ch === lastOrUndefined(bracketStack)) {
+                            bracketStack.pop();
+                        }
+                        else if (ch === CharacterCodes.closeParen || ch === CharacterCodes.closeBracket || ch === CharacterCodes.closeBrace) {
+                            // We encountered an unbalanced bracket outside a character class. Treat this position as the end of regex.
+                            break;
+                        }
+                    }
+                    pos++;
                 }
-                p++;
-            }
-            if (reportErrors) {
-                pos = tokenStart + 1;
-                const saveTokenPos = tokenStart;
-                const saveTokenFlags = tokenFlags;
-                scanRegularExpressionWorker(text, endOfBody, regExpFlags, isUnterminated, /*annexB*/ true);
-                if (!isUnterminated) {
-                    pos = p;
-                }
-                tokenStart = saveTokenPos;
-                tokenFlags = saveTokenFlags;
+                // Whitespaces and semicolons at the end are not likely to be part of the regex
+                while (isWhiteSpaceLike(text.charCodeAt(pos - 1)) || text.charCodeAt(pos - 1) === CharacterCodes.semicolon) pos--;
+                error(Diagnostics.Unterminated_regular_expression_literal, tokenStart, pos - tokenStart);
             }
             else {
-                pos = p;
+                // Consume the slash character
+                pos++;
+                let regExpFlags = RegularExpressionFlags.None;
+                while (pos < end) {
+                    const ch = codePointAt(text, pos);
+                    if (!isIdentifierPart(ch, languageVersion)) {
+                        break;
+                    }
+                    const size = charSize(ch);
+                    if (reportErrors) {
+                        const flag = characterToRegularExpressionFlag(utf16EncodeAsString(ch));
+                        if (flag === undefined) {
+                            error(Diagnostics.Unknown_regular_expression_flag, pos, size);
+                        }
+                        else if (regExpFlags & flag) {
+                            error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
+                        }
+                        else if (((regExpFlags | flag) & RegularExpressionFlags.UnicodeMode) === RegularExpressionFlags.UnicodeMode) {
+                            error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, pos, size);
+                        }
+                        else {
+                            regExpFlags |= flag;
+                            checkRegularExpressionFlagAvailability(flag);
+                        }
+                    }
+                    pos += size;
+                }
+                if (reportErrors) {
+                    scanRange(startOfRegExpBody, endOfRegExpBody - startOfRegExpBody, () => {
+                        scanRegularExpressionWorker(regExpFlags, /*annexB*/ true);
+                    });
+                }
             }
             tokenValue = text.substring(tokenStart, pos);
             token = SyntaxKind.RegularExpressionLiteral;
         }
         return token;
 
-        function scanRegularExpressionWorker(text: string, end: number, regExpFlags: RegularExpressionFlags, isUnterminated: boolean, annexB: boolean) {
-            /** Grammar parameter */
-            const unicodeSetsMode = !!(regExpFlags & RegularExpressionFlags.UnicodeSets);
+        function scanRegularExpressionWorker(regExpFlags: RegularExpressionFlags, annexB: boolean) {
             /** Grammar parameter */
             const unicodeMode = !!(regExpFlags & RegularExpressionFlags.UnicodeMode);
+            /** Grammar parameter */
+            const unicodeSetsMode = !!(regExpFlags & RegularExpressionFlags.UnicodeSets);
 
             if (unicodeMode) {
                 // Annex B treats any unicode mode as the strict syntax.
@@ -2497,7 +2533,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
             /** @see {scanClassSetExpression} */
             let mayContainStrings = false;
 
-            /** The number of numeric (anonymous) capturing groups defined in the regex. */
+            /** The number of all (named and unnamed) capturing groups defined in the regex. */
             let numberOfCapturingGroups = 0;
             /** All named capturing groups defined in the regex. */
             const groupSpecifiers = new Set<string>();
@@ -2698,10 +2734,6 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
                         // falls through
                         case CharacterCodes.closeBracket:
                         case CharacterCodes.closeBrace:
-                            if (isUnterminated && !isInGroup) {
-                                // Assume what starting from the character to be outside of the regex
-                                return;
-                            }
                             if (unicodeMode || ch === CharacterCodes.closeParen) {
                                 error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, pos, 1, String.fromCharCode(ch));
                             }
@@ -2721,25 +2753,26 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
 
             function scanPatternModifiers(currFlags: RegularExpressionFlags): RegularExpressionFlags {
                 while (pos < end) {
-                    const ch = text.charCodeAt(pos);
+                    const ch = codePointAt(text, pos);
                     if (!isIdentifierPart(ch, languageVersion)) {
                         break;
                     }
-                    const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
+                    const size = charSize(ch);
+                    const flag = characterToRegularExpressionFlag(utf16EncodeAsString(ch));
                     if (flag === undefined) {
-                        error(Diagnostics.Unknown_regular_expression_flag, pos, 1);
+                        error(Diagnostics.Unknown_regular_expression_flag, pos, size);
                     }
                     else if (currFlags & flag) {
-                        error(Diagnostics.Duplicate_regular_expression_flag, pos, 1);
+                        error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
                     }
                     else if (!(flag & RegularExpressionFlags.Modifiers)) {
-                        error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, 1);
+                        error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, size);
                     }
                     else {
                         currFlags |= flag;
-                        checkRegularExpressionFlagAvailable(flag, pos);
+                        checkRegularExpressionFlagAvailability(flag);
                     }
-                    pos++;
+                    pos += size;
                 }
                 return currFlags;
             }
@@ -3439,7 +3472,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
             });
         }
 
-        function checkRegularExpressionFlagAvailable(flag: RegularExpressionFlags, pos: number) {
+        function checkRegularExpressionFlagAvailability(flag: RegularExpressionFlags) {
             const availableFrom = regExpFlagToFirstAvailableLanguageVersion.get(flag) as ScriptTarget | undefined;
             if (availableFrom && languageVersion < availableFrom) {
                 error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, 1, getNameOfScriptTarget(availableFrom));

diff --git a/src/testRunner/tests.ts b/src/testRunner/tests.ts
@@ -16,6 +16,7 @@ import "./unittests/paths";
 import "./unittests/printer";
 import "./unittests/programApi";
 import "./unittests/publicApi";
+import "./unittests/regExpParserRecovery";
 import "./unittests/reuseProgramStructure";
 import "./unittests/semver";
 import "./unittests/transform";

diff --git a/src/testRunner/unittests/incrementalParser.ts b/src/testRunner/unittests/incrementalParser.ts
@@ -160,7 +160,7 @@ describe("unittests:: Incremental Parser", () => {
         const oldText = ts.ScriptSnapshot.fromString(source);
         const newTextAndChange = withInsert(oldText, semicolonIndex, "/");
 
-        compareTrees(oldText, newTextAndChange.text, newTextAndChange.textChangeRange, 0);
+        compareTrees(oldText, newTextAndChange.text, newTextAndChange.textChangeRange, 4);
     });
 
     it("Regular expression 2", () => {

diff --git a/src/testRunner/unittests/regExpParserRecovery.ts b/src/testRunner/unittests/regExpParserRecovery.ts
@@ -0,0 +1,81 @@
+import * as ts from "../_namespaces/ts";
+
+describe("unittests:: regExpParserRecovery", () => {
+    const testCases = [
+        "/",
+        "/[]",
+        "/{}",
+        "/()",
+        "/foo",
+        "/foo[]",
+        "/foo{}",
+        "/foo()",
+        "/[]foo",
+        "/{}foo",
+        "/()foo",
+        "/{[]}",
+        "/([])",
+        "/[)}({]",
+        "/({[)}]})",
+        "/\\[",
+        "/\\{",
+        "/\\(",
+        "/[\\[]",
+        "/(\\[)",
+        "/{\\[}",
+        "/[\\(]",
+        "/(\\()",
+        "/{\\(}",
+        "/[\\{]",
+        "/(\\{)",
+        "/{\\{}",
+        "/\\{(\\[\\([{])",
+        "/\\]",
+        "/\\}",
+        "/\\)",
+        "/[\\]]",
+        "/(\\])",
+        "/{\\]}",
+        "/[\\)]",
+        "/(\\))",
+        "/{\\)}",
+        "/[\\}]",
+        "/(\\})",
+        "/{\\}}",
+        "/({[\\]})]})",
+    ];
+    const whiteSpaceSequences = [
+        "",
+        "  ",
+        "\t\v\r\n",
+        "\u3000\u2028",
+    ];
+    it("stops parsing unterminated regexes at correct position", () => {
+        ts.forEach(testCases, testCase => {
+            ts.forEach(whiteSpaceSequences, whiteSpaces => {
+                const testCaseWithWhiteSpaces = testCase + whiteSpaces;
+                const sources = [
+                    `const regex = ${testCaseWithWhiteSpaces};`,
+                    `(${testCaseWithWhiteSpaces});`,
+                    `([${testCaseWithWhiteSpaces}]);`,
+                    `({prop: ${testCaseWithWhiteSpaces}});`,
+                    `({prop: ([(${testCaseWithWhiteSpaces})])});`,
+                    `({[(${testCaseWithWhiteSpaces}).source]: 42});`,
+                ];
+                ts.forEach(sources, source => {
+                    const { parseDiagnostics } = ts.createLanguageServiceSourceFile(
+                        /*fileName*/ "",
+                        ts.ScriptSnapshot.fromString(source),
+                        ts.ScriptTarget.Latest,
+                        /*version*/ "0",
+                        /*setNodeParents*/ false,
+                    );
+                    const diagnostic = ts.find(parseDiagnostics, ({ code }) => code === ts.Diagnostics.Unterminated_regular_expression_literal.code);
+                    assert(diagnostic, "There should be an 'Unterminated regular expression literal.' error");
+                    assert.equal(diagnostic.start, source.indexOf("/"), "Diagnostic should start at where the regex starts");
+                    assert.equal(diagnostic.length, testCase.length, "Diagnostic should end at where the regex ends");
+                });
+            });
+        });
+    });
+});
diff --git a/tests/baselines/reference/parser645086_1.errors.txt b/tests/baselines/reference/parser645086_1.errors.txt
@@ -1,13 +1,10 @@
 parser645086_1.ts(1,13): error TS1005: ',' expected.
 parser645086_1.ts(1,14): error TS1134: Variable declaration expected.
-parser645086_1.ts(1,15): error TS1161: Unterminated regular expression literal.
 
 
-==== parser645086_1.ts (3 errors) ====
+==== parser645086_1.ts (2 errors) ====
     var v = /[]/]/
                 ~
 !!! error TS1005: ',' expected.
                  ~
-!!! error TS1134: Variable declaration expected.
-
-!!! error TS1161: Unterminated regular expression literal.
+!!! error TS1134: Variable declaration expected.
diff --git a/tests/baselines/reference/parser645086_2.errors.txt b/tests/baselines/reference/parser645086_2.errors.txt
@@ -1,13 +1,10 @@
 parser645086_2.ts(1,14): error TS1005: ',' expected.
 parser645086_2.ts(1,15): error TS1134: Variable declaration expected.
-parser645086_2.ts(1,16): error TS1161: Unterminated regular expression literal.
 
 
-==== parser645086_2.ts (3 errors) ====
+==== parser645086_2.ts (2 errors) ====
     var v = /[^]/]/
                  ~
 !!! error TS1005: ',' expected.
                   ~
-!!! error TS1134: Variable declaration expected.
-
-!!! error TS1161: Unterminated regular expression literal.
+!!! error TS1134: Variable declaration expected.