From 4b4ae43e8b66a2eab2d41a73364745c2a4570287 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Thu, 1 Aug 2024 11:54:15 -0700 Subject: [PATCH] Fix incorrect linenos on fstring tokens with escaped newlines (#4423) I don't think this can affect Black itself much (maybe for formatting ranges), but I ran into this with https://github.com/JelleZijlstra/lib2toast --- CHANGES.md | 3 + src/blib2to3/pgen2/tokenize.py | 2 +- tests/test_tokenize.py | 120 +++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 tests/test_tokenize.py diff --git a/CHANGES.md b/CHANGES.md index e3e37484a59..48fe337392d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -39,6 +39,9 @@ - Fix bug with Black incorrectly parsing empty lines with a backslash (#4343) +- Fix incorrect line numbers in the tokenizer for certain tokens within f-strings + (#4423) + ### Performance diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 28972a9bd78..ecd017b3148 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -638,7 +638,7 @@ def generate_tokens( else: if is_fstring_start(token): fstring_start, token = _split_fstring_start_and_middle(token) - fstring_start_epos = (lnum, spos[1] + len(fstring_start)) + fstring_start_epos = (spos[0], spos[1] + len(fstring_start)) yield ( FSTRING_START, fstring_start, diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py new file mode 100644 index 00000000000..3798a9b6a92 --- /dev/null +++ b/tests/test_tokenize.py @@ -0,0 +1,120 @@ +"""Tests for the blib2to3 tokenizer.""" + +import io +import sys +import textwrap +from dataclasses import dataclass +from typing import List + +import black +from blib2to3.pgen2 import token, tokenize + + +@dataclass +class Token: + type: str + string: str + start: tokenize.Coord + end: tokenize.Coord + + +def get_tokens(text: str) -> List[Token]: + """Return the tokens produced by the tokenizer.""" + readline = io.StringIO(text).readline + tokens: List[Token] = [] + + def tokeneater( + type: int, string: str, start: tokenize.Coord, end: tokenize.Coord, line: str + ) -> None: + tokens.append(Token(token.tok_name[type], string, start, end)) + + tokenize.tokenize(readline, tokeneater) + return tokens + + +def assert_tokenizes(text: str, tokens: List[Token]) -> None: + """Assert that the tokenizer produces the expected tokens.""" + actual_tokens = get_tokens(text) + assert actual_tokens == tokens + + +def test_simple() -> None: + assert_tokenizes( + "1", + [Token("NUMBER", "1", (1, 0), (1, 1)), Token("ENDMARKER", "", (2, 0), (2, 0))], + ) + assert_tokenizes( + "'a'", + [ + Token("STRING", "'a'", (1, 0), (1, 3)), + Token("ENDMARKER", "", (2, 0), (2, 0)), + ], + ) + assert_tokenizes( + "a", + [Token("NAME", "a", (1, 0), (1, 1)), Token("ENDMARKER", "", (2, 0), (2, 0))], + ) + + +def test_fstring() -> None: + assert_tokenizes( + 'f"x"', + [ + Token("FSTRING_START", 'f"', (1, 0), (1, 2)), + Token("FSTRING_MIDDLE", "x", (1, 2), (1, 3)), + Token("FSTRING_END", '"', (1, 3), (1, 4)), + Token("ENDMARKER", "", (2, 0), (2, 0)), + ], + ) + assert_tokenizes( + 'f"{x}"', + [ + Token("FSTRING_START", 'f"', (1, 0), (1, 2)), + Token("FSTRING_MIDDLE", "", (1, 2), (1, 2)), + Token("LBRACE", "{", (1, 2), (1, 3)), + Token("NAME", "x", (1, 3), (1, 4)), + Token("RBRACE", "}", (1, 4), (1, 5)), + Token("FSTRING_MIDDLE", "", (1, 5), (1, 5)), + Token("FSTRING_END", '"', (1, 5), (1, 6)), + Token("ENDMARKER", "", (2, 0), (2, 0)), + ], + ) + assert_tokenizes( + 'f"{x:y}"\n', + [ + Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)), + Token(type="FSTRING_MIDDLE", string="", start=(1, 2), end=(1, 2)), + Token(type="LBRACE", string="{", start=(1, 2), end=(1, 3)), + Token(type="NAME", string="x", start=(1, 3), end=(1, 4)), + Token(type="OP", string=":", start=(1, 4), end=(1, 5)), + Token(type="FSTRING_MIDDLE", string="y", start=(1, 5), end=(1, 6)), + Token(type="RBRACE", string="}", start=(1, 6), end=(1, 7)), + Token(type="FSTRING_MIDDLE", string="", start=(1, 7), end=(1, 7)), + Token(type="FSTRING_END", string='"', start=(1, 7), end=(1, 8)), + Token(type="NEWLINE", string="\n", start=(1, 8), end=(1, 9)), + Token(type="ENDMARKER", string="", start=(2, 0), end=(2, 0)), + ], + ) + assert_tokenizes( + 'f"x\\\n{a}"\n', + [ + Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)), + Token(type="FSTRING_MIDDLE", string="x\\\n", start=(1, 2), end=(2, 0)), + Token(type="LBRACE", string="{", start=(2, 0), end=(2, 1)), + Token(type="NAME", string="a", start=(2, 1), end=(2, 2)), + Token(type="RBRACE", string="}", start=(2, 2), end=(2, 3)), + Token(type="FSTRING_MIDDLE", string="", start=(2, 3), end=(2, 3)), + Token(type="FSTRING_END", string='"', start=(2, 3), end=(2, 4)), + Token(type="NEWLINE", string="\n", start=(2, 4), end=(2, 5)), + Token(type="ENDMARKER", string="", start=(3, 0), end=(3, 0)), + ], + ) + + +# Run "echo some code | python tests/test_tokenize.py" to generate test cases. +if __name__ == "__main__": + code = sys.stdin.read() + tokens = get_tokens(code) + text = f"assert_tokenizes({code!r}, {tokens!r})" + text = black.format_str(text, mode=black.Mode()) + print(textwrap.indent(text, " "))