From cba7f34a4ea82070e9e043c7c71ed4714aa79b59 Mon Sep 17 00:00:00 2001 From: Hizuru <106918920+Hizuru3@users.noreply.github.com> Date: Sat, 1 Feb 2025 20:53:17 +0900 Subject: [PATCH 1/9] Update unicodedata.c --- Modules/unicodedata.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 60bde755d24574..79be7674fc8ab5 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -933,34 +933,34 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, if (PyUnicode_GET_LENGTH(input) == 0) { /* Special case empty input strings, since resizing them later would cause internal errors. */ - return Py_NewRef(input); + return PyUnicode_FromObject(input); } if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) { if (is_normalized_quickcheck(self, input, true, false, true) == YES) { - return Py_NewRef(input); + return PyUnicode_FromObject(input); } return nfc_nfkc(self, input, 0); } if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) { if (is_normalized_quickcheck(self, input, true, true, true) == YES) { - return Py_NewRef(input); + return PyUnicode_FromObject(input); } return nfc_nfkc(self, input, 1); } if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) { if (is_normalized_quickcheck(self, input, false, false, true) == YES) { - return Py_NewRef(input); + return PyUnicode_FromObject(input); } return nfd_nfkd(self, input, 0); } if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) { if (is_normalized_quickcheck(self, input, false, true, true) == YES) { - return Py_NewRef(input); + return PyUnicode_FromObject(input); } return nfd_nfkd(self, input, 1); } From 0856f1bd82affd947de5b85372e08dcf6b43b582 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sun, 2 Feb 2025 16:30:28 +0000 Subject: [PATCH 2/9] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst diff --git a/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst b/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst new file mode 100644 index 00000000000000..2707473f858cfc --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst @@ -0,0 +1 @@ +Fix an issue in :func:`unicodedata.normalize` where the return value's type is inconsistent when the function receives an instance of a subclass of :class:`str` due to incorrect optimization. Now, the function always returns an instance of the built-in :class:`str` type. From 0028ffb22dc24909722291ed9d5ddf80633b211f Mon Sep 17 00:00:00 2001 From: Hizuru <106918920+Hizuru3@users.noreply.github.com> Date: Mon, 3 Feb 2025 22:26:20 +0900 Subject: [PATCH 3/9] Add NormalizationTest.test_issue129569() to test_unicodedata.py --- Lib/test/test_unicodedata.py | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 0285f0d51f2365..4fb71c9b77f593 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -467,6 +467,46 @@ def test_bug_834676(self): # Check for bug 834676 unicodedata.normalize('NFC', '\ud55c\uae00') + def test_issue129569(self): + # subclass of str + class StrSub(str): + pass + + # must always be str + EARLY_RETURN_TYPE = str + RETURN_TYPE = str + + def NFC(s: str): + return unicodedata.normalize("NFC", s) + + def NFKC(s: str): + return unicodedata.normalize("NFKC", s) + + def NFD(s: str): + return unicodedata.normalize("NFD", s) + + def NFKD(s: str): + return unicodedata.normalize("NFKD", s) + + # normalized strings + empty_str = "" + self.assertEqual(len(StrSub(empty_str)), 0) + self.assertIs(type(NFKC(StrSub(empty_str))), EARLY_RETURN_TYPE) + + ascii_str = "ascii" + self.assertTrue(StrSub(ascii_str).isascii()) + self.assertIs(type(NFC(StrSub(ascii_str))), EARLY_RETURN_TYPE) + self.assertIs(type(NFKC(StrSub(ascii_str))), EARLY_RETURN_TYPE) + self.assertIs(type(NFD(StrSub(ascii_str))), EARLY_RETURN_TYPE) + self.assertIs(type(NFKD(StrSub(ascii_str))), EARLY_RETURN_TYPE) + + # unnormalized strings + s1, s2, s3, s4 = "\u1e0b\u0323", "\ufb01", "\u1e69", "\u1e9b\u0323" + self.assertIs(type(NFC(StrSub(s1))), RETURN_TYPE) + self.assertIs(type(NFKC(StrSub(s2))), RETURN_TYPE) + self.assertIs(type(NFD(StrSub(s3))), RETURN_TYPE) + self.assertIs(type(NFKD(StrSub(s4))), RETURN_TYPE) + if __name__ == "__main__": unittest.main() From fa116106aa807f3a7729a7bafe13cca969acaf4e Mon Sep 17 00:00:00 2001 From: Hizuru <106918920+Hizuru3@users.noreply.github.com> Date: Tue, 4 Feb 2025 00:30:54 +0900 Subject: [PATCH 4/9] Rename and refactor test method for the str subclass in test_unicodedata.py --- Lib/test/test_unicodedata.py | 46 +++++++++++++----------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 4fb71c9b77f593..cb3580dcdb8fb0 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -467,45 +467,31 @@ def test_bug_834676(self): # Check for bug 834676 unicodedata.normalize('NFC', '\ud55c\uae00') - def test_issue129569(self): - # subclass of str + def test_normalize_func_shall_return_exact_str(self): + # See: https://github.com/python/cpython/issues/129569 + normalize = unicodedata.normalize + class StrSub(str): pass - # must always be str - EARLY_RETURN_TYPE = str - RETURN_TYPE = str - - def NFC(s: str): - return unicodedata.normalize("NFC", s) - - def NFKC(s: str): - return unicodedata.normalize("NFKC", s) - - def NFD(s: str): - return unicodedata.normalize("NFD", s) - - def NFKD(s: str): - return unicodedata.normalize("NFKD", s) + normalization_forms = ("NFC", "NFKC", "NFD", "NFKD") # normalized strings empty_str = "" - self.assertEqual(len(StrSub(empty_str)), 0) - self.assertIs(type(NFKC(StrSub(empty_str))), EARLY_RETURN_TYPE) - ascii_str = "ascii" - self.assertTrue(StrSub(ascii_str).isascii()) - self.assertIs(type(NFC(StrSub(ascii_str))), EARLY_RETURN_TYPE) - self.assertIs(type(NFKC(StrSub(ascii_str))), EARLY_RETURN_TYPE) - self.assertIs(type(NFD(StrSub(ascii_str))), EARLY_RETURN_TYPE) - self.assertIs(type(NFKD(StrSub(ascii_str))), EARLY_RETURN_TYPE) + for form in normalization_forms: + with self.subTest(form=form): + self.assertIs(type(normalize(form, empty_str)), str) + self.assertIs(type(normalize(form, ascii_str)), str) + self.assertIs(type(normalize(form, StrSub(empty_str))), str) + self.assertIs(type(normalize(form, StrSub(ascii_str))), str) # unnormalized strings - s1, s2, s3, s4 = "\u1e0b\u0323", "\ufb01", "\u1e69", "\u1e9b\u0323" - self.assertIs(type(NFC(StrSub(s1))), RETURN_TYPE) - self.assertIs(type(NFKC(StrSub(s2))), RETURN_TYPE) - self.assertIs(type(NFD(StrSub(s3))), RETURN_TYPE) - self.assertIs(type(NFKD(StrSub(s4))), RETURN_TYPE) + strings_to_normalize = ("\u1e0b\u0323", "\ufb01", "\u1e69", "\u1e9b\u0323") + for form, input_str in zip(normalization_forms, strings_to_normalize): + with self.subTest(form=form, input_str=input_str): + self.assertIs(type(normalize(form, input_str)), str) + self.assertIs(type(normalize(form, StrSub(input_str))), str) if __name__ == "__main__": From 1dd3b75d902860393483ccacb38e4d79e0d704d2 Mon Sep 17 00:00:00 2001 From: Hizuru <106918920+Hizuru3@users.noreply.github.com> Date: Tue, 4 Feb 2025 06:21:51 +0900 Subject: [PATCH 5/9] Update NormalizationTest.test_normalize_func_shall_return_exact_str() in test_unicodedata.py --- Lib/test/test_unicodedata.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index cb3580dcdb8fb0..67c98453b1705a 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -471,7 +471,7 @@ def test_normalize_func_shall_return_exact_str(self): # See: https://github.com/python/cpython/issues/129569 normalize = unicodedata.normalize - class StrSub(str): + class MyStr(str): pass normalization_forms = ("NFC", "NFKC", "NFD", "NFKD") @@ -483,15 +483,17 @@ class StrSub(str): with self.subTest(form=form): self.assertIs(type(normalize(form, empty_str)), str) self.assertIs(type(normalize(form, ascii_str)), str) - self.assertIs(type(normalize(form, StrSub(empty_str))), str) - self.assertIs(type(normalize(form, StrSub(ascii_str))), str) + self.assertIs(type(normalize(form, MyStr(empty_str))), str) + self.assertIs(type(normalize(form, MyStr(ascii_str))), str) # unnormalized strings strings_to_normalize = ("\u1e0b\u0323", "\ufb01", "\u1e69", "\u1e9b\u0323") - for form, input_str in zip(normalization_forms, strings_to_normalize): + for form, input_str in zip( + normalization_forms, strings_to_normalize, strict=True + ): with self.subTest(form=form, input_str=input_str): self.assertIs(type(normalize(form, input_str)), str) - self.assertIs(type(normalize(form, StrSub(input_str))), str) + self.assertIs(type(normalize(form, MyStr(input_str))), str) if __name__ == "__main__": From 22f7973b81bfeca148d76ac39631014e436723d8 Mon Sep 17 00:00:00 2001 From: Hizuru <106918920+Hizuru3@users.noreply.github.com> Date: Fri, 7 Feb 2025 23:53:19 +0900 Subject: [PATCH 6/9] Update Lib/test/test_unicodedata.py Co-authored-by: Victor Stinner --- Lib/test/test_unicodedata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 67c98453b1705a..d09d7f4227c34b 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -467,7 +467,7 @@ def test_bug_834676(self): # Check for bug 834676 unicodedata.normalize('NFC', '\ud55c\uae00') - def test_normalize_func_shall_return_exact_str(self): + def test_normalize_return_type(self): # See: https://github.com/python/cpython/issues/129569 normalize = unicodedata.normalize From 55ad3df36e49ae1f6f96164af5535b4d766bc198 Mon Sep 17 00:00:00 2001 From: Hizuru <106918920+Hizuru3@users.noreply.github.com> Date: Fri, 7 Feb 2025 23:53:48 +0900 Subject: [PATCH 7/9] Update Lib/test/test_unicodedata.py Co-authored-by: Victor Stinner --- Lib/test/test_unicodedata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index d09d7f4227c34b..62c119c06c536b 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -468,7 +468,7 @@ def test_bug_834676(self): unicodedata.normalize('NFC', '\ud55c\uae00') def test_normalize_return_type(self): - # See: https://github.com/python/cpython/issues/129569 + # gh-129569: normalize() return type must always be str normalize = unicodedata.normalize class MyStr(str): From dd41cf925a1321e3accb72ef00ea1e914fef99d5 Mon Sep 17 00:00:00 2001 From: Hizuru <106918920+Hizuru3@users.noreply.github.com> Date: Fri, 7 Feb 2025 23:56:17 +0900 Subject: [PATCH 8/9] Update NormalizationTest.test_normalize_return_type() in test_unicodedata.py --- Lib/test/test_unicodedata.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 62c119c06c536b..8e3fef6b6fe4a0 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -475,25 +475,20 @@ class MyStr(str): pass normalization_forms = ("NFC", "NFKC", "NFD", "NFKD") + input_strings = ( + # normalized strings + "", + "ascii", + # unnormalized strings + "\u1e0b\u0323", + "\u0071\u0307\u0323", + ) - # normalized strings - empty_str = "" - ascii_str = "ascii" for form in normalization_forms: - with self.subTest(form=form): - self.assertIs(type(normalize(form, empty_str)), str) - self.assertIs(type(normalize(form, ascii_str)), str) - self.assertIs(type(normalize(form, MyStr(empty_str))), str) - self.assertIs(type(normalize(form, MyStr(ascii_str))), str) - - # unnormalized strings - strings_to_normalize = ("\u1e0b\u0323", "\ufb01", "\u1e69", "\u1e9b\u0323") - for form, input_str in zip( - normalization_forms, strings_to_normalize, strict=True - ): - with self.subTest(form=form, input_str=input_str): - self.assertIs(type(normalize(form, input_str)), str) - self.assertIs(type(normalize(form, MyStr(input_str))), str) + for input_str in input_strings: + with self.subTest(form=form, input_str=input_str): + self.assertIs(type(normalize(form, input_str)), str) + self.assertIs(type(normalize(form, MyStr(input_str))), str) if __name__ == "__main__": From 8c27430097d34572a5d4f50a25b95f6b95f21e2c Mon Sep 17 00:00:00 2001 From: Hizuru <106918920+Hizuru3@users.noreply.github.com> Date: Sat, 8 Feb 2025 00:06:42 +0900 Subject: [PATCH 9/9] Update issue description --- .../next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst b/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst index 2707473f858cfc..c4b8965106aa56 100644 --- a/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst +++ b/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst @@ -1 +1 @@ -Fix an issue in :func:`unicodedata.normalize` where the return value's type is inconsistent when the function receives an instance of a subclass of :class:`str` due to incorrect optimization. Now, the function always returns an instance of the built-in :class:`str` type. +Fix :func:`unicodedata.normalize` to always return a built-in :class:`str` object when given an input of a :class:`str` subclass, regardless of whether the string is already normalized.