Skip to content

Commit

Permalink
pythongh-129569: The function unicodedata.normalize() always returns …
Browse files Browse the repository at this point in the history
…built-in str (pythonGH-129570)

(cherry picked from commit c359fcd)

Co-authored-by: Hizuru <106918920+Hizuru3@users.noreply.github.com>
Co-authored-by: Victor Stinner <vstinner@python.org>
  • Loading branch information
2 people authored and miss-islington committed Feb 21, 2025
1 parent 6c4de32 commit 96f4a79
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 5 deletions.
23 changes: 23 additions & 0 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,29 @@ def test_bug_834676(self):
# Check for bug 834676
unicodedata.normalize('NFC', '\ud55c\uae00')

def test_normalize_return_type(self):
# gh-129569: normalize() return type must always be str
normalize = unicodedata.normalize

class MyStr(str):
pass

normalization_forms = ("NFC", "NFKC", "NFD", "NFKD")
input_strings = (
# normalized strings
"",
"ascii",
# unnormalized strings
"\u1e0b\u0323",
"\u0071\u0307\u0323",
)

for form in normalization_forms:
for input_str in input_strings:
with self.subTest(form=form, input_str=input_str):
self.assertIs(type(normalize(form, input_str)), str)
self.assertIs(type(normalize(form, MyStr(input_str))), str)


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix :func:`unicodedata.normalize` to always return a built-in :class:`str` object when given an input of a :class:`str` subclass, regardless of whether the string is already normalized.
10 changes: 5 additions & 5 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -939,34 +939,34 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
if (PyUnicode_GET_LENGTH(input) == 0) {
/* Special case empty input strings, since resizing
them later would cause internal errors. */
return Py_NewRef(input);
return PyUnicode_FromObject(input);
}

if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
if (is_normalized_quickcheck(self, input,
true, false, true) == YES) {
return Py_NewRef(input);
return PyUnicode_FromObject(input);
}
return nfc_nfkc(self, input, 0);
}
if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
if (is_normalized_quickcheck(self, input,
true, true, true) == YES) {
return Py_NewRef(input);
return PyUnicode_FromObject(input);
}
return nfc_nfkc(self, input, 1);
}
if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
if (is_normalized_quickcheck(self, input,
false, false, true) == YES) {
return Py_NewRef(input);
return PyUnicode_FromObject(input);
}
return nfd_nfkd(self, input, 0);
}
if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
if (is_normalized_quickcheck(self, input,
false, true, true) == YES) {
return Py_NewRef(input);
return PyUnicode_FromObject(input);
}
return nfd_nfkd(self, input, 1);
}
Expand Down

0 comments on commit 96f4a79

Please sign in to comment.