Skip to content

Commit

Permalink
Assign the same CJK width to canonically equivalent strings
Browse files Browse the repository at this point in the history
  • Loading branch information
Jules-Bertholet committed May 22, 2024
1 parent a2db56b commit dc86c74
Show file tree
Hide file tree
Showing 4 changed files with 462 additions and 394 deletions.
44 changes: 43 additions & 1 deletion scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@
# - DerivedCoreProperties.txt
# - EastAsianWidth.txt
# - HangulSyllableType.txt
# - NormalizationTest.txt (for tests only)
# - PropList.txt
# - ReadMe.txt
# - Scripts.txt
# - UnicodeData.txt
# - emoji/emoji-data.txt
# - emoji/emoji-variation-sequences.txt
# - extracted/DerivedGeneralCategory.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated module into git.
Expand Down Expand Up @@ -142,6 +147,7 @@ def load_east_asian_widths() -> list[EffectiveWidth]:
`Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
`Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""

with fetch_open("EastAsianWidth.txt") as eaw:
# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
Expand Down Expand Up @@ -179,7 +185,43 @@ def load_east_asian_widths() -> list[EffectiveWidth]:
# Catch any leftover codepoints and assign them implicit Neutral/narrow width.
width_map.append(EffectiveWidth.NARROW)

return width_map
# Characters from alphabetic scripts are narrow
load_property(
"Scripts.txt",
r"(?:Latin|Greek|Cyrillic)",
lambda cp: (
operator.setitem(width_map, cp, EffectiveWidth.NARROW)
if width_map[cp] == EffectiveWidth.AMBIGUOUS
and not (0x2160 <= cp <= 0x217F) # Roman numerals remain ambiguous
else None
),
)

# Ambiguous `Modifier_Symbol`s are narrow
load_property(
"extracted/DerivedGeneralCategory.txt",
"Sk",
lambda cp: (
operator.setitem(width_map, cp, EffectiveWidth.NARROW)
if width_map[cp] == EffectiveWidth.AMBIGUOUS
else None
),
)

# U+0387 GREEK ANO TELEIA canonically decomposes to U+00B7 MIDDLE DOT,
# so it must be assigned the same (Ambiguous) width.
width_map[0x0387] = EffectiveWidth.AMBIGUOUS

# Canonical equivalence for symbols with stroke
with fetch_open("UnicodeData.txt") as udata:
single = re.compile(r"([0-9A-Z]+);.*?;.*?;.*?;.*?;([0-9A-Z]+) 0338;")
for line in udata.readlines():
if match := single.match(line):
composed = int(match.group(1), 16)
decomposed = int(match.group(2), 16)
if width_map[decomposed] == EffectiveWidth.AMBIGUOUS:
width_map[composed] = EffectiveWidth.AMBIGUOUS

return width_map


def load_zero_widths() -> list[bool]:
Expand Down
42 changes: 29 additions & 13 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@
//! 3. The sequence `"\r\n"` has width 1.
//! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
//! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 6. The following have width 0:
//! 5. In an East Asian context only, `<`, `=`, or `>` have width 2 when followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY].
//! 6. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 7. The following have width 0:
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
//! with the [`Default_Ignorable_Code_Point`] property.
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
Expand All @@ -64,18 +65,26 @@
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
//! 9. All other characters have width 1.
//! 9. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
//! 10. All other characters have width 1.
//!
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
//!
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
//!
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
Expand All @@ -84,14 +93,13 @@
//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
//!
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
//!
//! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
//!
//! ## Canonical equivalence
//!
//! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
//! However, this guarantee does not currently hold for the CJK width variants.
//! Canonically equivalent strings are assigned the same width (CJK and non-CJK).
#![forbid(unsafe_code)]
#![deny(missing_docs)]
Expand Down Expand Up @@ -198,14 +206,17 @@ enum NextCharInfo {
#[default]
Default,
/// `'\n'`
LineFeed = 0x0A,
LineFeed,
/// `'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY.
/// Tracked to preserve canonical equivalence in the CJK width variants.
CombiningLongSolidusOverlay,
/// `'\u{A4FC}'..='\u{A4FD}'`
/// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>
TrailingLisuToneLetter,
/// `'\u{FE0E}'`
Vs15 = 0x0E,
Vs15,
/// `'\u{FE0F}'`
Vs16 = 0x0F,
Vs16,
}

fn str_width(s: &str, is_cjk: bool) -> usize {
Expand All @@ -222,7 +233,11 @@ fn str_width(s: &str, is_cjk: bool) -> usize {
/// they're treated as single width.
#[inline]
fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
if next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c) {
if (is_cjk
&& next_info == NextCharInfo::CombiningLongSolidusOverlay
&& matches!(c, '<' | '=' | '>'))
|| (next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c))
{
(2, NextCharInfo::Default)
} else if c <= '\u{A0}' {
match c {
Expand All @@ -235,6 +250,7 @@ fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextC
('\u{A4F8}'..='\u{A4FB}', NextCharInfo::TrailingLisuToneLetter) => {
(0, NextCharInfo::Default)
}
('\u{0338}', _) => (0, NextCharInfo::CombiningLongSolidusOverlay),
('\u{A4FC}'..='\u{A4FD}', _) => (1, NextCharInfo::TrailingLisuToneLetter),
('\u{FE0E}', _) => (0, NextCharInfo::Vs15),
('\u{FE0F}', _) => (0, NextCharInfo::Vs16),
Expand Down
Loading

0 comments on commit dc86c74

Please sign in to comment.