From b8c6de949e55997d6c8693d84bc5137eb374e48a Mon Sep 17 00:00:00 2001 From: Peter Edberg Date: Tue, 23 Aug 2022 17:52:30 +0000 Subject: [PATCH] ICU-22112 word break updates for @,colon; colon tailorings for fi,sv See #2159 --- icu4c/source/data/brkitr/fi.txt | 8 + icu4c/source/data/brkitr/rules/word.txt | 6 +- icu4c/source/data/brkitr/rules/word_POSIX.txt | 4 +- icu4c/source/data/brkitr/rules/word_fi_sv.txt | 172 ++++++++++++++++++ icu4c/source/data/brkitr/sv.txt | 8 + icu4c/source/data/xml/brkitr/fi.xml | 26 +++ icu4c/source/data/xml/brkitr/sv.xml | 26 +++ icu4c/source/test/intltest/rbbitst.cpp | 11 +- .../source/test/testdata/break_rules/word.txt | 4 +- .../test/testdata/break_rules/word_POSIX.txt | 2 +- icu4c/source/test/testdata/rbbitst.txt | 18 +- icu4j/main/shared/data/icudata.jar | 4 +- icu4j/main/shared/data/icutzdata.jar | 4 +- icu4j/main/shared/data/testdata.jar | 4 +- .../ibm/icu/dev/test/rbbi/RBBITestMonkey.java | 4 +- .../icu/dev/test/rbbi/break_rules/word.txt | 4 +- .../dev/test/rbbi/break_rules/word_POSIX.txt | 2 +- .../src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | 18 +- 18 files changed, 301 insertions(+), 24 deletions(-) create mode 100644 icu4c/source/data/brkitr/fi.txt create mode 100644 icu4c/source/data/brkitr/rules/word_fi_sv.txt create mode 100644 icu4c/source/data/brkitr/sv.txt create mode 100644 icu4c/source/data/xml/brkitr/fi.xml create mode 100644 icu4c/source/data/xml/brkitr/sv.xml diff --git a/icu4c/source/data/brkitr/fi.txt b/icu4c/source/data/brkitr/fi.txt new file mode 100644 index 000000000000..e672992edb19 --- /dev/null +++ b/icu4c/source/data/brkitr/fi.txt @@ -0,0 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml +fi{ + boundaries{ + word:process(dependency){"word_fi_sv.brk"} + } +} diff --git a/icu4c/source/data/brkitr/rules/word.txt b/icu4c/source/data/brkitr/rules/word.txt index a2eef17e6529..0f0e734d27c0 100644 --- a/icu4c/source/data/brkitr/rules/word.txt +++ b/icu4c/source/data/brkitr/rules/word.txt @@ -11,7 +11,7 @@ # These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # # Note: Updates to word.txt will usually need to be merged into -# word_POSIX.txt also. +# word_POSIX.txt and word_fi_sv.txt also. ############################################################################## # @@ -38,11 +38,11 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; $Format = [\p{Word_Break = Format}]; $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; -$ALetter = [\p{Word_Break = ALetter}]; +$ALetter = [\p{Word_Break = ALetter} @]; $Single_Quote = [\p{Word_Break = Single_Quote}]; $Double_Quote = [\p{Word_Break = Double_Quote}]; $MidNumLet = [\p{Word_Break = MidNumLet}]; -$MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; $MidNum = [\p{Word_Break = MidNum}]; $Numeric = [\p{Word_Break = Numeric}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; diff --git a/icu4c/source/data/brkitr/rules/word_POSIX.txt b/icu4c/source/data/brkitr/rules/word_POSIX.txt index 73ddc8dc19b3..e62fd7fac3b1 100644 --- a/icu4c/source/data/brkitr/rules/word_POSIX.txt +++ b/icu4c/source/data/brkitr/rules/word_POSIX.txt @@ -38,11 +38,11 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; $Format = [\p{Word_Break = Format}]; $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; -$ALetter = [\p{Word_Break = ALetter}]; +$ALetter = [\p{Word_Break = ALetter} @]; $Single_Quote = [\p{Word_Break = Single_Quote}]; 
$Double_Quote = [\p{Word_Break = Double_Quote}]; $MidNumLet = [\p{Word_Break = MidNumLet} - [.]]; -$MidLetter = [\p{Word_Break = MidLetter} - [\:]]; +$MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; $MidNum = [\p{Word_Break = MidNum} [.]]; $Numeric = [\p{Word_Break = Numeric}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; diff --git a/icu4c/source/data/brkitr/rules/word_fi_sv.txt b/icu4c/source/data/brkitr/rules/word_fi_sv.txt new file mode 100644 index 000000000000..544558f91adf --- /dev/null +++ b/icu4c/source/data/brkitr/rules/word_fi_sv.txt @@ -0,0 +1,172 @@ +# +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. +# +# file: word_fi_sv.txt +# +# ICU Word Break Rules +# See Unicode Standard Annex #29. +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 +# +# Note: Updates to word.txt will usually need to be merged into +# word_fi_sv.txt also. + +############################################################################## +# +# Character class definitions from TR 29 +# +############################################################################## + +!!chain; +!!quoted_literals_only; + + +# +# Character Class Definitions. 
+# + +$Han = [:Han:]; + +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter} @]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidLetter = [\p{Word_Break = MidLetter}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; + +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; + + +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. + +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; + +# TODO: check if handling of katakana in dictionary makes rules incorrect/void + +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + + +## ------------------------------------------------- + +# Rule 3 - CR x LF +# +$CR $LF; + +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. +# +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. 
+# +$WSegSpace $WSegSpace; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. + +$ExFm = [$Extend $Format $ZWJ]; + +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. + +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # + +# +# rule 5 +# Do not break between most letters. +# +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 + +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanji, changed +# from 300 to 400. 
+# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. +# +^$Regional_Indicator $ExFm* $Regional_Indicator; + +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found + +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/icu4c/source/data/brkitr/sv.txt b/icu4c/source/data/brkitr/sv.txt new file mode 100644 index 000000000000..09cef533f290 --- /dev/null +++ b/icu4c/source/data/brkitr/sv.txt @@ -0,0 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml +sv{ + boundaries{ + word:process(dependency){"word_fi_sv.brk"} + } +} diff --git a/icu4c/source/data/xml/brkitr/fi.xml b/icu4c/source/data/xml/brkitr/fi.xml new file mode 100644 index 000000000000..5081e7cd4edf --- /dev/null +++ b/icu4c/source/data/xml/brkitr/fi.xml @@ -0,0 +1,26 @@ + + + + %icu; +] +> + + + + + + + + + + + + + + diff --git a/icu4c/source/data/xml/brkitr/sv.xml b/icu4c/source/data/xml/brkitr/sv.xml new file mode 100644 index 000000000000..5f1566942d57 --- /dev/null +++ b/icu4c/source/data/xml/brkitr/sv.xml @@ -0,0 +1,26 @@ + + + + %icu; +] +> + + + + + + + + + + + + + + diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 09d71a7950a3..e3af10926a17 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1264,12 +1264,17 @@ UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char * {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"}, {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"}, {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"}, + + // ICU-22127 until UAX #29 wordbreak is update for the colon changes in ICU-22112, + // need to skip some tests in WordBreakTest.txt + {"22127", "WordBreakTest.txt", u"a:"}, + {"22127", "WordBreakTest.txt", u"A:"}, }; for (int n=0; n -•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \ +•Can't<200> •have<200> •breaks<200> •in<200> •xx<200>:•yy<200> •or<200> •struct.field<200> \ •for<200> •CS<200>-•types<200>.• •\uFF92\uFF76\uFF9E<400> • +•xx@yy<200>.• @@ -1593,6 +1594,21 @@ Bangkok)• •for<200> •CS<200>-•types<200>.• •\u06c9<200>\uc799\ufffa• •\uFF92\uFF76\uFF9E<400> • +•xx@yy<200>.• + + + +•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> 
\ +•for<200> •CS<200>-•types<200>.• +•\uFF92\uFF76\uFF9E<400> • +•xx@yy<200>.• + + + +•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \ +•for<200> •CS<200>-•types<200>.• +•\uFF92\uFF76\uFF9E<400> • +•xx@yy<200>.• # UBreakIteratorType UBRK_CHARACTER, Locale "th" diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 4b3b39be13af..c78dfcea13e4 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf5ab1c8ba2dfa4638a00292e159e22710407f3827327310824d7295371eabbd -size 14166397 +oid sha256:fe0a6ec1cedc0254064bf0b8ab9ea97ced6b7ecb036c968c9b7b1f5e15543493 +size 14171830 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 37e161c9cf3a..ae01fd1a8476 100644 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8a507295001abd5ecaa5ac9e3cb377f21f4fe78610f05ce4bbac8d4b7475d37 -size 93664 +oid sha256:6ec86a03cb8a27a270dc589a433af299c4303b171519c40bf515751d8f726cef +size 93663 diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar index 452debeaa9b1..67e16fecf53b 100644 --- a/icu4j/main/shared/data/testdata.jar +++ b/icu4j/main/shared/data/testdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ec26b963167ffeabcc86f873ecbee723b7aad80bc0c4593dddb5cef89978213 -size 828453 +oid sha256:d95cc6820315fa875b7ecc5b15e2b1bffa0419d2562efd4b252dc0de4ff165b9 +size 828452 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index fc0fa7f546c0..941024309d56 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ 
b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -400,11 +400,11 @@ static class RBBIWordMonkey extends RBBIMonkeyKind { fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]"); fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]"); fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]"); - fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]"); + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter} @]"); fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]"); fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]"); fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]"); - fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]"); + fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]"); fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]"); fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]"); fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]"); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word.txt index 5ace30266c8f..8594055f71f9 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word.txt @@ -25,11 +25,11 @@ Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; Format = [\p{Word_Break = Format}]; Katakana = [\p{Word_Break = Katakana}]; Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; -ALetter = [\p{Word_Break = ALetter}]; +ALetter = [\p{Word_Break = ALetter} @]; Single_Quote = [\p{Word_Break = Single_Quote}]; Double_Quote = [\p{Word_Break = Double_Quote}]; MidNumLet = [\p{Word_Break = MidNumLet}]; -MidLetter = [\p{Word_Break = MidLetter}]; +MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; MidNum = [\p{Word_Break = MidNum}]; Numeric = [\p{Word_Break = 
Numeric}]; ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt index 7fdc1a1ee077..fc4eedeb8bdc 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt @@ -28,7 +28,7 @@ ALetter = [\p{Word_Break = ALetter}]; Single_Quote = [\p{Word_Break = Single_Quote}]; Double_Quote = [\p{Word_Break = Double_Quote}]; MidNumLet = [\p{Word_Break = MidNumLet} - [.]]; -MidLetter = [\p{Word_Break = MidLetter} - [\:]]; +MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; MidNum = [\p{Word_Break = MidNum} [.]]; Numeric = [\p{Word_Break = Numeric}]; ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 132bcb141ad1..6f7555d2f6fe 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1583,9 +1583,10 @@ Bangkok)• -•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \ +•Can't<200> •have<200> •breaks<200> •in<200> •xx<200>:•yy<200> •or<200> •struct.field<200> \ •for<200> •CS<200>-•types<200>.• •\uFF92\uFF76\uFF9E<400> • +•xx@yy<200>.• @@ -1593,6 +1594,21 @@ Bangkok)• •for<200> •CS<200>-•types<200>.• •\u06c9<200>\uc799\ufffa• •\uFF92\uFF76\uFF9E<400> • +•xx@yy<200>.• + + + +•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \ +•for<200> •CS<200>-•types<200>.• +•\uFF92\uFF76\uFF9E<400> • +•xx@yy<200>.• + + + +•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \ +•for<200> •CS<200>-•types<200>.• +•\uFF92\uFF76\uFF9E<400> • +•xx@yy<200>.• # UBreakIteratorType 
UBRK_CHARACTER, Locale "th"