diff --git a/src/etc/unicode.py b/src/etc/unicode.py index a87c755397d94..f1761c5719a52 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -51,6 +51,30 @@ 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], } + +# Grapheme cluster data +# taken from UAX29, http://www.unicode.org/reports/tr29/ +# these code points are excluded from the Control category +# NOTE: CR and LF are also technically excluded, but for +# the sake of convenience we leave them in the Control group +# and manually check them in the appropriate place. This is +# still compliant with the implementation requirements. +grapheme_control_exceptions = set([0x200c, 0x200d]) + +# the Regional_Indicator category +grapheme_regional_indicator = [(0x1f1e6, 0x1f1ff)] + +# "The following ... are specifically excluded" from the SpacingMark category +# http://www.unicode.org/reports/tr29/#SpacingMark +grapheme_spacingmark_exceptions = [(0x102b, 0x102c), (0x1038, 0x1038), + (0x1062, 0x1064), (0x1067, 0x106d), (0x1083, 0x1083), (0x1087, 0x108c), + (0x108f, 0x108f), (0x109a, 0x109c), (0x19b0, 0x19b4), (0x19b8, 0x19b9), + (0x19bb, 0x19c0), (0x19c8, 0x19c9), (0x1a61, 0x1a61), (0x1a63, 0x1a64), + (0xaa7b, 0xaa7b), (0xaa7d, 0xaa7d)] + +# these are included in the SpacingMark category +grapheme_spacingmark_extra = set([0xe33, 0xeb3]) + def fetch(f): if not os.path.exists(f): os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s" @@ -109,7 +133,7 @@ def load_unicode_data(f): canon_decomp[code] = seq # place letter in categories as appropriate - for cat in [gencat] + expanded_categories.get(gencat, []): + for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []): if cat not in gencats: gencats[cat] = [] gencats[cat].append(code) @@ -120,6 +144,12 @@ def load_unicode_data(f): combines[combine] = [] combines[combine].append(code) + # generate Not_Assigned from Assigned + gencats["Cn"] = gen_unassigned(gencats["Assigned"]) + # Assigned is not a real category + del(gencats["Assigned"]) 
+ # Other contains Not_Assigned + gencats["C"].extend(gencats["Cn"]) gencats = group_cats(gencats) combines = to_combines(group_cats(combines)) @@ -155,6 +185,11 @@ def ungroup_cat(cat): lo += 1 return cat_out +def gen_unassigned(assigned): + assigned = set(assigned) + return ([i for i in range(0, 0xd800) if i not in assigned] + + [i for i in range(0xe000, 0x110000) if i not in assigned]) + def to_combines(combs): combs_out = [] for comb in combs: @@ -350,6 +385,45 @@ def emit_conversions_module(f, lowerupper, upperlower): sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False) f.write("}\n\n") +def emit_grapheme_module(f, grapheme_table, grapheme_cats): + f.write("""pub mod grapheme { + use core::option::{Some, None}; + use core::slice::ImmutableVector; + + #[allow(non_camel_case_types)] + #[deriving(Clone)] + pub enum GraphemeCat { +""") + for cat in grapheme_cats + ["Any"]: + f.write(" GC_" + cat + ",\n") + f.write(""" } + + fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat { + use core::cmp::{Equal, Less, Greater}; + match r.bsearch(|&(lo, hi, _)| { + if lo <= c && c <= hi { Equal } + else if hi < c { Less } + else { Greater } + }) { + Some(idx) => { + let (_, _, cat) = r[idx]; + cat + } + None => GC_Any + } + } + + pub fn grapheme_category(c: char) -> GraphemeCat { + bsearch_range_value_table(c, grapheme_cat_table) + } + +""") + + emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]", + pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]), + is_pub=False) + f.write("}\n") + def emit_charwidth_module(f, width_table): f.write("pub mod charwidth {\n") f.write(" use core::option::{Option, Some, None};\n") @@ -388,7 +462,7 @@ def emit_charwidth_module(f, width_table): f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n") emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False, pfun=lambda x: 
"(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3])) - f.write("}\n") + f.write("}\n\n") def emit_norm_module(f, canon, compat, combine): canon_keys = canon.keys() @@ -473,6 +547,8 @@ def remove_from_wtable(wtable, val): wtable_out.extend(wtable) return wtable_out + + def optimize_width_table(wtable): wtable_out = [] w_this = wtable.pop(0) @@ -487,7 +563,7 @@ def optimize_width_table(wtable): return wtable_out if __name__ == "__main__": - r = "unicode.rs" + r = "tables.rs" if os.path.exists(r): os.remove(r) with open(r, "w") as rf: @@ -498,12 +574,18 @@ def optimize_width_table(wtable): (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt") want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"] - other_derived = ["Default_Ignorable_Code_Point"] + other_derived = ["Default_Ignorable_Code_Point", "Grapheme_Extend"] derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived) scripts = load_properties("Scripts.txt", []) props = load_properties("PropList.txt", ["White_Space", "Join_Control", "Noncharacter_Code_Point"]) + # grapheme cluster category from DerivedCoreProperties + # the rest are defined below + grapheme_cats = {} + grapheme_cats["Extend"] = derived["Grapheme_Extend"] + del(derived["Grapheme_Extend"]) + # bsearch_range_table is used in all the property modules below emit_bsearch_range_table(rf) @@ -533,7 +615,7 @@ def optimize_width_table(wtable): emit_norm_module(rf, canon_decomp, compat_decomp, combines) emit_conversions_module(rf, lowerupper, upperlower) - # character width module + ### character width module width_table = [] for zwcat in ["Me", "Mn", "Cf"]: width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat])) @@ -555,3 +637,40 @@ def optimize_width_table(wtable): # optimize the width table by collapsing adjacent entities when possible width_table = optimize_width_table(width_table) emit_charwidth_module(rf, 
width_table) + + ### grapheme cluster module + # from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values + # Hangul syllable categories + want_hangul = ["L", "V", "T", "LV", "LVT"] + grapheme_cats.update(load_properties("HangulSyllableType.txt", want_hangul)) + + # Control + # This category also includes Cs (surrogate codepoints), but Rust's `char`s are + # Unicode Scalar Values only, and surrogates are thus invalid `char`s. + grapheme_cats["Control"] = set() + for cat in ["Zl", "Zp", "Cc", "Cf"]: + grapheme_cats["Control"] |= set(ungroup_cat(gencats[cat])) + grapheme_cats["Control"] = group_cat(list( + grapheme_cats["Control"] + - grapheme_control_exceptions + | (set(ungroup_cat(gencats["Cn"])) + & set(ungroup_cat(derived["Default_Ignorable_Code_Point"]))))) + + # Regional Indicator + grapheme_cats["RegionalIndicator"] = grapheme_regional_indicator + + # Prepend - "Currently there are no characters with this value" + # (from UAX#29, Unicode 7.0) + + # SpacingMark: ((Mc - Extend) | extra) - exceptions ("-" binds tighter than "|") + grapheme_cats["SpacingMark"] = group_cat(list( + (set(ungroup_cat(gencats["Mc"])) + - set(ungroup_cat(grapheme_cats["Extend"])) + | grapheme_spacingmark_extra) + - set(ungroup_cat(grapheme_spacingmark_exceptions)))) + + grapheme_table = [] + for cat in grapheme_cats: + grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]]) + grapheme_table.sort(key=lambda w: w[0]) + emit_grapheme_module(rf, grapheme_table, grapheme_cats.keys()) diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index 798046c461d6c..9e98088cf773d 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -88,7 +88,7 @@ pub use core::str::{eq_slice, is_utf8, is_utf16, Utf16Items}; pub use core::str::{Utf16Item, ScalarValue, LoneSurrogate, utf16_items}; pub use core::str::{truncate_utf16_at_nul, utf8_char_width, CharRange}; pub use core::str::{Str, StrSlice}; -pub use unicode::{Words, UnicodeStrSlice}; +pub use unicode::str::{UnicodeStrSlice, Words, Graphemes, 
GraphemeIndices}; /* Section: Creating a string @@ -284,8 +284,6 @@ pub struct Decompositions<'a> { impl<'a> Iterator for Decompositions<'a> { #[inline] fn next(&mut self) -> Option { - use unicode::canonical_combining_class; - match self.buffer.as_slice().head() { Some(&(c, 0)) => { self.sorted = false; @@ -309,7 +307,7 @@ impl<'a> Iterator for Decompositions<'a> { let buffer = &mut self.buffer; let sorted = &mut self.sorted; decomposer(ch, |d| { - let class = canonical_combining_class(d); + let class = unicode::char::canonical_combining_class(d); if class == 0 && !*sorted { canonical_sort(buffer.as_mut_slice()); *sorted = true; @@ -1021,7 +1019,7 @@ mod tests { use string::String; use vec::Vec; - use unicode::UnicodeChar; + use unicode::char::UnicodeChar; #[test] fn test_eq_slice() { @@ -2110,7 +2108,7 @@ mod tests { assert_eq!("\u0301a".nfkd_chars().collect::(), String::from_str("\u0301a")); assert_eq!("\ud4db".nfkd_chars().collect::(), -String::from_str("\u1111\u1171\u11b6")); + String::from_str("\u1111\u1171\u11b6")); assert_eq!("\uac1c".nfkd_chars().collect::(), String::from_str("\u1100\u1162")); } @@ -2125,6 +2123,286 @@ String::from_str("\u1111\u1171\u11b6")); assert_eq!(lines, vec!["", "Märy häd ä little lämb", "", "Little lämb"]); } + #[test] + fn test_graphemes() { + use std::iter::order; + // official Unicode test data + // from http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt + let test_same = [ + ("\u0020\u0020", &["\u0020", "\u0020"]), ("\u0020\u0308\u0020", &["\u0020\u0308", + "\u0020"]), ("\u0020\u000D", &["\u0020", "\u000D"]), ("\u0020\u0308\u000D", + &["\u0020\u0308", "\u000D"]), ("\u0020\u000A", &["\u0020", "\u000A"]), + ("\u0020\u0308\u000A", &["\u0020\u0308", "\u000A"]), ("\u0020\u0001", &["\u0020", + "\u0001"]), ("\u0020\u0308\u0001", &["\u0020\u0308", "\u0001"]), ("\u0020\u0300", + &["\u0020\u0300"]), ("\u0020\u0308\u0300", &["\u0020\u0308\u0300"]), ("\u0020\u1100", + &["\u0020", "\u1100"]), ("\u0020\u0308\u1100", 
&["\u0020\u0308", "\u1100"]), + ("\u0020\u1160", &["\u0020", "\u1160"]), ("\u0020\u0308\u1160", &["\u0020\u0308", + "\u1160"]), ("\u0020\u11A8", &["\u0020", "\u11A8"]), ("\u0020\u0308\u11A8", + &["\u0020\u0308", "\u11A8"]), ("\u0020\uAC00", &["\u0020", "\uAC00"]), + ("\u0020\u0308\uAC00", &["\u0020\u0308", "\uAC00"]), ("\u0020\uAC01", &["\u0020", + "\uAC01"]), ("\u0020\u0308\uAC01", &["\u0020\u0308", "\uAC01"]), ("\u0020\U0001F1E6", + &["\u0020", "\U0001F1E6"]), ("\u0020\u0308\U0001F1E6", &["\u0020\u0308", + "\U0001F1E6"]), ("\u0020\u0378", &["\u0020", "\u0378"]), ("\u0020\u0308\u0378", + &["\u0020\u0308", "\u0378"]), ("\u000D\u0020", &["\u000D", "\u0020"]), + ("\u000D\u0308\u0020", &["\u000D", "\u0308", "\u0020"]), ("\u000D\u000D", &["\u000D", + "\u000D"]), ("\u000D\u0308\u000D", &["\u000D", "\u0308", "\u000D"]), ("\u000D\u000A", + &["\u000D\u000A"]), ("\u000D\u0308\u000A", &["\u000D", "\u0308", "\u000A"]), + ("\u000D\u0001", &["\u000D", "\u0001"]), ("\u000D\u0308\u0001", &["\u000D", "\u0308", + "\u0001"]), ("\u000D\u0300", &["\u000D", "\u0300"]), ("\u000D\u0308\u0300", + &["\u000D", "\u0308\u0300"]), ("\u000D\u0903", &["\u000D", "\u0903"]), + ("\u000D\u1100", &["\u000D", "\u1100"]), ("\u000D\u0308\u1100", &["\u000D", "\u0308", + "\u1100"]), ("\u000D\u1160", &["\u000D", "\u1160"]), ("\u000D\u0308\u1160", + &["\u000D", "\u0308", "\u1160"]), ("\u000D\u11A8", &["\u000D", "\u11A8"]), + ("\u000D\u0308\u11A8", &["\u000D", "\u0308", "\u11A8"]), ("\u000D\uAC00", &["\u000D", + "\uAC00"]), ("\u000D\u0308\uAC00", &["\u000D", "\u0308", "\uAC00"]), ("\u000D\uAC01", + &["\u000D", "\uAC01"]), ("\u000D\u0308\uAC01", &["\u000D", "\u0308", "\uAC01"]), + ("\u000D\U0001F1E6", &["\u000D", "\U0001F1E6"]), ("\u000D\u0308\U0001F1E6", + &["\u000D", "\u0308", "\U0001F1E6"]), ("\u000D\u0378", &["\u000D", "\u0378"]), + ("\u000D\u0308\u0378", &["\u000D", "\u0308", "\u0378"]), ("\u000A\u0020", &["\u000A", + "\u0020"]), ("\u000A\u0308\u0020", &["\u000A", "\u0308", "\u0020"]), ("\u000A\u000D", + 
&["\u000A", "\u000D"]), ("\u000A\u0308\u000D", &["\u000A", "\u0308", "\u000D"]), + ("\u000A\u000A", &["\u000A", "\u000A"]), ("\u000A\u0308\u000A", &["\u000A", "\u0308", + "\u000A"]), ("\u000A\u0001", &["\u000A", "\u0001"]), ("\u000A\u0308\u0001", + &["\u000A", "\u0308", "\u0001"]), ("\u000A\u0300", &["\u000A", "\u0300"]), + ("\u000A\u0308\u0300", &["\u000A", "\u0308\u0300"]), ("\u000A\u0903", &["\u000A", + "\u0903"]), ("\u000A\u1100", &["\u000A", "\u1100"]), ("\u000A\u0308\u1100", + &["\u000A", "\u0308", "\u1100"]), ("\u000A\u1160", &["\u000A", "\u1160"]), + ("\u000A\u0308\u1160", &["\u000A", "\u0308", "\u1160"]), ("\u000A\u11A8", &["\u000A", + "\u11A8"]), ("\u000A\u0308\u11A8", &["\u000A", "\u0308", "\u11A8"]), ("\u000A\uAC00", + &["\u000A", "\uAC00"]), ("\u000A\u0308\uAC00", &["\u000A", "\u0308", "\uAC00"]), + ("\u000A\uAC01", &["\u000A", "\uAC01"]), ("\u000A\u0308\uAC01", &["\u000A", "\u0308", + "\uAC01"]), ("\u000A\U0001F1E6", &["\u000A", "\U0001F1E6"]), + ("\u000A\u0308\U0001F1E6", &["\u000A", "\u0308", "\U0001F1E6"]), ("\u000A\u0378", + &["\u000A", "\u0378"]), ("\u000A\u0308\u0378", &["\u000A", "\u0308", "\u0378"]), + ("\u0001\u0020", &["\u0001", "\u0020"]), ("\u0001\u0308\u0020", &["\u0001", "\u0308", + "\u0020"]), ("\u0001\u000D", &["\u0001", "\u000D"]), ("\u0001\u0308\u000D", + &["\u0001", "\u0308", "\u000D"]), ("\u0001\u000A", &["\u0001", "\u000A"]), + ("\u0001\u0308\u000A", &["\u0001", "\u0308", "\u000A"]), ("\u0001\u0001", &["\u0001", + "\u0001"]), ("\u0001\u0308\u0001", &["\u0001", "\u0308", "\u0001"]), ("\u0001\u0300", + &["\u0001", "\u0300"]), ("\u0001\u0308\u0300", &["\u0001", "\u0308\u0300"]), + ("\u0001\u0903", &["\u0001", "\u0903"]), ("\u0001\u1100", &["\u0001", "\u1100"]), + ("\u0001\u0308\u1100", &["\u0001", "\u0308", "\u1100"]), ("\u0001\u1160", &["\u0001", + "\u1160"]), ("\u0001\u0308\u1160", &["\u0001", "\u0308", "\u1160"]), ("\u0001\u11A8", + &["\u0001", "\u11A8"]), ("\u0001\u0308\u11A8", &["\u0001", "\u0308", "\u11A8"]), + ("\u0001\uAC00", 
&["\u0001", "\uAC00"]), ("\u0001\u0308\uAC00", &["\u0001", "\u0308", + "\uAC00"]), ("\u0001\uAC01", &["\u0001", "\uAC01"]), ("\u0001\u0308\uAC01", + &["\u0001", "\u0308", "\uAC01"]), ("\u0001\U0001F1E6", &["\u0001", "\U0001F1E6"]), + ("\u0001\u0308\U0001F1E6", &["\u0001", "\u0308", "\U0001F1E6"]), ("\u0001\u0378", + &["\u0001", "\u0378"]), ("\u0001\u0308\u0378", &["\u0001", "\u0308", "\u0378"]), + ("\u0300\u0020", &["\u0300", "\u0020"]), ("\u0300\u0308\u0020", &["\u0300\u0308", + "\u0020"]), ("\u0300\u000D", &["\u0300", "\u000D"]), ("\u0300\u0308\u000D", + &["\u0300\u0308", "\u000D"]), ("\u0300\u000A", &["\u0300", "\u000A"]), + ("\u0300\u0308\u000A", &["\u0300\u0308", "\u000A"]), ("\u0300\u0001", &["\u0300", + "\u0001"]), ("\u0300\u0308\u0001", &["\u0300\u0308", "\u0001"]), ("\u0300\u0300", + &["\u0300\u0300"]), ("\u0300\u0308\u0300", &["\u0300\u0308\u0300"]), ("\u0300\u1100", + &["\u0300", "\u1100"]), ("\u0300\u0308\u1100", &["\u0300\u0308", "\u1100"]), + ("\u0300\u1160", &["\u0300", "\u1160"]), ("\u0300\u0308\u1160", &["\u0300\u0308", + "\u1160"]), ("\u0300\u11A8", &["\u0300", "\u11A8"]), ("\u0300\u0308\u11A8", + &["\u0300\u0308", "\u11A8"]), ("\u0300\uAC00", &["\u0300", "\uAC00"]), + ("\u0300\u0308\uAC00", &["\u0300\u0308", "\uAC00"]), ("\u0300\uAC01", &["\u0300", + "\uAC01"]), ("\u0300\u0308\uAC01", &["\u0300\u0308", "\uAC01"]), ("\u0300\U0001F1E6", + &["\u0300", "\U0001F1E6"]), ("\u0300\u0308\U0001F1E6", &["\u0300\u0308", + "\U0001F1E6"]), ("\u0300\u0378", &["\u0300", "\u0378"]), ("\u0300\u0308\u0378", + &["\u0300\u0308", "\u0378"]), ("\u0903\u0020", &["\u0903", "\u0020"]), + ("\u0903\u0308\u0020", &["\u0903\u0308", "\u0020"]), ("\u0903\u000D", &["\u0903", + "\u000D"]), ("\u0903\u0308\u000D", &["\u0903\u0308", "\u000D"]), ("\u0903\u000A", + &["\u0903", "\u000A"]), ("\u0903\u0308\u000A", &["\u0903\u0308", "\u000A"]), + ("\u0903\u0001", &["\u0903", "\u0001"]), ("\u0903\u0308\u0001", &["\u0903\u0308", + "\u0001"]), ("\u0903\u0300", &["\u0903\u0300"]), 
("\u0903\u0308\u0300", + &["\u0903\u0308\u0300"]), ("\u0903\u1100", &["\u0903", "\u1100"]), + ("\u0903\u0308\u1100", &["\u0903\u0308", "\u1100"]), ("\u0903\u1160", &["\u0903", + "\u1160"]), ("\u0903\u0308\u1160", &["\u0903\u0308", "\u1160"]), ("\u0903\u11A8", + &["\u0903", "\u11A8"]), ("\u0903\u0308\u11A8", &["\u0903\u0308", "\u11A8"]), + ("\u0903\uAC00", &["\u0903", "\uAC00"]), ("\u0903\u0308\uAC00", &["\u0903\u0308", + "\uAC00"]), ("\u0903\uAC01", &["\u0903", "\uAC01"]), ("\u0903\u0308\uAC01", + &["\u0903\u0308", "\uAC01"]), ("\u0903\U0001F1E6", &["\u0903", "\U0001F1E6"]), + ("\u0903\u0308\U0001F1E6", &["\u0903\u0308", "\U0001F1E6"]), ("\u0903\u0378", + &["\u0903", "\u0378"]), ("\u0903\u0308\u0378", &["\u0903\u0308", "\u0378"]), + ("\u1100\u0020", &["\u1100", "\u0020"]), ("\u1100\u0308\u0020", &["\u1100\u0308", + "\u0020"]), ("\u1100\u000D", &["\u1100", "\u000D"]), ("\u1100\u0308\u000D", + &["\u1100\u0308", "\u000D"]), ("\u1100\u000A", &["\u1100", "\u000A"]), + ("\u1100\u0308\u000A", &["\u1100\u0308", "\u000A"]), ("\u1100\u0001", &["\u1100", + "\u0001"]), ("\u1100\u0308\u0001", &["\u1100\u0308", "\u0001"]), ("\u1100\u0300", + &["\u1100\u0300"]), ("\u1100\u0308\u0300", &["\u1100\u0308\u0300"]), ("\u1100\u1100", + &["\u1100\u1100"]), ("\u1100\u0308\u1100", &["\u1100\u0308", "\u1100"]), + ("\u1100\u1160", &["\u1100\u1160"]), ("\u1100\u0308\u1160", &["\u1100\u0308", + "\u1160"]), ("\u1100\u11A8", &["\u1100", "\u11A8"]), ("\u1100\u0308\u11A8", + &["\u1100\u0308", "\u11A8"]), ("\u1100\uAC00", &["\u1100\uAC00"]), + ("\u1100\u0308\uAC00", &["\u1100\u0308", "\uAC00"]), ("\u1100\uAC01", + &["\u1100\uAC01"]), ("\u1100\u0308\uAC01", &["\u1100\u0308", "\uAC01"]), + ("\u1100\U0001F1E6", &["\u1100", "\U0001F1E6"]), ("\u1100\u0308\U0001F1E6", + &["\u1100\u0308", "\U0001F1E6"]), ("\u1100\u0378", &["\u1100", "\u0378"]), + ("\u1100\u0308\u0378", &["\u1100\u0308", "\u0378"]), ("\u1160\u0020", &["\u1160", + "\u0020"]), ("\u1160\u0308\u0020", &["\u1160\u0308", "\u0020"]), 
("\u1160\u000D", + &["\u1160", "\u000D"]), ("\u1160\u0308\u000D", &["\u1160\u0308", "\u000D"]), + ("\u1160\u000A", &["\u1160", "\u000A"]), ("\u1160\u0308\u000A", &["\u1160\u0308", + "\u000A"]), ("\u1160\u0001", &["\u1160", "\u0001"]), ("\u1160\u0308\u0001", + &["\u1160\u0308", "\u0001"]), ("\u1160\u0300", &["\u1160\u0300"]), + ("\u1160\u0308\u0300", &["\u1160\u0308\u0300"]), ("\u1160\u1100", &["\u1160", + "\u1100"]), ("\u1160\u0308\u1100", &["\u1160\u0308", "\u1100"]), ("\u1160\u1160", + &["\u1160\u1160"]), ("\u1160\u0308\u1160", &["\u1160\u0308", "\u1160"]), + ("\u1160\u11A8", &["\u1160\u11A8"]), ("\u1160\u0308\u11A8", &["\u1160\u0308", + "\u11A8"]), ("\u1160\uAC00", &["\u1160", "\uAC00"]), ("\u1160\u0308\uAC00", + &["\u1160\u0308", "\uAC00"]), ("\u1160\uAC01", &["\u1160", "\uAC01"]), + ("\u1160\u0308\uAC01", &["\u1160\u0308", "\uAC01"]), ("\u1160\U0001F1E6", &["\u1160", + "\U0001F1E6"]), ("\u1160\u0308\U0001F1E6", &["\u1160\u0308", "\U0001F1E6"]), + ("\u1160\u0378", &["\u1160", "\u0378"]), ("\u1160\u0308\u0378", &["\u1160\u0308", + "\u0378"]), ("\u11A8\u0020", &["\u11A8", "\u0020"]), ("\u11A8\u0308\u0020", + &["\u11A8\u0308", "\u0020"]), ("\u11A8\u000D", &["\u11A8", "\u000D"]), + ("\u11A8\u0308\u000D", &["\u11A8\u0308", "\u000D"]), ("\u11A8\u000A", &["\u11A8", + "\u000A"]), ("\u11A8\u0308\u000A", &["\u11A8\u0308", "\u000A"]), ("\u11A8\u0001", + &["\u11A8", "\u0001"]), ("\u11A8\u0308\u0001", &["\u11A8\u0308", "\u0001"]), + ("\u11A8\u0300", &["\u11A8\u0300"]), ("\u11A8\u0308\u0300", &["\u11A8\u0308\u0300"]), + ("\u11A8\u1100", &["\u11A8", "\u1100"]), ("\u11A8\u0308\u1100", &["\u11A8\u0308", + "\u1100"]), ("\u11A8\u1160", &["\u11A8", "\u1160"]), ("\u11A8\u0308\u1160", + &["\u11A8\u0308", "\u1160"]), ("\u11A8\u11A8", &["\u11A8\u11A8"]), + ("\u11A8\u0308\u11A8", &["\u11A8\u0308", "\u11A8"]), ("\u11A8\uAC00", &["\u11A8", + "\uAC00"]), ("\u11A8\u0308\uAC00", &["\u11A8\u0308", "\uAC00"]), ("\u11A8\uAC01", + &["\u11A8", "\uAC01"]), ("\u11A8\u0308\uAC01", &["\u11A8\u0308", 
"\uAC01"]), + ("\u11A8\U0001F1E6", &["\u11A8", "\U0001F1E6"]), ("\u11A8\u0308\U0001F1E6", + &["\u11A8\u0308", "\U0001F1E6"]), ("\u11A8\u0378", &["\u11A8", "\u0378"]), + ("\u11A8\u0308\u0378", &["\u11A8\u0308", "\u0378"]), ("\uAC00\u0020", &["\uAC00", + "\u0020"]), ("\uAC00\u0308\u0020", &["\uAC00\u0308", "\u0020"]), ("\uAC00\u000D", + &["\uAC00", "\u000D"]), ("\uAC00\u0308\u000D", &["\uAC00\u0308", "\u000D"]), + ("\uAC00\u000A", &["\uAC00", "\u000A"]), ("\uAC00\u0308\u000A", &["\uAC00\u0308", + "\u000A"]), ("\uAC00\u0001", &["\uAC00", "\u0001"]), ("\uAC00\u0308\u0001", + &["\uAC00\u0308", "\u0001"]), ("\uAC00\u0300", &["\uAC00\u0300"]), + ("\uAC00\u0308\u0300", &["\uAC00\u0308\u0300"]), ("\uAC00\u1100", &["\uAC00", + "\u1100"]), ("\uAC00\u0308\u1100", &["\uAC00\u0308", "\u1100"]), ("\uAC00\u1160", + &["\uAC00\u1160"]), ("\uAC00\u0308\u1160", &["\uAC00\u0308", "\u1160"]), + ("\uAC00\u11A8", &["\uAC00\u11A8"]), ("\uAC00\u0308\u11A8", &["\uAC00\u0308", + "\u11A8"]), ("\uAC00\uAC00", &["\uAC00", "\uAC00"]), ("\uAC00\u0308\uAC00", + &["\uAC00\u0308", "\uAC00"]), ("\uAC00\uAC01", &["\uAC00", "\uAC01"]), + ("\uAC00\u0308\uAC01", &["\uAC00\u0308", "\uAC01"]), ("\uAC00\U0001F1E6", &["\uAC00", + "\U0001F1E6"]), ("\uAC00\u0308\U0001F1E6", &["\uAC00\u0308", "\U0001F1E6"]), + ("\uAC00\u0378", &["\uAC00", "\u0378"]), ("\uAC00\u0308\u0378", &["\uAC00\u0308", + "\u0378"]), ("\uAC01\u0020", &["\uAC01", "\u0020"]), ("\uAC01\u0308\u0020", + &["\uAC01\u0308", "\u0020"]), ("\uAC01\u000D", &["\uAC01", "\u000D"]), + ("\uAC01\u0308\u000D", &["\uAC01\u0308", "\u000D"]), ("\uAC01\u000A", &["\uAC01", + "\u000A"]), ("\uAC01\u0308\u000A", &["\uAC01\u0308", "\u000A"]), ("\uAC01\u0001", + &["\uAC01", "\u0001"]), ("\uAC01\u0308\u0001", &["\uAC01\u0308", "\u0001"]), + ("\uAC01\u0300", &["\uAC01\u0300"]), ("\uAC01\u0308\u0300", &["\uAC01\u0308\u0300"]), + ("\uAC01\u1100", &["\uAC01", "\u1100"]), ("\uAC01\u0308\u1100", &["\uAC01\u0308", + "\u1100"]), ("\uAC01\u1160", &["\uAC01", "\u1160"]), 
("\uAC01\u0308\u1160", + &["\uAC01\u0308", "\u1160"]), ("\uAC01\u11A8", &["\uAC01\u11A8"]), + ("\uAC01\u0308\u11A8", &["\uAC01\u0308", "\u11A8"]), ("\uAC01\uAC00", &["\uAC01", + "\uAC00"]), ("\uAC01\u0308\uAC00", &["\uAC01\u0308", "\uAC00"]), ("\uAC01\uAC01", + &["\uAC01", "\uAC01"]), ("\uAC01\u0308\uAC01", &["\uAC01\u0308", "\uAC01"]), + ("\uAC01\U0001F1E6", &["\uAC01", "\U0001F1E6"]), ("\uAC01\u0308\U0001F1E6", + &["\uAC01\u0308", "\U0001F1E6"]), ("\uAC01\u0378", &["\uAC01", "\u0378"]), + ("\uAC01\u0308\u0378", &["\uAC01\u0308", "\u0378"]), ("\U0001F1E6\u0020", + &["\U0001F1E6", "\u0020"]), ("\U0001F1E6\u0308\u0020", &["\U0001F1E6\u0308", + "\u0020"]), ("\U0001F1E6\u000D", &["\U0001F1E6", "\u000D"]), + ("\U0001F1E6\u0308\u000D", &["\U0001F1E6\u0308", "\u000D"]), ("\U0001F1E6\u000A", + &["\U0001F1E6", "\u000A"]), ("\U0001F1E6\u0308\u000A", &["\U0001F1E6\u0308", + "\u000A"]), ("\U0001F1E6\u0001", &["\U0001F1E6", "\u0001"]), + ("\U0001F1E6\u0308\u0001", &["\U0001F1E6\u0308", "\u0001"]), ("\U0001F1E6\u0300", + &["\U0001F1E6\u0300"]), ("\U0001F1E6\u0308\u0300", &["\U0001F1E6\u0308\u0300"]), + ("\U0001F1E6\u1100", &["\U0001F1E6", "\u1100"]), ("\U0001F1E6\u0308\u1100", + &["\U0001F1E6\u0308", "\u1100"]), ("\U0001F1E6\u1160", &["\U0001F1E6", "\u1160"]), + ("\U0001F1E6\u0308\u1160", &["\U0001F1E6\u0308", "\u1160"]), ("\U0001F1E6\u11A8", + &["\U0001F1E6", "\u11A8"]), ("\U0001F1E6\u0308\u11A8", &["\U0001F1E6\u0308", + "\u11A8"]), ("\U0001F1E6\uAC00", &["\U0001F1E6", "\uAC00"]), + ("\U0001F1E6\u0308\uAC00", &["\U0001F1E6\u0308", "\uAC00"]), ("\U0001F1E6\uAC01", + &["\U0001F1E6", "\uAC01"]), ("\U0001F1E6\u0308\uAC01", &["\U0001F1E6\u0308", + "\uAC01"]), ("\U0001F1E6\U0001F1E6", &["\U0001F1E6\U0001F1E6"]), + ("\U0001F1E6\u0308\U0001F1E6", &["\U0001F1E6\u0308", "\U0001F1E6"]), + ("\U0001F1E6\u0378", &["\U0001F1E6", "\u0378"]), ("\U0001F1E6\u0308\u0378", + &["\U0001F1E6\u0308", "\u0378"]), ("\u0378\u0020", &["\u0378", "\u0020"]), + ("\u0378\u0308\u0020", &["\u0378\u0308", 
"\u0020"]), ("\u0378\u000D", &["\u0378", + "\u000D"]), ("\u0378\u0308\u000D", &["\u0378\u0308", "\u000D"]), ("\u0378\u000A", + &["\u0378", "\u000A"]), ("\u0378\u0308\u000A", &["\u0378\u0308", "\u000A"]), + ("\u0378\u0001", &["\u0378", "\u0001"]), ("\u0378\u0308\u0001", &["\u0378\u0308", + "\u0001"]), ("\u0378\u0300", &["\u0378\u0300"]), ("\u0378\u0308\u0300", + &["\u0378\u0308\u0300"]), ("\u0378\u1100", &["\u0378", "\u1100"]), + ("\u0378\u0308\u1100", &["\u0378\u0308", "\u1100"]), ("\u0378\u1160", &["\u0378", + "\u1160"]), ("\u0378\u0308\u1160", &["\u0378\u0308", "\u1160"]), ("\u0378\u11A8", + &["\u0378", "\u11A8"]), ("\u0378\u0308\u11A8", &["\u0378\u0308", "\u11A8"]), + ("\u0378\uAC00", &["\u0378", "\uAC00"]), ("\u0378\u0308\uAC00", &["\u0378\u0308", + "\uAC00"]), ("\u0378\uAC01", &["\u0378", "\uAC01"]), ("\u0378\u0308\uAC01", + &["\u0378\u0308", "\uAC01"]), ("\u0378\U0001F1E6", &["\u0378", "\U0001F1E6"]), + ("\u0378\u0308\U0001F1E6", &["\u0378\u0308", "\U0001F1E6"]), ("\u0378\u0378", + &["\u0378", "\u0378"]), ("\u0378\u0308\u0378", &["\u0378\u0308", "\u0378"]), + ("\u0061\U0001F1E6\u0062", &["\u0061", "\U0001F1E6", "\u0062"]), + ("\U0001F1F7\U0001F1FA", &["\U0001F1F7\U0001F1FA"]), + ("\U0001F1F7\U0001F1FA\U0001F1F8", &["\U0001F1F7\U0001F1FA\U0001F1F8"]), + ("\U0001F1F7\U0001F1FA\U0001F1F8\U0001F1EA", + &["\U0001F1F7\U0001F1FA\U0001F1F8\U0001F1EA"]), + ("\U0001F1F7\U0001F1FA\u200B\U0001F1F8\U0001F1EA", &["\U0001F1F7\U0001F1FA", "\u200B", + "\U0001F1F8\U0001F1EA"]), ("\U0001F1E6\U0001F1E7\U0001F1E8", + &["\U0001F1E6\U0001F1E7\U0001F1E8"]), ("\U0001F1E6\u200D\U0001F1E7\U0001F1E8", + &["\U0001F1E6\u200D", "\U0001F1E7\U0001F1E8"]), + ("\U0001F1E6\U0001F1E7\u200D\U0001F1E8", &["\U0001F1E6\U0001F1E7\u200D", + "\U0001F1E8"]), ("\u0020\u200D\u0646", &["\u0020\u200D", "\u0646"]), + ("\u0646\u200D\u0020", &["\u0646\u200D", "\u0020"]), + ]; + + let test_diff = [ + ("\u0020\u0903", &["\u0020\u0903"], &["\u0020", "\u0903"]), ("\u0020\u0308\u0903", + &["\u0020\u0308\u0903"], 
&["\u0020\u0308", "\u0903"]), ("\u000D\u0308\u0903", + &["\u000D", "\u0308\u0903"], &["\u000D", "\u0308", "\u0903"]), ("\u000A\u0308\u0903", + &["\u000A", "\u0308\u0903"], &["\u000A", "\u0308", "\u0903"]), ("\u0001\u0308\u0903", + &["\u0001", "\u0308\u0903"], &["\u0001", "\u0308", "\u0903"]), ("\u0300\u0903", + &["\u0300\u0903"], &["\u0300", "\u0903"]), ("\u0300\u0308\u0903", + &["\u0300\u0308\u0903"], &["\u0300\u0308", "\u0903"]), ("\u0903\u0903", + &["\u0903\u0903"], &["\u0903", "\u0903"]), ("\u0903\u0308\u0903", + &["\u0903\u0308\u0903"], &["\u0903\u0308", "\u0903"]), ("\u1100\u0903", + &["\u1100\u0903"], &["\u1100", "\u0903"]), ("\u1100\u0308\u0903", + &["\u1100\u0308\u0903"], &["\u1100\u0308", "\u0903"]), ("\u1160\u0903", + &["\u1160\u0903"], &["\u1160", "\u0903"]), ("\u1160\u0308\u0903", + &["\u1160\u0308\u0903"], &["\u1160\u0308", "\u0903"]), ("\u11A8\u0903", + &["\u11A8\u0903"], &["\u11A8", "\u0903"]), ("\u11A8\u0308\u0903", + &["\u11A8\u0308\u0903"], &["\u11A8\u0308", "\u0903"]), ("\uAC00\u0903", + &["\uAC00\u0903"], &["\uAC00", "\u0903"]), ("\uAC00\u0308\u0903", + &["\uAC00\u0308\u0903"], &["\uAC00\u0308", "\u0903"]), ("\uAC01\u0903", + &["\uAC01\u0903"], &["\uAC01", "\u0903"]), ("\uAC01\u0308\u0903", + &["\uAC01\u0308\u0903"], &["\uAC01\u0308", "\u0903"]), ("\U0001F1E6\u0903", + &["\U0001F1E6\u0903"], &["\U0001F1E6", "\u0903"]), ("\U0001F1E6\u0308\u0903", + &["\U0001F1E6\u0308\u0903"], &["\U0001F1E6\u0308", "\u0903"]), ("\u0378\u0903", + &["\u0378\u0903"], &["\u0378", "\u0903"]), ("\u0378\u0308\u0903", + &["\u0378\u0308\u0903"], &["\u0378\u0308", "\u0903"]), + ]; + + for &(s, g) in test_same.iter() { + // test forward iterator + assert!(order::equals(s.graphemes(true), g.iter().map(|&x| x))); + assert!(order::equals(s.graphemes(false), g.iter().map(|&x| x))); + + // test reverse iterator + assert!(order::equals(s.graphemes(true).rev(), g.iter().rev().map(|&x| x))); + assert!(order::equals(s.graphemes(false).rev(), g.iter().rev().map(|&x| x))); + } + + 
for &(s, gt, gf) in test_diff.iter() { + // test forward iterator + assert!(order::equals(s.graphemes(true), gt.iter().map(|&x| x))); + assert!(order::equals(s.graphemes(false), gf.iter().map(|&x| x))); + + // test reverse iterator + assert!(order::equals(s.graphemes(true).rev(), gt.iter().rev().map(|&x| x))); + assert!(order::equals(s.graphemes(false).rev(), gf.iter().rev().map(|&x| x))); + } + + // test the indices iterators + let s = "a̐éö̲\r\n"; + let gr_inds = s.grapheme_indices(true).collect::>(); + assert_eq!(gr_inds.as_slice(), &[(0u, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]); + let gr_inds = s.grapheme_indices(true).rev().collect::>(); + assert_eq!(gr_inds.as_slice(), &[(11, "\r\n"), (6, "ö̲"), (3, "é"), (0u, "a̐")]); + let mut gr_inds = s.grapheme_indices(true); + let e1 = gr_inds.size_hint(); + assert_eq!(e1, (1, Some(13))); + let c = gr_inds.count(); + assert_eq!(c, 4); + let e2 = gr_inds.size_hint(); + assert_eq!(e2, (0, Some(0))); + + // make sure the reverse iterator does the right thing with "\n" at beginning of string + let s = "\n\r\n\r"; + let gr = s.graphemes(true).rev().collect::>(); + assert_eq!(gr.as_slice(), &["\r", "\r\n", "\n"]); + } + #[test] fn test_split_strator() { fn t(s: &str, sep: &str, u: &[&str]) { diff --git a/src/libstd/io/mod.rs b/src/libstd/io/mod.rs index 6ac092fd8c657..c1c918ad3b318 100644 --- a/src/libstd/io/mod.rs +++ b/src/libstd/io/mod.rs @@ -237,7 +237,7 @@ use str::{Str, StrSlice}; use str; use string::String; use uint; -use unicode::UnicodeChar; +use unicode::char::UnicodeChar; use vec::Vec; // Reexports diff --git a/src/libstd/path/windows.rs b/src/libstd/path/windows.rs index 88ae0d4837e56..02d9dc4448984 100644 --- a/src/libstd/path/windows.rs +++ b/src/libstd/path/windows.rs @@ -24,7 +24,7 @@ use option::{Option, Some, None}; use slice::{Vector, ImmutableVector}; use str::{CharSplits, Str, StrAllocating, StrVector, StrSlice}; use string::String; -use unicode::UnicodeChar; +use unicode::char::UnicodeChar; use 
vec::Vec; use super::{contains_nul, BytesContainer, GenericPath, GenericPathUnsafe}; diff --git a/src/libstd/prelude.rs b/src/libstd/prelude.rs index a20ac112ac52c..eee494c7bc0a1 100644 --- a/src/libstd/prelude.rs +++ b/src/libstd/prelude.rs @@ -59,7 +59,7 @@ #[doc(no_inline)] pub use ascii::{Ascii, AsciiCast, OwnedAsciiCast, AsciiStr}; #[doc(no_inline)] pub use ascii::IntoBytes; #[doc(no_inline)] pub use c_str::ToCStr; -#[doc(no_inline)] pub use char::Char; +#[doc(no_inline)] pub use char::{Char, UnicodeChar}; #[doc(no_inline)] pub use clone::Clone; #[doc(no_inline)] pub use cmp::{PartialEq, PartialOrd, Eq, Ord}; #[doc(no_inline)] pub use cmp::{Ordering, Less, Equal, Greater, Equiv}; @@ -77,7 +77,7 @@ #[doc(no_inline)] pub use ptr::RawPtr; #[doc(no_inline)] pub use io::{Buffer, Writer, Reader, Seek}; #[doc(no_inline)] pub use str::{Str, StrVector, StrSlice, OwnedStr}; -#[doc(no_inline)] pub use str::{IntoMaybeOwned, StrAllocating}; +#[doc(no_inline)] pub use str::{IntoMaybeOwned, StrAllocating, UnicodeStrSlice}; #[doc(no_inline)] pub use to_str::{ToString, IntoStr}; #[doc(no_inline)] pub use tuple::{Tuple1, Tuple2, Tuple3, Tuple4}; #[doc(no_inline)] pub use tuple::{Tuple5, Tuple6, Tuple7, Tuple8}; @@ -89,7 +89,6 @@ #[doc(no_inline)] pub use slice::{Vector, VectorVector}; #[doc(no_inline)] pub use slice::MutableVectorAllocating; #[doc(no_inline)] pub use string::String; -#[doc(no_inline)] pub use unicode::{UnicodeChar, UnicodeStrSlice}; #[doc(no_inline)] pub use vec::Vec; // Reexported runtime types diff --git a/src/libstd/rt/backtrace.rs b/src/libstd/rt/backtrace.rs index 09922b5ad7615..1594e3ba0a248 100644 --- a/src/libstd/rt/backtrace.rs +++ b/src/libstd/rt/backtrace.rs @@ -21,7 +21,7 @@ use os; use result::{Ok, Err}; use str::StrSlice; use sync::atomics; -use unicode::UnicodeChar; +use unicode::char::UnicodeChar; pub use self::imp::write; diff --git a/src/libunicode/lib.rs b/src/libunicode/lib.rs index 608bdbfaf0d12..13e54ed3c977b 100644 --- 
a/src/libunicode/lib.rs +++ b/src/libunicode/lib.rs @@ -33,13 +33,9 @@ extern crate core; -pub use tables::normalization::canonical_combining_class; +// regex module pub use tables::regex; -pub use u_char::UnicodeChar; -pub use u_str::UnicodeStrSlice; -pub use u_str::Words; - mod decompose; mod tables; mod u_char; @@ -66,11 +62,22 @@ pub mod char { pub use core::char::{from_digit, escape_unicode, escape_default}; pub use core::char::{len_utf8_bytes, Char}; - pub use decompose::decompose_canonical; - pub use decompose::decompose_compatible; + pub use decompose::{decompose_canonical, decompose_compatible}; + + pub use tables::normalization::canonical_combining_class; pub use u_char::{is_alphabetic, is_XID_start, is_XID_continue}; pub use u_char::{is_lowercase, is_uppercase, is_whitespace}; pub use u_char::{is_alphanumeric, is_control, is_digit}; pub use u_char::{to_uppercase, to_lowercase, width, UnicodeChar}; } + +pub mod str { + pub use u_str::{UnicodeStrSlice, Words, Graphemes, GraphemeIndices}; +} + +// this lets us use #[deriving(Clone)] +mod std { + pub use core::clone; + pub use core::cmp; +} diff --git a/src/libunicode/tables.rs b/src/libunicode/tables.rs index 7f59656ba399a..650f93abd3e93 100644 --- a/src/libunicode/tables.rs +++ b/src/libunicode/tables.rs @@ -25,13 +25,178 @@ fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool { pub mod general_category { pub static C_table: &'static [(char, char)] = &[ - ('\x00', '\x1f'), ('\x7f', '\x9f'), ('\xad', '\xad'), ('\u0600', '\u0605'), ('\u061c', - '\u061c'), ('\u06dd', '\u06dd'), ('\u070f', '\u070f'), ('\u180e', '\u180e'), ('\u200b', - '\u200f'), ('\u202a', '\u202e'), ('\u2060', '\u2064'), ('\u2066', '\u206f'), ('\ue000', - '\ue000'), ('\uf8ff', '\uf8ff'), ('\ufeff', '\ufeff'), ('\ufff9', '\ufffb'), ('\U000110bd', - '\U000110bd'), ('\U0001bca0', '\U0001bca3'), ('\U0001d173', '\U0001d17a'), ('\U000e0001', - '\U000e0001'), ('\U000e0020', '\U000e007f'), ('\U000f0000', '\U000f0000'), ('\U000ffffd', - 
'\U000ffffd'), ('\U00100000', '\U00100000'), ('\U0010fffd', '\U0010fffd') + ('\x00', '\x1f'), ('\x7f', '\x9f'), ('\xad', '\xad'), ('\u0378', '\u0379'), ('\u0380', + '\u0383'), ('\u038b', '\u038b'), ('\u038d', '\u038d'), ('\u03a2', '\u03a2'), ('\u0530', + '\u0530'), ('\u0557', '\u0558'), ('\u0560', '\u0560'), ('\u0588', '\u0588'), ('\u058b', + '\u058c'), ('\u0590', '\u0590'), ('\u05c8', '\u05cf'), ('\u05eb', '\u05ef'), ('\u05f5', + '\u0605'), ('\u061c', '\u061d'), ('\u06dd', '\u06dd'), ('\u070e', '\u070f'), ('\u074b', + '\u074c'), ('\u07b2', '\u07bf'), ('\u07fb', '\u07ff'), ('\u082e', '\u082f'), ('\u083f', + '\u083f'), ('\u085c', '\u085d'), ('\u085f', '\u089f'), ('\u08b3', '\u08e3'), ('\u0984', + '\u0984'), ('\u098d', '\u098e'), ('\u0991', '\u0992'), ('\u09a9', '\u09a9'), ('\u09b1', + '\u09b1'), ('\u09b3', '\u09b5'), ('\u09ba', '\u09bb'), ('\u09c5', '\u09c6'), ('\u09c9', + '\u09ca'), ('\u09cf', '\u09d6'), ('\u09d8', '\u09db'), ('\u09de', '\u09de'), ('\u09e4', + '\u09e5'), ('\u09fc', '\u0a00'), ('\u0a04', '\u0a04'), ('\u0a0b', '\u0a0e'), ('\u0a11', + '\u0a12'), ('\u0a29', '\u0a29'), ('\u0a31', '\u0a31'), ('\u0a34', '\u0a34'), ('\u0a37', + '\u0a37'), ('\u0a3a', '\u0a3b'), ('\u0a3d', '\u0a3d'), ('\u0a43', '\u0a46'), ('\u0a49', + '\u0a4a'), ('\u0a4e', '\u0a50'), ('\u0a52', '\u0a58'), ('\u0a5d', '\u0a5d'), ('\u0a5f', + '\u0a65'), ('\u0a76', '\u0a80'), ('\u0a84', '\u0a84'), ('\u0a8e', '\u0a8e'), ('\u0a92', + '\u0a92'), ('\u0aa9', '\u0aa9'), ('\u0ab1', '\u0ab1'), ('\u0ab4', '\u0ab4'), ('\u0aba', + '\u0abb'), ('\u0ac6', '\u0ac6'), ('\u0aca', '\u0aca'), ('\u0ace', '\u0acf'), ('\u0ad1', + '\u0adf'), ('\u0ae4', '\u0ae5'), ('\u0af2', '\u0b00'), ('\u0b04', '\u0b04'), ('\u0b0d', + '\u0b0e'), ('\u0b11', '\u0b12'), ('\u0b29', '\u0b29'), ('\u0b31', '\u0b31'), ('\u0b34', + '\u0b34'), ('\u0b3a', '\u0b3b'), ('\u0b45', '\u0b46'), ('\u0b49', '\u0b4a'), ('\u0b4e', + '\u0b55'), ('\u0b58', '\u0b5b'), ('\u0b5e', '\u0b5e'), ('\u0b64', '\u0b65'), ('\u0b78', + '\u0b81'), ('\u0b84', '\u0b84'), 
('\u0b8b', '\u0b8d'), ('\u0b91', '\u0b91'), ('\u0b96', + '\u0b98'), ('\u0b9b', '\u0b9b'), ('\u0b9d', '\u0b9d'), ('\u0ba0', '\u0ba2'), ('\u0ba5', + '\u0ba7'), ('\u0bab', '\u0bad'), ('\u0bba', '\u0bbd'), ('\u0bc3', '\u0bc5'), ('\u0bc9', + '\u0bc9'), ('\u0bce', '\u0bcf'), ('\u0bd1', '\u0bd6'), ('\u0bd8', '\u0be5'), ('\u0bfb', + '\u0bff'), ('\u0c04', '\u0c04'), ('\u0c0d', '\u0c0d'), ('\u0c11', '\u0c11'), ('\u0c29', + '\u0c29'), ('\u0c3a', '\u0c3c'), ('\u0c45', '\u0c45'), ('\u0c49', '\u0c49'), ('\u0c4e', + '\u0c54'), ('\u0c57', '\u0c57'), ('\u0c5a', '\u0c5f'), ('\u0c64', '\u0c65'), ('\u0c70', + '\u0c77'), ('\u0c80', '\u0c80'), ('\u0c84', '\u0c84'), ('\u0c8d', '\u0c8d'), ('\u0c91', + '\u0c91'), ('\u0ca9', '\u0ca9'), ('\u0cb4', '\u0cb4'), ('\u0cba', '\u0cbb'), ('\u0cc5', + '\u0cc5'), ('\u0cc9', '\u0cc9'), ('\u0cce', '\u0cd4'), ('\u0cd7', '\u0cdd'), ('\u0cdf', + '\u0cdf'), ('\u0ce4', '\u0ce5'), ('\u0cf0', '\u0cf0'), ('\u0cf3', '\u0d00'), ('\u0d04', + '\u0d04'), ('\u0d0d', '\u0d0d'), ('\u0d11', '\u0d11'), ('\u0d3b', '\u0d3c'), ('\u0d45', + '\u0d45'), ('\u0d49', '\u0d49'), ('\u0d4f', '\u0d56'), ('\u0d58', '\u0d5f'), ('\u0d64', + '\u0d65'), ('\u0d76', '\u0d78'), ('\u0d80', '\u0d81'), ('\u0d84', '\u0d84'), ('\u0d97', + '\u0d99'), ('\u0db2', '\u0db2'), ('\u0dbc', '\u0dbc'), ('\u0dbe', '\u0dbf'), ('\u0dc7', + '\u0dc9'), ('\u0dcb', '\u0dce'), ('\u0dd5', '\u0dd5'), ('\u0dd7', '\u0dd7'), ('\u0de0', + '\u0de5'), ('\u0df0', '\u0df1'), ('\u0df5', '\u0e00'), ('\u0e3b', '\u0e3e'), ('\u0e5c', + '\u0e80'), ('\u0e83', '\u0e83'), ('\u0e85', '\u0e86'), ('\u0e89', '\u0e89'), ('\u0e8b', + '\u0e8c'), ('\u0e8e', '\u0e93'), ('\u0e98', '\u0e98'), ('\u0ea0', '\u0ea0'), ('\u0ea4', + '\u0ea4'), ('\u0ea6', '\u0ea6'), ('\u0ea8', '\u0ea9'), ('\u0eac', '\u0eac'), ('\u0eba', + '\u0eba'), ('\u0ebe', '\u0ebf'), ('\u0ec5', '\u0ec5'), ('\u0ec7', '\u0ec7'), ('\u0ece', + '\u0ecf'), ('\u0eda', '\u0edb'), ('\u0ee0', '\u0eff'), ('\u0f48', '\u0f48'), ('\u0f6d', + '\u0f70'), ('\u0f98', '\u0f98'), ('\u0fbd', 
'\u0fbd'), ('\u0fcd', '\u0fcd'), ('\u0fdb', + '\u0fff'), ('\u10c6', '\u10c6'), ('\u10c8', '\u10cc'), ('\u10ce', '\u10cf'), ('\u1249', + '\u1249'), ('\u124e', '\u124f'), ('\u1257', '\u1257'), ('\u1259', '\u1259'), ('\u125e', + '\u125f'), ('\u1289', '\u1289'), ('\u128e', '\u128f'), ('\u12b1', '\u12b1'), ('\u12b6', + '\u12b7'), ('\u12bf', '\u12bf'), ('\u12c1', '\u12c1'), ('\u12c6', '\u12c7'), ('\u12d7', + '\u12d7'), ('\u1311', '\u1311'), ('\u1316', '\u1317'), ('\u135b', '\u135c'), ('\u137d', + '\u137f'), ('\u139a', '\u139f'), ('\u13f5', '\u13ff'), ('\u169d', '\u169f'), ('\u16f9', + '\u16ff'), ('\u170d', '\u170d'), ('\u1715', '\u171f'), ('\u1737', '\u173f'), ('\u1754', + '\u175f'), ('\u176d', '\u176d'), ('\u1771', '\u1771'), ('\u1774', '\u177f'), ('\u17de', + '\u17df'), ('\u17ea', '\u17ef'), ('\u17fa', '\u17ff'), ('\u180e', '\u180f'), ('\u181a', + '\u181f'), ('\u1878', '\u187f'), ('\u18ab', '\u18af'), ('\u18f6', '\u18ff'), ('\u191f', + '\u191f'), ('\u192c', '\u192f'), ('\u193c', '\u193f'), ('\u1941', '\u1943'), ('\u196e', + '\u196f'), ('\u1975', '\u197f'), ('\u19ac', '\u19af'), ('\u19ca', '\u19cf'), ('\u19db', + '\u19dd'), ('\u1a1c', '\u1a1d'), ('\u1a5f', '\u1a5f'), ('\u1a7d', '\u1a7e'), ('\u1a8a', + '\u1a8f'), ('\u1a9a', '\u1a9f'), ('\u1aae', '\u1aaf'), ('\u1abf', '\u1aff'), ('\u1b4c', + '\u1b4f'), ('\u1b7d', '\u1b7f'), ('\u1bf4', '\u1bfb'), ('\u1c38', '\u1c3a'), ('\u1c4a', + '\u1c4c'), ('\u1c80', '\u1cbf'), ('\u1cc8', '\u1ccf'), ('\u1cf7', '\u1cf7'), ('\u1cfa', + '\u1cff'), ('\u1df6', '\u1dfb'), ('\u1f16', '\u1f17'), ('\u1f1e', '\u1f1f'), ('\u1f46', + '\u1f47'), ('\u1f4e', '\u1f4f'), ('\u1f58', '\u1f58'), ('\u1f5a', '\u1f5a'), ('\u1f5c', + '\u1f5c'), ('\u1f5e', '\u1f5e'), ('\u1f7e', '\u1f7f'), ('\u1fb5', '\u1fb5'), ('\u1fc5', + '\u1fc5'), ('\u1fd4', '\u1fd5'), ('\u1fdc', '\u1fdc'), ('\u1ff0', '\u1ff1'), ('\u1ff5', + '\u1ff5'), ('\u1fff', '\u1fff'), ('\u200b', '\u200f'), ('\u202a', '\u202e'), ('\u2060', + '\u206f'), ('\u2072', '\u2073'), ('\u208f', '\u208f'), 
('\u209d', '\u209f'), ('\u20be', + '\u20cf'), ('\u20f1', '\u20ff'), ('\u218a', '\u218f'), ('\u23fb', '\u23ff'), ('\u2427', + '\u243f'), ('\u244b', '\u245f'), ('\u2b74', '\u2b75'), ('\u2b96', '\u2b97'), ('\u2bba', + '\u2bbc'), ('\u2bc9', '\u2bc9'), ('\u2bd2', '\u2bff'), ('\u2c2f', '\u2c2f'), ('\u2c5f', + '\u2c5f'), ('\u2cf4', '\u2cf8'), ('\u2d26', '\u2d26'), ('\u2d28', '\u2d2c'), ('\u2d2e', + '\u2d2f'), ('\u2d68', '\u2d6e'), ('\u2d71', '\u2d7e'), ('\u2d97', '\u2d9f'), ('\u2da7', + '\u2da7'), ('\u2daf', '\u2daf'), ('\u2db7', '\u2db7'), ('\u2dbf', '\u2dbf'), ('\u2dc7', + '\u2dc7'), ('\u2dcf', '\u2dcf'), ('\u2dd7', '\u2dd7'), ('\u2ddf', '\u2ddf'), ('\u2e43', + '\u2e7f'), ('\u2e9a', '\u2e9a'), ('\u2ef4', '\u2eff'), ('\u2fd6', '\u2fef'), ('\u2ffc', + '\u2fff'), ('\u3040', '\u3040'), ('\u3097', '\u3098'), ('\u3100', '\u3104'), ('\u312e', + '\u3130'), ('\u318f', '\u318f'), ('\u31bb', '\u31bf'), ('\u31e4', '\u31ef'), ('\u321f', + '\u321f'), ('\u32ff', '\u32ff'), ('\u3401', '\u4db4'), ('\u4db6', '\u4dbf'), ('\u4e01', + '\u9fcb'), ('\u9fcd', '\u9fff'), ('\ua48d', '\ua48f'), ('\ua4c7', '\ua4cf'), ('\ua62c', + '\ua63f'), ('\ua69e', '\ua69e'), ('\ua6f8', '\ua6ff'), ('\ua78f', '\ua78f'), ('\ua7ae', + '\ua7af'), ('\ua7b2', '\ua7f6'), ('\ua82c', '\ua82f'), ('\ua83a', '\ua83f'), ('\ua878', + '\ua87f'), ('\ua8c5', '\ua8cd'), ('\ua8da', '\ua8df'), ('\ua8fc', '\ua8ff'), ('\ua954', + '\ua95e'), ('\ua97d', '\ua97f'), ('\ua9ce', '\ua9ce'), ('\ua9da', '\ua9dd'), ('\ua9ff', + '\ua9ff'), ('\uaa37', '\uaa3f'), ('\uaa4e', '\uaa4f'), ('\uaa5a', '\uaa5b'), ('\uaac3', + '\uaada'), ('\uaaf7', '\uab00'), ('\uab07', '\uab08'), ('\uab0f', '\uab10'), ('\uab17', + '\uab1f'), ('\uab27', '\uab27'), ('\uab2f', '\uab2f'), ('\uab60', '\uab63'), ('\uab66', + '\uabbf'), ('\uabee', '\uabef'), ('\uabfa', '\uabff'), ('\uac01', '\ud7a2'), ('\ud7a4', + '\ud7af'), ('\ud7c7', '\ud7ca'), ('\ud7fc', '\ud7ff'), ('\ue000', '\uf8ff'), ('\ufa6e', + '\ufa6f'), ('\ufada', '\ufaff'), ('\ufb07', '\ufb12'), ('\ufb18', 
'\ufb1c'), ('\ufb37', + '\ufb37'), ('\ufb3d', '\ufb3d'), ('\ufb3f', '\ufb3f'), ('\ufb42', '\ufb42'), ('\ufb45', + '\ufb45'), ('\ufbc2', '\ufbd2'), ('\ufd40', '\ufd4f'), ('\ufd90', '\ufd91'), ('\ufdc8', + '\ufdef'), ('\ufdfe', '\ufdff'), ('\ufe1a', '\ufe1f'), ('\ufe2e', '\ufe2f'), ('\ufe53', + '\ufe53'), ('\ufe67', '\ufe67'), ('\ufe6c', '\ufe6f'), ('\ufe75', '\ufe75'), ('\ufefd', + '\uff00'), ('\uffbf', '\uffc1'), ('\uffc8', '\uffc9'), ('\uffd0', '\uffd1'), ('\uffd8', + '\uffd9'), ('\uffdd', '\uffdf'), ('\uffe7', '\uffe7'), ('\uffef', '\ufffb'), ('\ufffe', + '\uffff'), ('\U0001000c', '\U0001000c'), ('\U00010027', '\U00010027'), ('\U0001003b', + '\U0001003b'), ('\U0001003e', '\U0001003e'), ('\U0001004e', '\U0001004f'), ('\U0001005e', + '\U0001007f'), ('\U000100fb', '\U000100ff'), ('\U00010103', '\U00010106'), ('\U00010134', + '\U00010136'), ('\U0001018d', '\U0001018f'), ('\U0001019c', '\U0001019f'), ('\U000101a1', + '\U000101cf'), ('\U000101fe', '\U0001027f'), ('\U0001029d', '\U0001029f'), ('\U000102d1', + '\U000102df'), ('\U000102fc', '\U000102ff'), ('\U00010324', '\U0001032f'), ('\U0001034b', + '\U0001034f'), ('\U0001037b', '\U0001037f'), ('\U0001039e', '\U0001039e'), ('\U000103c4', + '\U000103c7'), ('\U000103d6', '\U000103ff'), ('\U0001049e', '\U0001049f'), ('\U000104aa', + '\U000104ff'), ('\U00010528', '\U0001052f'), ('\U00010564', '\U0001056e'), ('\U00010570', + '\U000105ff'), ('\U00010737', '\U0001073f'), ('\U00010756', '\U0001075f'), ('\U00010768', + '\U000107ff'), ('\U00010806', '\U00010807'), ('\U00010809', '\U00010809'), ('\U00010836', + '\U00010836'), ('\U00010839', '\U0001083b'), ('\U0001083d', '\U0001083e'), ('\U00010856', + '\U00010856'), ('\U0001089f', '\U000108a6'), ('\U000108b0', '\U000108ff'), ('\U0001091c', + '\U0001091e'), ('\U0001093a', '\U0001093e'), ('\U00010940', '\U0001097f'), ('\U000109b8', + '\U000109bd'), ('\U000109c0', '\U000109ff'), ('\U00010a04', '\U00010a04'), ('\U00010a07', + '\U00010a0b'), ('\U00010a14', '\U00010a14'), ('\U00010a18', 
'\U00010a18'), ('\U00010a34', + '\U00010a37'), ('\U00010a3b', '\U00010a3e'), ('\U00010a48', '\U00010a4f'), ('\U00010a59', + '\U00010a5f'), ('\U00010aa0', '\U00010abf'), ('\U00010ae7', '\U00010aea'), ('\U00010af7', + '\U00010aff'), ('\U00010b36', '\U00010b38'), ('\U00010b56', '\U00010b57'), ('\U00010b73', + '\U00010b77'), ('\U00010b92', '\U00010b98'), ('\U00010b9d', '\U00010ba8'), ('\U00010bb0', + '\U00010bff'), ('\U00010c49', '\U00010e5f'), ('\U00010e7f', '\U00010fff'), ('\U0001104e', + '\U00011051'), ('\U00011070', '\U0001107e'), ('\U000110bd', '\U000110bd'), ('\U000110c2', + '\U000110cf'), ('\U000110e9', '\U000110ef'), ('\U000110fa', '\U000110ff'), ('\U00011135', + '\U00011135'), ('\U00011144', '\U0001114f'), ('\U00011177', '\U0001117f'), ('\U000111c9', + '\U000111cc'), ('\U000111ce', '\U000111cf'), ('\U000111db', '\U000111e0'), ('\U000111f5', + '\U000111ff'), ('\U00011212', '\U00011212'), ('\U0001123e', '\U000112af'), ('\U000112eb', + '\U000112ef'), ('\U000112fa', '\U00011300'), ('\U00011304', '\U00011304'), ('\U0001130d', + '\U0001130e'), ('\U00011311', '\U00011312'), ('\U00011329', '\U00011329'), ('\U00011331', + '\U00011331'), ('\U00011334', '\U00011334'), ('\U0001133a', '\U0001133b'), ('\U00011345', + '\U00011346'), ('\U00011349', '\U0001134a'), ('\U0001134e', '\U00011356'), ('\U00011358', + '\U0001135c'), ('\U00011364', '\U00011365'), ('\U0001136d', '\U0001136f'), ('\U00011375', + '\U0001147f'), ('\U000114c8', '\U000114cf'), ('\U000114da', '\U0001157f'), ('\U000115b6', + '\U000115b7'), ('\U000115ca', '\U000115ff'), ('\U00011645', '\U0001164f'), ('\U0001165a', + '\U0001167f'), ('\U000116b8', '\U000116bf'), ('\U000116ca', '\U0001189f'), ('\U000118f3', + '\U000118fe'), ('\U00011900', '\U00011abf'), ('\U00011af9', '\U00011fff'), ('\U00012399', + '\U000123ff'), ('\U0001246f', '\U0001246f'), ('\U00012475', '\U00012fff'), ('\U0001342f', + '\U000167ff'), ('\U00016a39', '\U00016a3f'), ('\U00016a5f', '\U00016a5f'), ('\U00016a6a', + '\U00016a6d'), ('\U00016a70', 
'\U00016acf'), ('\U00016aee', '\U00016aef'), ('\U00016af6', + '\U00016aff'), ('\U00016b46', '\U00016b4f'), ('\U00016b5a', '\U00016b5a'), ('\U00016b62', + '\U00016b62'), ('\U00016b78', '\U00016b7c'), ('\U00016b90', '\U00016eff'), ('\U00016f45', + '\U00016f4f'), ('\U00016f7f', '\U00016f8e'), ('\U00016fa0', '\U0001afff'), ('\U0001b002', + '\U0001bbff'), ('\U0001bc6b', '\U0001bc6f'), ('\U0001bc7d', '\U0001bc7f'), ('\U0001bc89', + '\U0001bc8f'), ('\U0001bc9a', '\U0001bc9b'), ('\U0001bca0', '\U0001cfff'), ('\U0001d0f6', + '\U0001d0ff'), ('\U0001d127', '\U0001d128'), ('\U0001d173', '\U0001d17a'), ('\U0001d1de', + '\U0001d1ff'), ('\U0001d246', '\U0001d2ff'), ('\U0001d357', '\U0001d35f'), ('\U0001d372', + '\U0001d3ff'), ('\U0001d455', '\U0001d455'), ('\U0001d49d', '\U0001d49d'), ('\U0001d4a0', + '\U0001d4a1'), ('\U0001d4a3', '\U0001d4a4'), ('\U0001d4a7', '\U0001d4a8'), ('\U0001d4ad', + '\U0001d4ad'), ('\U0001d4ba', '\U0001d4ba'), ('\U0001d4bc', '\U0001d4bc'), ('\U0001d4c4', + '\U0001d4c4'), ('\U0001d506', '\U0001d506'), ('\U0001d50b', '\U0001d50c'), ('\U0001d515', + '\U0001d515'), ('\U0001d51d', '\U0001d51d'), ('\U0001d53a', '\U0001d53a'), ('\U0001d53f', + '\U0001d53f'), ('\U0001d545', '\U0001d545'), ('\U0001d547', '\U0001d549'), ('\U0001d551', + '\U0001d551'), ('\U0001d6a6', '\U0001d6a7'), ('\U0001d7cc', '\U0001d7cd'), ('\U0001d800', + '\U0001e7ff'), ('\U0001e8c5', '\U0001e8c6'), ('\U0001e8d7', '\U0001edff'), ('\U0001ee04', + '\U0001ee04'), ('\U0001ee20', '\U0001ee20'), ('\U0001ee23', '\U0001ee23'), ('\U0001ee25', + '\U0001ee26'), ('\U0001ee28', '\U0001ee28'), ('\U0001ee33', '\U0001ee33'), ('\U0001ee38', + '\U0001ee38'), ('\U0001ee3a', '\U0001ee3a'), ('\U0001ee3c', '\U0001ee41'), ('\U0001ee43', + '\U0001ee46'), ('\U0001ee48', '\U0001ee48'), ('\U0001ee4a', '\U0001ee4a'), ('\U0001ee4c', + '\U0001ee4c'), ('\U0001ee50', '\U0001ee50'), ('\U0001ee53', '\U0001ee53'), ('\U0001ee55', + '\U0001ee56'), ('\U0001ee58', '\U0001ee58'), ('\U0001ee5a', '\U0001ee5a'), ('\U0001ee5c', + 
'\U0001ee5c'), ('\U0001ee5e', '\U0001ee5e'), ('\U0001ee60', '\U0001ee60'), ('\U0001ee63', + '\U0001ee63'), ('\U0001ee65', '\U0001ee66'), ('\U0001ee6b', '\U0001ee6b'), ('\U0001ee73', + '\U0001ee73'), ('\U0001ee78', '\U0001ee78'), ('\U0001ee7d', '\U0001ee7d'), ('\U0001ee7f', + '\U0001ee7f'), ('\U0001ee8a', '\U0001ee8a'), ('\U0001ee9c', '\U0001eea0'), ('\U0001eea4', + '\U0001eea4'), ('\U0001eeaa', '\U0001eeaa'), ('\U0001eebc', '\U0001eeef'), ('\U0001eef2', + '\U0001efff'), ('\U0001f02c', '\U0001f02f'), ('\U0001f094', '\U0001f09f'), ('\U0001f0af', + '\U0001f0b0'), ('\U0001f0c0', '\U0001f0c0'), ('\U0001f0d0', '\U0001f0d0'), ('\U0001f0f6', + '\U0001f0ff'), ('\U0001f10d', '\U0001f10f'), ('\U0001f12f', '\U0001f12f'), ('\U0001f16c', + '\U0001f16f'), ('\U0001f19b', '\U0001f1e5'), ('\U0001f203', '\U0001f20f'), ('\U0001f23b', + '\U0001f23f'), ('\U0001f249', '\U0001f24f'), ('\U0001f252', '\U0001f2ff'), ('\U0001f32d', + '\U0001f32f'), ('\U0001f37e', '\U0001f37f'), ('\U0001f3cf', '\U0001f3d3'), ('\U0001f3f8', + '\U0001f3ff'), ('\U0001f4ff', '\U0001f4ff'), ('\U0001f54b', '\U0001f54f'), ('\U0001f57a', + '\U0001f57a'), ('\U0001f5a4', '\U0001f5a4'), ('\U0001f643', '\U0001f644'), ('\U0001f6d0', + '\U0001f6df'), ('\U0001f6ed', '\U0001f6ef'), ('\U0001f6f4', '\U0001f6ff'), ('\U0001f774', + '\U0001f77f'), ('\U0001f7d5', '\U0001f7ff'), ('\U0001f80c', '\U0001f80f'), ('\U0001f848', + '\U0001f84f'), ('\U0001f85a', '\U0001f85f'), ('\U0001f888', '\U0001f88f'), ('\U0001f8ae', + '\U0001ffff'), ('\U00020001', '\U0002a6d5'), ('\U0002a6d7', '\U0002a6ff'), ('\U0002a701', + '\U0002b733'), ('\U0002b735', '\U0002b73f'), ('\U0002b741', '\U0002b81c'), ('\U0002b81e', + '\U0002f7ff'), ('\U0002fa1e', '\U000e00ff'), ('\U000e01f0', '\U0010ffff') ]; pub static Cc_table: &'static [(char, char)] = &[ @@ -50,6 +215,181 @@ pub mod general_category { ('\U000e0001', '\U000e0001'), ('\U000e0020', '\U000e007f') ]; + pub static Cn_table: &'static [(char, char)] = &[ + ('\u0378', '\u0379'), ('\u0380', '\u0383'), 
('\u038b', '\u038b'), ('\u038d', '\u038d'), + ('\u03a2', '\u03a2'), ('\u0530', '\u0530'), ('\u0557', '\u0558'), ('\u0560', '\u0560'), + ('\u0588', '\u0588'), ('\u058b', '\u058c'), ('\u0590', '\u0590'), ('\u05c8', '\u05cf'), + ('\u05eb', '\u05ef'), ('\u05f5', '\u05ff'), ('\u061d', '\u061d'), ('\u070e', '\u070e'), + ('\u074b', '\u074c'), ('\u07b2', '\u07bf'), ('\u07fb', '\u07ff'), ('\u082e', '\u082f'), + ('\u083f', '\u083f'), ('\u085c', '\u085d'), ('\u085f', '\u089f'), ('\u08b3', '\u08e3'), + ('\u0984', '\u0984'), ('\u098d', '\u098e'), ('\u0991', '\u0992'), ('\u09a9', '\u09a9'), + ('\u09b1', '\u09b1'), ('\u09b3', '\u09b5'), ('\u09ba', '\u09bb'), ('\u09c5', '\u09c6'), + ('\u09c9', '\u09ca'), ('\u09cf', '\u09d6'), ('\u09d8', '\u09db'), ('\u09de', '\u09de'), + ('\u09e4', '\u09e5'), ('\u09fc', '\u0a00'), ('\u0a04', '\u0a04'), ('\u0a0b', '\u0a0e'), + ('\u0a11', '\u0a12'), ('\u0a29', '\u0a29'), ('\u0a31', '\u0a31'), ('\u0a34', '\u0a34'), + ('\u0a37', '\u0a37'), ('\u0a3a', '\u0a3b'), ('\u0a3d', '\u0a3d'), ('\u0a43', '\u0a46'), + ('\u0a49', '\u0a4a'), ('\u0a4e', '\u0a50'), ('\u0a52', '\u0a58'), ('\u0a5d', '\u0a5d'), + ('\u0a5f', '\u0a65'), ('\u0a76', '\u0a80'), ('\u0a84', '\u0a84'), ('\u0a8e', '\u0a8e'), + ('\u0a92', '\u0a92'), ('\u0aa9', '\u0aa9'), ('\u0ab1', '\u0ab1'), ('\u0ab4', '\u0ab4'), + ('\u0aba', '\u0abb'), ('\u0ac6', '\u0ac6'), ('\u0aca', '\u0aca'), ('\u0ace', '\u0acf'), + ('\u0ad1', '\u0adf'), ('\u0ae4', '\u0ae5'), ('\u0af2', '\u0b00'), ('\u0b04', '\u0b04'), + ('\u0b0d', '\u0b0e'), ('\u0b11', '\u0b12'), ('\u0b29', '\u0b29'), ('\u0b31', '\u0b31'), + ('\u0b34', '\u0b34'), ('\u0b3a', '\u0b3b'), ('\u0b45', '\u0b46'), ('\u0b49', '\u0b4a'), + ('\u0b4e', '\u0b55'), ('\u0b58', '\u0b5b'), ('\u0b5e', '\u0b5e'), ('\u0b64', '\u0b65'), + ('\u0b78', '\u0b81'), ('\u0b84', '\u0b84'), ('\u0b8b', '\u0b8d'), ('\u0b91', '\u0b91'), + ('\u0b96', '\u0b98'), ('\u0b9b', '\u0b9b'), ('\u0b9d', '\u0b9d'), ('\u0ba0', '\u0ba2'), + ('\u0ba5', '\u0ba7'), ('\u0bab', '\u0bad'), ('\u0bba', 
'\u0bbd'), ('\u0bc3', '\u0bc5'), + ('\u0bc9', '\u0bc9'), ('\u0bce', '\u0bcf'), ('\u0bd1', '\u0bd6'), ('\u0bd8', '\u0be5'), + ('\u0bfb', '\u0bff'), ('\u0c04', '\u0c04'), ('\u0c0d', '\u0c0d'), ('\u0c11', '\u0c11'), + ('\u0c29', '\u0c29'), ('\u0c3a', '\u0c3c'), ('\u0c45', '\u0c45'), ('\u0c49', '\u0c49'), + ('\u0c4e', '\u0c54'), ('\u0c57', '\u0c57'), ('\u0c5a', '\u0c5f'), ('\u0c64', '\u0c65'), + ('\u0c70', '\u0c77'), ('\u0c80', '\u0c80'), ('\u0c84', '\u0c84'), ('\u0c8d', '\u0c8d'), + ('\u0c91', '\u0c91'), ('\u0ca9', '\u0ca9'), ('\u0cb4', '\u0cb4'), ('\u0cba', '\u0cbb'), + ('\u0cc5', '\u0cc5'), ('\u0cc9', '\u0cc9'), ('\u0cce', '\u0cd4'), ('\u0cd7', '\u0cdd'), + ('\u0cdf', '\u0cdf'), ('\u0ce4', '\u0ce5'), ('\u0cf0', '\u0cf0'), ('\u0cf3', '\u0d00'), + ('\u0d04', '\u0d04'), ('\u0d0d', '\u0d0d'), ('\u0d11', '\u0d11'), ('\u0d3b', '\u0d3c'), + ('\u0d45', '\u0d45'), ('\u0d49', '\u0d49'), ('\u0d4f', '\u0d56'), ('\u0d58', '\u0d5f'), + ('\u0d64', '\u0d65'), ('\u0d76', '\u0d78'), ('\u0d80', '\u0d81'), ('\u0d84', '\u0d84'), + ('\u0d97', '\u0d99'), ('\u0db2', '\u0db2'), ('\u0dbc', '\u0dbc'), ('\u0dbe', '\u0dbf'), + ('\u0dc7', '\u0dc9'), ('\u0dcb', '\u0dce'), ('\u0dd5', '\u0dd5'), ('\u0dd7', '\u0dd7'), + ('\u0de0', '\u0de5'), ('\u0df0', '\u0df1'), ('\u0df5', '\u0e00'), ('\u0e3b', '\u0e3e'), + ('\u0e5c', '\u0e80'), ('\u0e83', '\u0e83'), ('\u0e85', '\u0e86'), ('\u0e89', '\u0e89'), + ('\u0e8b', '\u0e8c'), ('\u0e8e', '\u0e93'), ('\u0e98', '\u0e98'), ('\u0ea0', '\u0ea0'), + ('\u0ea4', '\u0ea4'), ('\u0ea6', '\u0ea6'), ('\u0ea8', '\u0ea9'), ('\u0eac', '\u0eac'), + ('\u0eba', '\u0eba'), ('\u0ebe', '\u0ebf'), ('\u0ec5', '\u0ec5'), ('\u0ec7', '\u0ec7'), + ('\u0ece', '\u0ecf'), ('\u0eda', '\u0edb'), ('\u0ee0', '\u0eff'), ('\u0f48', '\u0f48'), + ('\u0f6d', '\u0f70'), ('\u0f98', '\u0f98'), ('\u0fbd', '\u0fbd'), ('\u0fcd', '\u0fcd'), + ('\u0fdb', '\u0fff'), ('\u10c6', '\u10c6'), ('\u10c8', '\u10cc'), ('\u10ce', '\u10cf'), + ('\u1249', '\u1249'), ('\u124e', '\u124f'), ('\u1257', '\u1257'), 
('\u1259', '\u1259'), + ('\u125e', '\u125f'), ('\u1289', '\u1289'), ('\u128e', '\u128f'), ('\u12b1', '\u12b1'), + ('\u12b6', '\u12b7'), ('\u12bf', '\u12bf'), ('\u12c1', '\u12c1'), ('\u12c6', '\u12c7'), + ('\u12d7', '\u12d7'), ('\u1311', '\u1311'), ('\u1316', '\u1317'), ('\u135b', '\u135c'), + ('\u137d', '\u137f'), ('\u139a', '\u139f'), ('\u13f5', '\u13ff'), ('\u169d', '\u169f'), + ('\u16f9', '\u16ff'), ('\u170d', '\u170d'), ('\u1715', '\u171f'), ('\u1737', '\u173f'), + ('\u1754', '\u175f'), ('\u176d', '\u176d'), ('\u1771', '\u1771'), ('\u1774', '\u177f'), + ('\u17de', '\u17df'), ('\u17ea', '\u17ef'), ('\u17fa', '\u17ff'), ('\u180f', '\u180f'), + ('\u181a', '\u181f'), ('\u1878', '\u187f'), ('\u18ab', '\u18af'), ('\u18f6', '\u18ff'), + ('\u191f', '\u191f'), ('\u192c', '\u192f'), ('\u193c', '\u193f'), ('\u1941', '\u1943'), + ('\u196e', '\u196f'), ('\u1975', '\u197f'), ('\u19ac', '\u19af'), ('\u19ca', '\u19cf'), + ('\u19db', '\u19dd'), ('\u1a1c', '\u1a1d'), ('\u1a5f', '\u1a5f'), ('\u1a7d', '\u1a7e'), + ('\u1a8a', '\u1a8f'), ('\u1a9a', '\u1a9f'), ('\u1aae', '\u1aaf'), ('\u1abf', '\u1aff'), + ('\u1b4c', '\u1b4f'), ('\u1b7d', '\u1b7f'), ('\u1bf4', '\u1bfb'), ('\u1c38', '\u1c3a'), + ('\u1c4a', '\u1c4c'), ('\u1c80', '\u1cbf'), ('\u1cc8', '\u1ccf'), ('\u1cf7', '\u1cf7'), + ('\u1cfa', '\u1cff'), ('\u1df6', '\u1dfb'), ('\u1f16', '\u1f17'), ('\u1f1e', '\u1f1f'), + ('\u1f46', '\u1f47'), ('\u1f4e', '\u1f4f'), ('\u1f58', '\u1f58'), ('\u1f5a', '\u1f5a'), + ('\u1f5c', '\u1f5c'), ('\u1f5e', '\u1f5e'), ('\u1f7e', '\u1f7f'), ('\u1fb5', '\u1fb5'), + ('\u1fc5', '\u1fc5'), ('\u1fd4', '\u1fd5'), ('\u1fdc', '\u1fdc'), ('\u1ff0', '\u1ff1'), + ('\u1ff5', '\u1ff5'), ('\u1fff', '\u1fff'), ('\u2065', '\u2065'), ('\u2072', '\u2073'), + ('\u208f', '\u208f'), ('\u209d', '\u209f'), ('\u20be', '\u20cf'), ('\u20f1', '\u20ff'), + ('\u218a', '\u218f'), ('\u23fb', '\u23ff'), ('\u2427', '\u243f'), ('\u244b', '\u245f'), + ('\u2b74', '\u2b75'), ('\u2b96', '\u2b97'), ('\u2bba', '\u2bbc'), ('\u2bc9', 
'\u2bc9'), + ('\u2bd2', '\u2bff'), ('\u2c2f', '\u2c2f'), ('\u2c5f', '\u2c5f'), ('\u2cf4', '\u2cf8'), + ('\u2d26', '\u2d26'), ('\u2d28', '\u2d2c'), ('\u2d2e', '\u2d2f'), ('\u2d68', '\u2d6e'), + ('\u2d71', '\u2d7e'), ('\u2d97', '\u2d9f'), ('\u2da7', '\u2da7'), ('\u2daf', '\u2daf'), + ('\u2db7', '\u2db7'), ('\u2dbf', '\u2dbf'), ('\u2dc7', '\u2dc7'), ('\u2dcf', '\u2dcf'), + ('\u2dd7', '\u2dd7'), ('\u2ddf', '\u2ddf'), ('\u2e43', '\u2e7f'), ('\u2e9a', '\u2e9a'), + ('\u2ef4', '\u2eff'), ('\u2fd6', '\u2fef'), ('\u2ffc', '\u2fff'), ('\u3040', '\u3040'), + ('\u3097', '\u3098'), ('\u3100', '\u3104'), ('\u312e', '\u3130'), ('\u318f', '\u318f'), + ('\u31bb', '\u31bf'), ('\u31e4', '\u31ef'), ('\u321f', '\u321f'), ('\u32ff', '\u32ff'), + ('\u3401', '\u4db4'), ('\u4db6', '\u4dbf'), ('\u4e01', '\u9fcb'), ('\u9fcd', '\u9fff'), + ('\ua48d', '\ua48f'), ('\ua4c7', '\ua4cf'), ('\ua62c', '\ua63f'), ('\ua69e', '\ua69e'), + ('\ua6f8', '\ua6ff'), ('\ua78f', '\ua78f'), ('\ua7ae', '\ua7af'), ('\ua7b2', '\ua7f6'), + ('\ua82c', '\ua82f'), ('\ua83a', '\ua83f'), ('\ua878', '\ua87f'), ('\ua8c5', '\ua8cd'), + ('\ua8da', '\ua8df'), ('\ua8fc', '\ua8ff'), ('\ua954', '\ua95e'), ('\ua97d', '\ua97f'), + ('\ua9ce', '\ua9ce'), ('\ua9da', '\ua9dd'), ('\ua9ff', '\ua9ff'), ('\uaa37', '\uaa3f'), + ('\uaa4e', '\uaa4f'), ('\uaa5a', '\uaa5b'), ('\uaac3', '\uaada'), ('\uaaf7', '\uab00'), + ('\uab07', '\uab08'), ('\uab0f', '\uab10'), ('\uab17', '\uab1f'), ('\uab27', '\uab27'), + ('\uab2f', '\uab2f'), ('\uab60', '\uab63'), ('\uab66', '\uabbf'), ('\uabee', '\uabef'), + ('\uabfa', '\uabff'), ('\uac01', '\ud7a2'), ('\ud7a4', '\ud7af'), ('\ud7c7', '\ud7ca'), + ('\ud7fc', '\ud7ff'), ('\ue001', '\uf8fe'), ('\ufa6e', '\ufa6f'), ('\ufada', '\ufaff'), + ('\ufb07', '\ufb12'), ('\ufb18', '\ufb1c'), ('\ufb37', '\ufb37'), ('\ufb3d', '\ufb3d'), + ('\ufb3f', '\ufb3f'), ('\ufb42', '\ufb42'), ('\ufb45', '\ufb45'), ('\ufbc2', '\ufbd2'), + ('\ufd40', '\ufd4f'), ('\ufd90', '\ufd91'), ('\ufdc8', '\ufdef'), ('\ufdfe', '\ufdff'), + 
('\ufe1a', '\ufe1f'), ('\ufe2e', '\ufe2f'), ('\ufe53', '\ufe53'), ('\ufe67', '\ufe67'), + ('\ufe6c', '\ufe6f'), ('\ufe75', '\ufe75'), ('\ufefd', '\ufefe'), ('\uff00', '\uff00'), + ('\uffbf', '\uffc1'), ('\uffc8', '\uffc9'), ('\uffd0', '\uffd1'), ('\uffd8', '\uffd9'), + ('\uffdd', '\uffdf'), ('\uffe7', '\uffe7'), ('\uffef', '\ufff8'), ('\ufffe', '\uffff'), + ('\U0001000c', '\U0001000c'), ('\U00010027', '\U00010027'), ('\U0001003b', '\U0001003b'), + ('\U0001003e', '\U0001003e'), ('\U0001004e', '\U0001004f'), ('\U0001005e', '\U0001007f'), + ('\U000100fb', '\U000100ff'), ('\U00010103', '\U00010106'), ('\U00010134', '\U00010136'), + ('\U0001018d', '\U0001018f'), ('\U0001019c', '\U0001019f'), ('\U000101a1', '\U000101cf'), + ('\U000101fe', '\U0001027f'), ('\U0001029d', '\U0001029f'), ('\U000102d1', '\U000102df'), + ('\U000102fc', '\U000102ff'), ('\U00010324', '\U0001032f'), ('\U0001034b', '\U0001034f'), + ('\U0001037b', '\U0001037f'), ('\U0001039e', '\U0001039e'), ('\U000103c4', '\U000103c7'), + ('\U000103d6', '\U000103ff'), ('\U0001049e', '\U0001049f'), ('\U000104aa', '\U000104ff'), + ('\U00010528', '\U0001052f'), ('\U00010564', '\U0001056e'), ('\U00010570', '\U000105ff'), + ('\U00010737', '\U0001073f'), ('\U00010756', '\U0001075f'), ('\U00010768', '\U000107ff'), + ('\U00010806', '\U00010807'), ('\U00010809', '\U00010809'), ('\U00010836', '\U00010836'), + ('\U00010839', '\U0001083b'), ('\U0001083d', '\U0001083e'), ('\U00010856', '\U00010856'), + ('\U0001089f', '\U000108a6'), ('\U000108b0', '\U000108ff'), ('\U0001091c', '\U0001091e'), + ('\U0001093a', '\U0001093e'), ('\U00010940', '\U0001097f'), ('\U000109b8', '\U000109bd'), + ('\U000109c0', '\U000109ff'), ('\U00010a04', '\U00010a04'), ('\U00010a07', '\U00010a0b'), + ('\U00010a14', '\U00010a14'), ('\U00010a18', '\U00010a18'), ('\U00010a34', '\U00010a37'), + ('\U00010a3b', '\U00010a3e'), ('\U00010a48', '\U00010a4f'), ('\U00010a59', '\U00010a5f'), + ('\U00010aa0', '\U00010abf'), ('\U00010ae7', '\U00010aea'), ('\U00010af7', 
'\U00010aff'), + ('\U00010b36', '\U00010b38'), ('\U00010b56', '\U00010b57'), ('\U00010b73', '\U00010b77'), + ('\U00010b92', '\U00010b98'), ('\U00010b9d', '\U00010ba8'), ('\U00010bb0', '\U00010bff'), + ('\U00010c49', '\U00010e5f'), ('\U00010e7f', '\U00010fff'), ('\U0001104e', '\U00011051'), + ('\U00011070', '\U0001107e'), ('\U000110c2', '\U000110cf'), ('\U000110e9', '\U000110ef'), + ('\U000110fa', '\U000110ff'), ('\U00011135', '\U00011135'), ('\U00011144', '\U0001114f'), + ('\U00011177', '\U0001117f'), ('\U000111c9', '\U000111cc'), ('\U000111ce', '\U000111cf'), + ('\U000111db', '\U000111e0'), ('\U000111f5', '\U000111ff'), ('\U00011212', '\U00011212'), + ('\U0001123e', '\U000112af'), ('\U000112eb', '\U000112ef'), ('\U000112fa', '\U00011300'), + ('\U00011304', '\U00011304'), ('\U0001130d', '\U0001130e'), ('\U00011311', '\U00011312'), + ('\U00011329', '\U00011329'), ('\U00011331', '\U00011331'), ('\U00011334', '\U00011334'), + ('\U0001133a', '\U0001133b'), ('\U00011345', '\U00011346'), ('\U00011349', '\U0001134a'), + ('\U0001134e', '\U00011356'), ('\U00011358', '\U0001135c'), ('\U00011364', '\U00011365'), + ('\U0001136d', '\U0001136f'), ('\U00011375', '\U0001147f'), ('\U000114c8', '\U000114cf'), + ('\U000114da', '\U0001157f'), ('\U000115b6', '\U000115b7'), ('\U000115ca', '\U000115ff'), + ('\U00011645', '\U0001164f'), ('\U0001165a', '\U0001167f'), ('\U000116b8', '\U000116bf'), + ('\U000116ca', '\U0001189f'), ('\U000118f3', '\U000118fe'), ('\U00011900', '\U00011abf'), + ('\U00011af9', '\U00011fff'), ('\U00012399', '\U000123ff'), ('\U0001246f', '\U0001246f'), + ('\U00012475', '\U00012fff'), ('\U0001342f', '\U000167ff'), ('\U00016a39', '\U00016a3f'), + ('\U00016a5f', '\U00016a5f'), ('\U00016a6a', '\U00016a6d'), ('\U00016a70', '\U00016acf'), + ('\U00016aee', '\U00016aef'), ('\U00016af6', '\U00016aff'), ('\U00016b46', '\U00016b4f'), + ('\U00016b5a', '\U00016b5a'), ('\U00016b62', '\U00016b62'), ('\U00016b78', '\U00016b7c'), + ('\U00016b90', '\U00016eff'), ('\U00016f45', 
'\U00016f4f'), ('\U00016f7f', '\U00016f8e'), + ('\U00016fa0', '\U0001afff'), ('\U0001b002', '\U0001bbff'), ('\U0001bc6b', '\U0001bc6f'), + ('\U0001bc7d', '\U0001bc7f'), ('\U0001bc89', '\U0001bc8f'), ('\U0001bc9a', '\U0001bc9b'), + ('\U0001bca4', '\U0001cfff'), ('\U0001d0f6', '\U0001d0ff'), ('\U0001d127', '\U0001d128'), + ('\U0001d1de', '\U0001d1ff'), ('\U0001d246', '\U0001d2ff'), ('\U0001d357', '\U0001d35f'), + ('\U0001d372', '\U0001d3ff'), ('\U0001d455', '\U0001d455'), ('\U0001d49d', '\U0001d49d'), + ('\U0001d4a0', '\U0001d4a1'), ('\U0001d4a3', '\U0001d4a4'), ('\U0001d4a7', '\U0001d4a8'), + ('\U0001d4ad', '\U0001d4ad'), ('\U0001d4ba', '\U0001d4ba'), ('\U0001d4bc', '\U0001d4bc'), + ('\U0001d4c4', '\U0001d4c4'), ('\U0001d506', '\U0001d506'), ('\U0001d50b', '\U0001d50c'), + ('\U0001d515', '\U0001d515'), ('\U0001d51d', '\U0001d51d'), ('\U0001d53a', '\U0001d53a'), + ('\U0001d53f', '\U0001d53f'), ('\U0001d545', '\U0001d545'), ('\U0001d547', '\U0001d549'), + ('\U0001d551', '\U0001d551'), ('\U0001d6a6', '\U0001d6a7'), ('\U0001d7cc', '\U0001d7cd'), + ('\U0001d800', '\U0001e7ff'), ('\U0001e8c5', '\U0001e8c6'), ('\U0001e8d7', '\U0001edff'), + ('\U0001ee04', '\U0001ee04'), ('\U0001ee20', '\U0001ee20'), ('\U0001ee23', '\U0001ee23'), + ('\U0001ee25', '\U0001ee26'), ('\U0001ee28', '\U0001ee28'), ('\U0001ee33', '\U0001ee33'), + ('\U0001ee38', '\U0001ee38'), ('\U0001ee3a', '\U0001ee3a'), ('\U0001ee3c', '\U0001ee41'), + ('\U0001ee43', '\U0001ee46'), ('\U0001ee48', '\U0001ee48'), ('\U0001ee4a', '\U0001ee4a'), + ('\U0001ee4c', '\U0001ee4c'), ('\U0001ee50', '\U0001ee50'), ('\U0001ee53', '\U0001ee53'), + ('\U0001ee55', '\U0001ee56'), ('\U0001ee58', '\U0001ee58'), ('\U0001ee5a', '\U0001ee5a'), + ('\U0001ee5c', '\U0001ee5c'), ('\U0001ee5e', '\U0001ee5e'), ('\U0001ee60', '\U0001ee60'), + ('\U0001ee63', '\U0001ee63'), ('\U0001ee65', '\U0001ee66'), ('\U0001ee6b', '\U0001ee6b'), + ('\U0001ee73', '\U0001ee73'), ('\U0001ee78', '\U0001ee78'), ('\U0001ee7d', '\U0001ee7d'), + ('\U0001ee7f', 
'\U0001ee7f'), ('\U0001ee8a', '\U0001ee8a'), ('\U0001ee9c', '\U0001eea0'), + ('\U0001eea4', '\U0001eea4'), ('\U0001eeaa', '\U0001eeaa'), ('\U0001eebc', '\U0001eeef'), + ('\U0001eef2', '\U0001efff'), ('\U0001f02c', '\U0001f02f'), ('\U0001f094', '\U0001f09f'), + ('\U0001f0af', '\U0001f0b0'), ('\U0001f0c0', '\U0001f0c0'), ('\U0001f0d0', '\U0001f0d0'), + ('\U0001f0f6', '\U0001f0ff'), ('\U0001f10d', '\U0001f10f'), ('\U0001f12f', '\U0001f12f'), + ('\U0001f16c', '\U0001f16f'), ('\U0001f19b', '\U0001f1e5'), ('\U0001f203', '\U0001f20f'), + ('\U0001f23b', '\U0001f23f'), ('\U0001f249', '\U0001f24f'), ('\U0001f252', '\U0001f2ff'), + ('\U0001f32d', '\U0001f32f'), ('\U0001f37e', '\U0001f37f'), ('\U0001f3cf', '\U0001f3d3'), + ('\U0001f3f8', '\U0001f3ff'), ('\U0001f4ff', '\U0001f4ff'), ('\U0001f54b', '\U0001f54f'), + ('\U0001f57a', '\U0001f57a'), ('\U0001f5a4', '\U0001f5a4'), ('\U0001f643', '\U0001f644'), + ('\U0001f6d0', '\U0001f6df'), ('\U0001f6ed', '\U0001f6ef'), ('\U0001f6f4', '\U0001f6ff'), + ('\U0001f774', '\U0001f77f'), ('\U0001f7d5', '\U0001f7ff'), ('\U0001f80c', '\U0001f80f'), + ('\U0001f848', '\U0001f84f'), ('\U0001f85a', '\U0001f85f'), ('\U0001f888', '\U0001f88f'), + ('\U0001f8ae', '\U0001ffff'), ('\U00020001', '\U0002a6d5'), ('\U0002a6d7', '\U0002a6ff'), + ('\U0002a701', '\U0002b733'), ('\U0002b735', '\U0002b73f'), ('\U0002b741', '\U0002b81c'), + ('\U0002b81e', '\U0002f7ff'), ('\U0002fa1e', '\U000e0000'), ('\U000e0002', '\U000e001f'), + ('\U000e0080', '\U000e00ff'), ('\U000e01f0', '\U000effff'), ('\U000f0001', '\U000ffffc'), + ('\U000ffffe', '\U000fffff'), ('\U00100001', '\U0010fffc'), ('\U0010fffe', '\U0010ffff') + ]; + pub static Co_table: &'static [(char, char)] = &[ ('\ue000', '\ue000'), ('\uf8ff', '\uf8ff'), ('\U000f0000', '\U000f0000'), ('\U000ffffd', '\U000ffffd'), ('\U00100000', '\U00100000'), ('\U0010fffd', '\U0010fffd') @@ -1540,6 +1880,17 @@ pub mod derived_property { super::bsearch_range_table(c, Alphabetic_table) } + pub static 
Default_Ignorable_Code_Point_table: &'static [(char, char)] = &[ + ('\xad', '\xad'), ('\u034f', '\u034f'), ('\u061c', '\u061c'), ('\u115f', '\u1160'), + ('\u17b4', '\u17b5'), ('\u180b', '\u180d'), ('\u180e', '\u180e'), ('\u200b', '\u200f'), + ('\u202a', '\u202e'), ('\u2060', '\u2064'), ('\u2065', '\u2065'), ('\u2066', '\u206f'), + ('\u3164', '\u3164'), ('\ufe00', '\ufe0f'), ('\ufeff', '\ufeff'), ('\uffa0', '\uffa0'), + ('\ufff0', '\ufff8'), ('\U0001bca0', '\U0001bca3'), ('\U0001d173', '\U0001d17a'), + ('\U000e0000', '\U000e0000'), ('\U000e0001', '\U000e0001'), ('\U000e0002', '\U000e001f'), + ('\U000e0020', '\U000e007f'), ('\U000e0080', '\U000e00ff'), ('\U000e0100', '\U000e01ef'), + ('\U000e01f0', '\U000e0fff') + ]; + pub static Lowercase_table: &'static [(char, char)] = &[ ('\x61', '\x7a'), ('\xaa', '\xaa'), ('\xb5', '\xb5'), ('\xba', '\xba'), ('\xdf', '\xf6'), ('\xf8', '\xff'), ('\u0101', '\u0101'), ('\u0103', '\u0103'), ('\u0105', '\u0105'), @@ -3264,6 +3615,15 @@ pub mod property { ('\u200c', '\u200d') ]; + pub static Noncharacter_Code_Point_table: &'static [(char, char)] = &[ + ('\ufdd0', '\ufdef'), ('\ufffe', '\uffff'), ('\U0001fffe', '\U0001ffff'), ('\U0002fffe', + '\U0002ffff'), ('\U0003fffe', '\U0003ffff'), ('\U0004fffe', '\U0004ffff'), ('\U0005fffe', + '\U0005ffff'), ('\U0006fffe', '\U0006ffff'), ('\U0007fffe', '\U0007ffff'), ('\U0008fffe', + '\U0008ffff'), ('\U0009fffe', '\U0009ffff'), ('\U000afffe', '\U000affff'), ('\U000bfffe', + '\U000bffff'), ('\U000cfffe', '\U000cffff'), ('\U000dfffe', '\U000dffff'), ('\U000efffe', + '\U000effff'), ('\U000ffffe', '\U000fffff') + ]; + pub static White_Space_table: &'static [(char, char)] = &[ ('\x09', '\x0d'), ('\x20', '\x20'), ('\x85', '\x85'), ('\xa0', '\xa0'), ('\u1680', '\u1680'), ('\u2000', '\u200a'), ('\u2028', '\u2028'), ('\u2029', '\u2029'), ('\u202f', @@ -3290,10 +3650,12 @@ pub mod regex { super::script::Carian_table), ("Caucasian_Albanian", super::script::Caucasian_Albanian_table), ("Cc", 
super::general_category::Cc_table), ("Cf", super::general_category::Cf_table), ("Chakma", super::script::Chakma_table), ("Cham", - super::script::Cham_table), ("Cherokee", super::script::Cherokee_table), ("Co", - super::general_category::Co_table), ("Common", super::script::Common_table), ("Coptic", - super::script::Coptic_table), ("Cuneiform", super::script::Cuneiform_table), ("Cypriot", - super::script::Cypriot_table), ("Cyrillic", super::script::Cyrillic_table), ("Deseret", + super::script::Cham_table), ("Cherokee", super::script::Cherokee_table), ("Cn", + super::general_category::Cn_table), ("Co", super::general_category::Co_table), ("Common", + super::script::Common_table), ("Coptic", super::script::Coptic_table), ("Cuneiform", + super::script::Cuneiform_table), ("Cypriot", super::script::Cypriot_table), ("Cyrillic", + super::script::Cyrillic_table), ("Default_Ignorable_Code_Point", + super::derived_property::Default_Ignorable_Code_Point_table), ("Deseret", super::script::Deseret_table), ("Devanagari", super::script::Devanagari_table), ("Duployan", super::script::Duployan_table), ("Egyptian_Hieroglyphs", super::script::Egyptian_Hieroglyphs_table), ("Elbasan", super::script::Elbasan_table), @@ -3333,7 +3695,8 @@ pub mod regex { super::script::Myanmar_table), ("N", super::general_category::N_table), ("Nabataean", super::script::Nabataean_table), ("Nd", super::general_category::Nd_table), ("New_Tai_Lue", super::script::New_Tai_Lue_table), ("Nko", super::script::Nko_table), ("Nl", - super::general_category::Nl_table), ("No", super::general_category::No_table), ("Ogham", + super::general_category::Nl_table), ("No", super::general_category::No_table), + ("Noncharacter_Code_Point", super::property::Noncharacter_Code_Point_table), ("Ogham", super::script::Ogham_table), ("Ol_Chiki", super::script::Ol_Chiki_table), ("Old_Italic", super::script::Old_Italic_table), ("Old_North_Arabian", super::script::Old_North_Arabian_table), ("Old_Permic", 
super::script::Old_Permic_table), @@ -6443,3 +6806,501 @@ pub mod charwidth { ]; } + +pub mod grapheme { + use core::option::{Some, None}; + use core::slice::ImmutableVector; + + #[allow(non_camel_case_types)] + #[deriving(Clone)] + pub enum GraphemeCat { + GC_LV, + GC_LVT, + GC_T, + GC_Extend, + GC_V, + GC_Control, + GC_SpacingMark, + GC_L, + GC_RegionalIndicator, + GC_Any, + } + + fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat { + use core::cmp::{Equal, Less, Greater}; + match r.bsearch(|&(lo, hi, _)| { + if lo <= c && c <= hi { Equal } + else if hi < c { Less } + else { Greater } + }) { + Some(idx) => { + let (_, _, cat) = r[idx]; + cat + } + None => GC_Any + } + } + + pub fn grapheme_category(c: char) -> GraphemeCat { + bsearch_range_value_table(c, grapheme_cat_table) + } + + static grapheme_cat_table: &'static [(char, char, GraphemeCat)] = &[ + ('\x00', '\x1f', GC_Control), ('\x7f', '\x9f', GC_Control), ('\xad', '\xad', GC_Control), + ('\u0300', '\u036f', GC_Extend), ('\u0483', '\u0487', GC_Extend), ('\u0488', '\u0489', + GC_Extend), ('\u0591', '\u05bd', GC_Extend), ('\u05bf', '\u05bf', GC_Extend), ('\u05c1', + '\u05c2', GC_Extend), ('\u05c4', '\u05c5', GC_Extend), ('\u05c7', '\u05c7', GC_Extend), + ('\u0600', '\u0605', GC_Control), ('\u0610', '\u061a', GC_Extend), ('\u061c', '\u061c', + GC_Control), ('\u064b', '\u065f', GC_Extend), ('\u0670', '\u0670', GC_Extend), ('\u06d6', + '\u06dc', GC_Extend), ('\u06dd', '\u06dd', GC_Control), ('\u06df', '\u06e4', GC_Extend), + ('\u06e7', '\u06e8', GC_Extend), ('\u06ea', '\u06ed', GC_Extend), ('\u070f', '\u070f', + GC_Control), ('\u0711', '\u0711', GC_Extend), ('\u0730', '\u074a', GC_Extend), ('\u07a6', + '\u07b0', GC_Extend), ('\u07eb', '\u07f3', GC_Extend), ('\u0816', '\u0819', GC_Extend), + ('\u081b', '\u0823', GC_Extend), ('\u0825', '\u0827', GC_Extend), ('\u0829', '\u082d', + GC_Extend), ('\u0859', '\u085b', GC_Extend), ('\u08e4', '\u0902', GC_Extend), ('\u0903', + 
'\u0903', GC_SpacingMark), ('\u093a', '\u093a', GC_Extend), ('\u093b', '\u093b', + GC_SpacingMark), ('\u093c', '\u093c', GC_Extend), ('\u093e', '\u0940', GC_SpacingMark), + ('\u0941', '\u0948', GC_Extend), ('\u0949', '\u094c', GC_SpacingMark), ('\u094d', '\u094d', + GC_Extend), ('\u094e', '\u094f', GC_SpacingMark), ('\u0951', '\u0957', GC_Extend), + ('\u0962', '\u0963', GC_Extend), ('\u0981', '\u0981', GC_Extend), ('\u0982', '\u0983', + GC_SpacingMark), ('\u09bc', '\u09bc', GC_Extend), ('\u09be', '\u09be', GC_Extend), + ('\u09bf', '\u09c0', GC_SpacingMark), ('\u09c1', '\u09c4', GC_Extend), ('\u09c7', '\u09c8', + GC_SpacingMark), ('\u09cb', '\u09cc', GC_SpacingMark), ('\u09cd', '\u09cd', GC_Extend), + ('\u09d7', '\u09d7', GC_Extend), ('\u09e2', '\u09e3', GC_Extend), ('\u0a01', '\u0a02', + GC_Extend), ('\u0a03', '\u0a03', GC_SpacingMark), ('\u0a3c', '\u0a3c', GC_Extend), + ('\u0a3e', '\u0a40', GC_SpacingMark), ('\u0a41', '\u0a42', GC_Extend), ('\u0a47', '\u0a48', + GC_Extend), ('\u0a4b', '\u0a4d', GC_Extend), ('\u0a51', '\u0a51', GC_Extend), ('\u0a70', + '\u0a71', GC_Extend), ('\u0a75', '\u0a75', GC_Extend), ('\u0a81', '\u0a82', GC_Extend), + ('\u0a83', '\u0a83', GC_SpacingMark), ('\u0abc', '\u0abc', GC_Extend), ('\u0abe', '\u0ac0', + GC_SpacingMark), ('\u0ac1', '\u0ac5', GC_Extend), ('\u0ac7', '\u0ac8', GC_Extend), + ('\u0ac9', '\u0ac9', GC_SpacingMark), ('\u0acb', '\u0acc', GC_SpacingMark), ('\u0acd', + '\u0acd', GC_Extend), ('\u0ae2', '\u0ae3', GC_Extend), ('\u0b01', '\u0b01', GC_Extend), + ('\u0b02', '\u0b03', GC_SpacingMark), ('\u0b3c', '\u0b3c', GC_Extend), ('\u0b3e', '\u0b3e', + GC_Extend), ('\u0b3f', '\u0b3f', GC_Extend), ('\u0b40', '\u0b40', GC_SpacingMark), + ('\u0b41', '\u0b44', GC_Extend), ('\u0b47', '\u0b48', GC_SpacingMark), ('\u0b4b', '\u0b4c', + GC_SpacingMark), ('\u0b4d', '\u0b4d', GC_Extend), ('\u0b56', '\u0b56', GC_Extend), + ('\u0b57', '\u0b57', GC_Extend), ('\u0b62', '\u0b63', GC_Extend), ('\u0b82', '\u0b82', + GC_Extend), ('\u0bbe', '\u0bbe', 
GC_Extend), ('\u0bbf', '\u0bbf', GC_SpacingMark), + ('\u0bc0', '\u0bc0', GC_Extend), ('\u0bc1', '\u0bc2', GC_SpacingMark), ('\u0bc6', '\u0bc8', + GC_SpacingMark), ('\u0bca', '\u0bcc', GC_SpacingMark), ('\u0bcd', '\u0bcd', GC_Extend), + ('\u0bd7', '\u0bd7', GC_Extend), ('\u0c00', '\u0c00', GC_Extend), ('\u0c01', '\u0c03', + GC_SpacingMark), ('\u0c3e', '\u0c40', GC_Extend), ('\u0c41', '\u0c44', GC_SpacingMark), + ('\u0c46', '\u0c48', GC_Extend), ('\u0c4a', '\u0c4d', GC_Extend), ('\u0c55', '\u0c56', + GC_Extend), ('\u0c62', '\u0c63', GC_Extend), ('\u0c81', '\u0c81', GC_Extend), ('\u0c82', + '\u0c83', GC_SpacingMark), ('\u0cbc', '\u0cbc', GC_Extend), ('\u0cbe', '\u0cbe', + GC_SpacingMark), ('\u0cbf', '\u0cbf', GC_Extend), ('\u0cc0', '\u0cc1', GC_SpacingMark), + ('\u0cc2', '\u0cc2', GC_Extend), ('\u0cc3', '\u0cc4', GC_SpacingMark), ('\u0cc6', '\u0cc6', + GC_Extend), ('\u0cc7', '\u0cc8', GC_SpacingMark), ('\u0cca', '\u0ccb', GC_SpacingMark), + ('\u0ccc', '\u0ccd', GC_Extend), ('\u0cd5', '\u0cd6', GC_Extend), ('\u0ce2', '\u0ce3', + GC_Extend), ('\u0d01', '\u0d01', GC_Extend), ('\u0d02', '\u0d03', GC_SpacingMark), + ('\u0d3e', '\u0d3e', GC_Extend), ('\u0d3f', '\u0d40', GC_SpacingMark), ('\u0d41', '\u0d44', + GC_Extend), ('\u0d46', '\u0d48', GC_SpacingMark), ('\u0d4a', '\u0d4c', GC_SpacingMark), + ('\u0d4d', '\u0d4d', GC_Extend), ('\u0d57', '\u0d57', GC_Extend), ('\u0d62', '\u0d63', + GC_Extend), ('\u0d82', '\u0d83', GC_SpacingMark), ('\u0dca', '\u0dca', GC_Extend), + ('\u0dcf', '\u0dcf', GC_Extend), ('\u0dd0', '\u0dd1', GC_SpacingMark), ('\u0dd2', '\u0dd4', + GC_Extend), ('\u0dd6', '\u0dd6', GC_Extend), ('\u0dd8', '\u0dde', GC_SpacingMark), + ('\u0ddf', '\u0ddf', GC_Extend), ('\u0df2', '\u0df3', GC_SpacingMark), ('\u0e31', '\u0e31', + GC_Extend), ('\u0e33', '\u0e33', GC_SpacingMark), ('\u0e34', '\u0e3a', GC_Extend), + ('\u0e47', '\u0e4e', GC_Extend), ('\u0eb1', '\u0eb1', GC_Extend), ('\u0eb3', '\u0eb3', + GC_SpacingMark), ('\u0eb4', '\u0eb9', GC_Extend), ('\u0ebb', 
'\u0ebc', GC_Extend), + ('\u0ec8', '\u0ecd', GC_Extend), ('\u0f18', '\u0f19', GC_Extend), ('\u0f35', '\u0f35', + GC_Extend), ('\u0f37', '\u0f37', GC_Extend), ('\u0f39', '\u0f39', GC_Extend), ('\u0f3e', + '\u0f3f', GC_SpacingMark), ('\u0f71', '\u0f7e', GC_Extend), ('\u0f7f', '\u0f7f', + GC_SpacingMark), ('\u0f80', '\u0f84', GC_Extend), ('\u0f86', '\u0f87', GC_Extend), + ('\u0f8d', '\u0f97', GC_Extend), ('\u0f99', '\u0fbc', GC_Extend), ('\u0fc6', '\u0fc6', + GC_Extend), ('\u102b', '\u102c', GC_SpacingMark), ('\u102d', '\u1030', GC_Extend), + ('\u1031', '\u1031', GC_SpacingMark), ('\u1032', '\u1037', GC_Extend), ('\u1038', '\u1038', + GC_SpacingMark), ('\u1039', '\u103a', GC_Extend), ('\u103b', '\u103c', GC_SpacingMark), + ('\u103d', '\u103e', GC_Extend), ('\u1056', '\u1057', GC_SpacingMark), ('\u1058', '\u1059', + GC_Extend), ('\u105e', '\u1060', GC_Extend), ('\u1062', '\u1064', GC_SpacingMark), + ('\u1067', '\u106d', GC_SpacingMark), ('\u1071', '\u1074', GC_Extend), ('\u1082', '\u1082', + GC_Extend), ('\u1083', '\u1084', GC_SpacingMark), ('\u1085', '\u1086', GC_Extend), + ('\u1087', '\u108c', GC_SpacingMark), ('\u108d', '\u108d', GC_Extend), ('\u108f', '\u108f', + GC_SpacingMark), ('\u109a', '\u109c', GC_SpacingMark), ('\u109d', '\u109d', GC_Extend), + ('\u1100', '\u115f', GC_L), ('\u1160', '\u11a7', GC_V), ('\u11a8', '\u11ff', GC_T), + ('\u135d', '\u135f', GC_Extend), ('\u1712', '\u1714', GC_Extend), ('\u1732', '\u1734', + GC_Extend), ('\u1752', '\u1753', GC_Extend), ('\u1772', '\u1773', GC_Extend), ('\u17b4', + '\u17b5', GC_Extend), ('\u17b6', '\u17b6', GC_SpacingMark), ('\u17b7', '\u17bd', GC_Extend), + ('\u17be', '\u17c5', GC_SpacingMark), ('\u17c6', '\u17c6', GC_Extend), ('\u17c7', '\u17c8', + GC_SpacingMark), ('\u17c9', '\u17d3', GC_Extend), ('\u17dd', '\u17dd', GC_Extend), + ('\u180b', '\u180d', GC_Extend), ('\u180e', '\u180e', GC_Control), ('\u18a9', '\u18a9', + GC_Extend), ('\u1920', '\u1922', GC_Extend), ('\u1923', '\u1926', GC_SpacingMark), + ('\u1927', 
'\u1928', GC_Extend), ('\u1929', '\u192b', GC_SpacingMark), ('\u1930', '\u1931', + GC_SpacingMark), ('\u1932', '\u1932', GC_Extend), ('\u1933', '\u1938', GC_SpacingMark), + ('\u1939', '\u193b', GC_Extend), ('\u19b0', '\u19c0', GC_SpacingMark), ('\u19c8', '\u19c9', + GC_SpacingMark), ('\u1a17', '\u1a18', GC_Extend), ('\u1a19', '\u1a1a', GC_SpacingMark), + ('\u1a1b', '\u1a1b', GC_Extend), ('\u1a55', '\u1a55', GC_SpacingMark), ('\u1a56', '\u1a56', + GC_Extend), ('\u1a57', '\u1a57', GC_SpacingMark), ('\u1a58', '\u1a5e', GC_Extend), + ('\u1a60', '\u1a60', GC_Extend), ('\u1a61', '\u1a61', GC_SpacingMark), ('\u1a62', '\u1a62', + GC_Extend), ('\u1a63', '\u1a64', GC_SpacingMark), ('\u1a65', '\u1a6c', GC_Extend), + ('\u1a6d', '\u1a72', GC_SpacingMark), ('\u1a73', '\u1a7c', GC_Extend), ('\u1a7f', '\u1a7f', + GC_Extend), ('\u1ab0', '\u1abd', GC_Extend), ('\u1abe', '\u1abe', GC_Extend), ('\u1b00', + '\u1b03', GC_Extend), ('\u1b04', '\u1b04', GC_SpacingMark), ('\u1b34', '\u1b34', GC_Extend), + ('\u1b35', '\u1b35', GC_SpacingMark), ('\u1b36', '\u1b3a', GC_Extend), ('\u1b3b', '\u1b3b', + GC_SpacingMark), ('\u1b3c', '\u1b3c', GC_Extend), ('\u1b3d', '\u1b41', GC_SpacingMark), + ('\u1b42', '\u1b42', GC_Extend), ('\u1b43', '\u1b44', GC_SpacingMark), ('\u1b6b', '\u1b73', + GC_Extend), ('\u1b80', '\u1b81', GC_Extend), ('\u1b82', '\u1b82', GC_SpacingMark), + ('\u1ba1', '\u1ba1', GC_SpacingMark), ('\u1ba2', '\u1ba5', GC_Extend), ('\u1ba6', '\u1ba7', + GC_SpacingMark), ('\u1ba8', '\u1ba9', GC_Extend), ('\u1baa', '\u1baa', GC_SpacingMark), + ('\u1bab', '\u1bad', GC_Extend), ('\u1be6', '\u1be6', GC_Extend), ('\u1be7', '\u1be7', + GC_SpacingMark), ('\u1be8', '\u1be9', GC_Extend), ('\u1bea', '\u1bec', GC_SpacingMark), + ('\u1bed', '\u1bed', GC_Extend), ('\u1bee', '\u1bee', GC_SpacingMark), ('\u1bef', '\u1bf1', + GC_Extend), ('\u1bf2', '\u1bf3', GC_SpacingMark), ('\u1c24', '\u1c2b', GC_SpacingMark), + ('\u1c2c', '\u1c33', GC_Extend), ('\u1c34', '\u1c35', GC_SpacingMark), ('\u1c36', '\u1c37', + 
GC_Extend), ('\u1cd0', '\u1cd2', GC_Extend), ('\u1cd4', '\u1ce0', GC_Extend), ('\u1ce1', + '\u1ce1', GC_SpacingMark), ('\u1ce2', '\u1ce8', GC_Extend), ('\u1ced', '\u1ced', GC_Extend), + ('\u1cf2', '\u1cf3', GC_SpacingMark), ('\u1cf4', '\u1cf4', GC_Extend), ('\u1cf8', '\u1cf9', + GC_Extend), ('\u1dc0', '\u1df5', GC_Extend), ('\u1dfc', '\u1dff', GC_Extend), ('\u200b', + '\u200b', GC_Control), ('\u200c', '\u200d', GC_Extend), ('\u200e', '\u200f', GC_Control), + ('\u2028', '\u202e', GC_Control), ('\u2060', '\u206f', GC_Control), ('\u20d0', '\u20dc', + GC_Extend), ('\u20dd', '\u20e0', GC_Extend), ('\u20e1', '\u20e1', GC_Extend), ('\u20e2', + '\u20e4', GC_Extend), ('\u20e5', '\u20f0', GC_Extend), ('\u2cef', '\u2cf1', GC_Extend), + ('\u2d7f', '\u2d7f', GC_Extend), ('\u2de0', '\u2dff', GC_Extend), ('\u302a', '\u302d', + GC_Extend), ('\u302e', '\u302f', GC_Extend), ('\u3099', '\u309a', GC_Extend), ('\ua66f', + '\ua66f', GC_Extend), ('\ua670', '\ua672', GC_Extend), ('\ua674', '\ua67d', GC_Extend), + ('\ua69f', '\ua69f', GC_Extend), ('\ua6f0', '\ua6f1', GC_Extend), ('\ua802', '\ua802', + GC_Extend), ('\ua806', '\ua806', GC_Extend), ('\ua80b', '\ua80b', GC_Extend), ('\ua823', + '\ua824', GC_SpacingMark), ('\ua825', '\ua826', GC_Extend), ('\ua827', '\ua827', + GC_SpacingMark), ('\ua880', '\ua881', GC_SpacingMark), ('\ua8b4', '\ua8c3', GC_SpacingMark), + ('\ua8c4', '\ua8c4', GC_Extend), ('\ua8e0', '\ua8f1', GC_Extend), ('\ua926', '\ua92d', + GC_Extend), ('\ua947', '\ua951', GC_Extend), ('\ua952', '\ua953', GC_SpacingMark), + ('\ua960', '\ua97c', GC_L), ('\ua980', '\ua982', GC_Extend), ('\ua983', '\ua983', + GC_SpacingMark), ('\ua9b3', '\ua9b3', GC_Extend), ('\ua9b4', '\ua9b5', GC_SpacingMark), + ('\ua9b6', '\ua9b9', GC_Extend), ('\ua9ba', '\ua9bb', GC_SpacingMark), ('\ua9bc', '\ua9bc', + GC_Extend), ('\ua9bd', '\ua9c0', GC_SpacingMark), ('\ua9e5', '\ua9e5', GC_Extend), + ('\uaa29', '\uaa2e', GC_Extend), ('\uaa2f', '\uaa30', GC_SpacingMark), ('\uaa31', '\uaa32', + GC_Extend), 
('\uaa33', '\uaa34', GC_SpacingMark), ('\uaa35', '\uaa36', GC_Extend), + ('\uaa43', '\uaa43', GC_Extend), ('\uaa4c', '\uaa4c', GC_Extend), ('\uaa4d', '\uaa4d', + GC_SpacingMark), ('\uaa7b', '\uaa7b', GC_SpacingMark), ('\uaa7c', '\uaa7c', GC_Extend), + ('\uaa7d', '\uaa7d', GC_SpacingMark), ('\uaab0', '\uaab0', GC_Extend), ('\uaab2', '\uaab4', + GC_Extend), ('\uaab7', '\uaab8', GC_Extend), ('\uaabe', '\uaabf', GC_Extend), ('\uaac1', + '\uaac1', GC_Extend), ('\uaaeb', '\uaaeb', GC_SpacingMark), ('\uaaec', '\uaaed', GC_Extend), + ('\uaaee', '\uaaef', GC_SpacingMark), ('\uaaf5', '\uaaf5', GC_SpacingMark), ('\uaaf6', + '\uaaf6', GC_Extend), ('\uabe3', '\uabe4', GC_SpacingMark), ('\uabe5', '\uabe5', GC_Extend), + ('\uabe6', '\uabe7', GC_SpacingMark), ('\uabe8', '\uabe8', GC_Extend), ('\uabe9', '\uabea', + GC_SpacingMark), ('\uabec', '\uabec', GC_SpacingMark), ('\uabed', '\uabed', GC_Extend), + ('\uac00', '\uac00', GC_LV), ('\uac01', '\uac1b', GC_LVT), ('\uac1c', '\uac1c', GC_LV), + ('\uac1d', '\uac37', GC_LVT), ('\uac38', '\uac38', GC_LV), ('\uac39', '\uac53', GC_LVT), + ('\uac54', '\uac54', GC_LV), ('\uac55', '\uac6f', GC_LVT), ('\uac70', '\uac70', GC_LV), + ('\uac71', '\uac8b', GC_LVT), ('\uac8c', '\uac8c', GC_LV), ('\uac8d', '\uaca7', GC_LVT), + ('\uaca8', '\uaca8', GC_LV), ('\uaca9', '\uacc3', GC_LVT), ('\uacc4', '\uacc4', GC_LV), + ('\uacc5', '\uacdf', GC_LVT), ('\uace0', '\uace0', GC_LV), ('\uace1', '\uacfb', GC_LVT), + ('\uacfc', '\uacfc', GC_LV), ('\uacfd', '\uad17', GC_LVT), ('\uad18', '\uad18', GC_LV), + ('\uad19', '\uad33', GC_LVT), ('\uad34', '\uad34', GC_LV), ('\uad35', '\uad4f', GC_LVT), + ('\uad50', '\uad50', GC_LV), ('\uad51', '\uad6b', GC_LVT), ('\uad6c', '\uad6c', GC_LV), + ('\uad6d', '\uad87', GC_LVT), ('\uad88', '\uad88', GC_LV), ('\uad89', '\uada3', GC_LVT), + ('\uada4', '\uada4', GC_LV), ('\uada5', '\uadbf', GC_LVT), ('\uadc0', '\uadc0', GC_LV), + ('\uadc1', '\uaddb', GC_LVT), ('\uaddc', '\uaddc', GC_LV), ('\uaddd', '\uadf7', GC_LVT), + ('\uadf8', 
'\uadf8', GC_LV), ('\uadf9', '\uae13', GC_LVT), ('\uae14', '\uae14', GC_LV), + ('\uae15', '\uae2f', GC_LVT), ('\uae30', '\uae30', GC_LV), ('\uae31', '\uae4b', GC_LVT), + ('\uae4c', '\uae4c', GC_LV), ('\uae4d', '\uae67', GC_LVT), ('\uae68', '\uae68', GC_LV), + ('\uae69', '\uae83', GC_LVT), ('\uae84', '\uae84', GC_LV), ('\uae85', '\uae9f', GC_LVT), + ('\uaea0', '\uaea0', GC_LV), ('\uaea1', '\uaebb', GC_LVT), ('\uaebc', '\uaebc', GC_LV), + ('\uaebd', '\uaed7', GC_LVT), ('\uaed8', '\uaed8', GC_LV), ('\uaed9', '\uaef3', GC_LVT), + ('\uaef4', '\uaef4', GC_LV), ('\uaef5', '\uaf0f', GC_LVT), ('\uaf10', '\uaf10', GC_LV), + ('\uaf11', '\uaf2b', GC_LVT), ('\uaf2c', '\uaf2c', GC_LV), ('\uaf2d', '\uaf47', GC_LVT), + ('\uaf48', '\uaf48', GC_LV), ('\uaf49', '\uaf63', GC_LVT), ('\uaf64', '\uaf64', GC_LV), + ('\uaf65', '\uaf7f', GC_LVT), ('\uaf80', '\uaf80', GC_LV), ('\uaf81', '\uaf9b', GC_LVT), + ('\uaf9c', '\uaf9c', GC_LV), ('\uaf9d', '\uafb7', GC_LVT), ('\uafb8', '\uafb8', GC_LV), + ('\uafb9', '\uafd3', GC_LVT), ('\uafd4', '\uafd4', GC_LV), ('\uafd5', '\uafef', GC_LVT), + ('\uaff0', '\uaff0', GC_LV), ('\uaff1', '\ub00b', GC_LVT), ('\ub00c', '\ub00c', GC_LV), + ('\ub00d', '\ub027', GC_LVT), ('\ub028', '\ub028', GC_LV), ('\ub029', '\ub043', GC_LVT), + ('\ub044', '\ub044', GC_LV), ('\ub045', '\ub05f', GC_LVT), ('\ub060', '\ub060', GC_LV), + ('\ub061', '\ub07b', GC_LVT), ('\ub07c', '\ub07c', GC_LV), ('\ub07d', '\ub097', GC_LVT), + ('\ub098', '\ub098', GC_LV), ('\ub099', '\ub0b3', GC_LVT), ('\ub0b4', '\ub0b4', GC_LV), + ('\ub0b5', '\ub0cf', GC_LVT), ('\ub0d0', '\ub0d0', GC_LV), ('\ub0d1', '\ub0eb', GC_LVT), + ('\ub0ec', '\ub0ec', GC_LV), ('\ub0ed', '\ub107', GC_LVT), ('\ub108', '\ub108', GC_LV), + ('\ub109', '\ub123', GC_LVT), ('\ub124', '\ub124', GC_LV), ('\ub125', '\ub13f', GC_LVT), + ('\ub140', '\ub140', GC_LV), ('\ub141', '\ub15b', GC_LVT), ('\ub15c', '\ub15c', GC_LV), + ('\ub15d', '\ub177', GC_LVT), ('\ub178', '\ub178', GC_LV), ('\ub179', '\ub193', GC_LVT), + ('\ub194', 
'\ub194', GC_LV), ('\ub195', '\ub1af', GC_LVT), ('\ub1b0', '\ub1b0', GC_LV), + ('\ub1b1', '\ub1cb', GC_LVT), ('\ub1cc', '\ub1cc', GC_LV), ('\ub1cd', '\ub1e7', GC_LVT), + ('\ub1e8', '\ub1e8', GC_LV), ('\ub1e9', '\ub203', GC_LVT), ('\ub204', '\ub204', GC_LV), + ('\ub205', '\ub21f', GC_LVT), ('\ub220', '\ub220', GC_LV), ('\ub221', '\ub23b', GC_LVT), + ('\ub23c', '\ub23c', GC_LV), ('\ub23d', '\ub257', GC_LVT), ('\ub258', '\ub258', GC_LV), + ('\ub259', '\ub273', GC_LVT), ('\ub274', '\ub274', GC_LV), ('\ub275', '\ub28f', GC_LVT), + ('\ub290', '\ub290', GC_LV), ('\ub291', '\ub2ab', GC_LVT), ('\ub2ac', '\ub2ac', GC_LV), + ('\ub2ad', '\ub2c7', GC_LVT), ('\ub2c8', '\ub2c8', GC_LV), ('\ub2c9', '\ub2e3', GC_LVT), + ('\ub2e4', '\ub2e4', GC_LV), ('\ub2e5', '\ub2ff', GC_LVT), ('\ub300', '\ub300', GC_LV), + ('\ub301', '\ub31b', GC_LVT), ('\ub31c', '\ub31c', GC_LV), ('\ub31d', '\ub337', GC_LVT), + ('\ub338', '\ub338', GC_LV), ('\ub339', '\ub353', GC_LVT), ('\ub354', '\ub354', GC_LV), + ('\ub355', '\ub36f', GC_LVT), ('\ub370', '\ub370', GC_LV), ('\ub371', '\ub38b', GC_LVT), + ('\ub38c', '\ub38c', GC_LV), ('\ub38d', '\ub3a7', GC_LVT), ('\ub3a8', '\ub3a8', GC_LV), + ('\ub3a9', '\ub3c3', GC_LVT), ('\ub3c4', '\ub3c4', GC_LV), ('\ub3c5', '\ub3df', GC_LVT), + ('\ub3e0', '\ub3e0', GC_LV), ('\ub3e1', '\ub3fb', GC_LVT), ('\ub3fc', '\ub3fc', GC_LV), + ('\ub3fd', '\ub417', GC_LVT), ('\ub418', '\ub418', GC_LV), ('\ub419', '\ub433', GC_LVT), + ('\ub434', '\ub434', GC_LV), ('\ub435', '\ub44f', GC_LVT), ('\ub450', '\ub450', GC_LV), + ('\ub451', '\ub46b', GC_LVT), ('\ub46c', '\ub46c', GC_LV), ('\ub46d', '\ub487', GC_LVT), + ('\ub488', '\ub488', GC_LV), ('\ub489', '\ub4a3', GC_LVT), ('\ub4a4', '\ub4a4', GC_LV), + ('\ub4a5', '\ub4bf', GC_LVT), ('\ub4c0', '\ub4c0', GC_LV), ('\ub4c1', '\ub4db', GC_LVT), + ('\ub4dc', '\ub4dc', GC_LV), ('\ub4dd', '\ub4f7', GC_LVT), ('\ub4f8', '\ub4f8', GC_LV), + ('\ub4f9', '\ub513', GC_LVT), ('\ub514', '\ub514', GC_LV), ('\ub515', '\ub52f', GC_LVT), + ('\ub530', 
'\ub530', GC_LV), ('\ub531', '\ub54b', GC_LVT), ('\ub54c', '\ub54c', GC_LV), + ('\ub54d', '\ub567', GC_LVT), ('\ub568', '\ub568', GC_LV), ('\ub569', '\ub583', GC_LVT), + ('\ub584', '\ub584', GC_LV), ('\ub585', '\ub59f', GC_LVT), ('\ub5a0', '\ub5a0', GC_LV), + ('\ub5a1', '\ub5bb', GC_LVT), ('\ub5bc', '\ub5bc', GC_LV), ('\ub5bd', '\ub5d7', GC_LVT), + ('\ub5d8', '\ub5d8', GC_LV), ('\ub5d9', '\ub5f3', GC_LVT), ('\ub5f4', '\ub5f4', GC_LV), + ('\ub5f5', '\ub60f', GC_LVT), ('\ub610', '\ub610', GC_LV), ('\ub611', '\ub62b', GC_LVT), + ('\ub62c', '\ub62c', GC_LV), ('\ub62d', '\ub647', GC_LVT), ('\ub648', '\ub648', GC_LV), + ('\ub649', '\ub663', GC_LVT), ('\ub664', '\ub664', GC_LV), ('\ub665', '\ub67f', GC_LVT), + ('\ub680', '\ub680', GC_LV), ('\ub681', '\ub69b', GC_LVT), ('\ub69c', '\ub69c', GC_LV), + ('\ub69d', '\ub6b7', GC_LVT), ('\ub6b8', '\ub6b8', GC_LV), ('\ub6b9', '\ub6d3', GC_LVT), + ('\ub6d4', '\ub6d4', GC_LV), ('\ub6d5', '\ub6ef', GC_LVT), ('\ub6f0', '\ub6f0', GC_LV), + ('\ub6f1', '\ub70b', GC_LVT), ('\ub70c', '\ub70c', GC_LV), ('\ub70d', '\ub727', GC_LVT), + ('\ub728', '\ub728', GC_LV), ('\ub729', '\ub743', GC_LVT), ('\ub744', '\ub744', GC_LV), + ('\ub745', '\ub75f', GC_LVT), ('\ub760', '\ub760', GC_LV), ('\ub761', '\ub77b', GC_LVT), + ('\ub77c', '\ub77c', GC_LV), ('\ub77d', '\ub797', GC_LVT), ('\ub798', '\ub798', GC_LV), + ('\ub799', '\ub7b3', GC_LVT), ('\ub7b4', '\ub7b4', GC_LV), ('\ub7b5', '\ub7cf', GC_LVT), + ('\ub7d0', '\ub7d0', GC_LV), ('\ub7d1', '\ub7eb', GC_LVT), ('\ub7ec', '\ub7ec', GC_LV), + ('\ub7ed', '\ub807', GC_LVT), ('\ub808', '\ub808', GC_LV), ('\ub809', '\ub823', GC_LVT), + ('\ub824', '\ub824', GC_LV), ('\ub825', '\ub83f', GC_LVT), ('\ub840', '\ub840', GC_LV), + ('\ub841', '\ub85b', GC_LVT), ('\ub85c', '\ub85c', GC_LV), ('\ub85d', '\ub877', GC_LVT), + ('\ub878', '\ub878', GC_LV), ('\ub879', '\ub893', GC_LVT), ('\ub894', '\ub894', GC_LV), + ('\ub895', '\ub8af', GC_LVT), ('\ub8b0', '\ub8b0', GC_LV), ('\ub8b1', '\ub8cb', GC_LVT), + ('\ub8cc', 
'\ub8cc', GC_LV), ('\ub8cd', '\ub8e7', GC_LVT), ('\ub8e8', '\ub8e8', GC_LV), + ('\ub8e9', '\ub903', GC_LVT), ('\ub904', '\ub904', GC_LV), ('\ub905', '\ub91f', GC_LVT), + ('\ub920', '\ub920', GC_LV), ('\ub921', '\ub93b', GC_LVT), ('\ub93c', '\ub93c', GC_LV), + ('\ub93d', '\ub957', GC_LVT), ('\ub958', '\ub958', GC_LV), ('\ub959', '\ub973', GC_LVT), + ('\ub974', '\ub974', GC_LV), ('\ub975', '\ub98f', GC_LVT), ('\ub990', '\ub990', GC_LV), + ('\ub991', '\ub9ab', GC_LVT), ('\ub9ac', '\ub9ac', GC_LV), ('\ub9ad', '\ub9c7', GC_LVT), + ('\ub9c8', '\ub9c8', GC_LV), ('\ub9c9', '\ub9e3', GC_LVT), ('\ub9e4', '\ub9e4', GC_LV), + ('\ub9e5', '\ub9ff', GC_LVT), ('\uba00', '\uba00', GC_LV), ('\uba01', '\uba1b', GC_LVT), + ('\uba1c', '\uba1c', GC_LV), ('\uba1d', '\uba37', GC_LVT), ('\uba38', '\uba38', GC_LV), + ('\uba39', '\uba53', GC_LVT), ('\uba54', '\uba54', GC_LV), ('\uba55', '\uba6f', GC_LVT), + ('\uba70', '\uba70', GC_LV), ('\uba71', '\uba8b', GC_LVT), ('\uba8c', '\uba8c', GC_LV), + ('\uba8d', '\ubaa7', GC_LVT), ('\ubaa8', '\ubaa8', GC_LV), ('\ubaa9', '\ubac3', GC_LVT), + ('\ubac4', '\ubac4', GC_LV), ('\ubac5', '\ubadf', GC_LVT), ('\ubae0', '\ubae0', GC_LV), + ('\ubae1', '\ubafb', GC_LVT), ('\ubafc', '\ubafc', GC_LV), ('\ubafd', '\ubb17', GC_LVT), + ('\ubb18', '\ubb18', GC_LV), ('\ubb19', '\ubb33', GC_LVT), ('\ubb34', '\ubb34', GC_LV), + ('\ubb35', '\ubb4f', GC_LVT), ('\ubb50', '\ubb50', GC_LV), ('\ubb51', '\ubb6b', GC_LVT), + ('\ubb6c', '\ubb6c', GC_LV), ('\ubb6d', '\ubb87', GC_LVT), ('\ubb88', '\ubb88', GC_LV), + ('\ubb89', '\ubba3', GC_LVT), ('\ubba4', '\ubba4', GC_LV), ('\ubba5', '\ubbbf', GC_LVT), + ('\ubbc0', '\ubbc0', GC_LV), ('\ubbc1', '\ubbdb', GC_LVT), ('\ubbdc', '\ubbdc', GC_LV), + ('\ubbdd', '\ubbf7', GC_LVT), ('\ubbf8', '\ubbf8', GC_LV), ('\ubbf9', '\ubc13', GC_LVT), + ('\ubc14', '\ubc14', GC_LV), ('\ubc15', '\ubc2f', GC_LVT), ('\ubc30', '\ubc30', GC_LV), + ('\ubc31', '\ubc4b', GC_LVT), ('\ubc4c', '\ubc4c', GC_LV), ('\ubc4d', '\ubc67', GC_LVT), + ('\ubc68', 
'\ubc68', GC_LV), ('\ubc69', '\ubc83', GC_LVT), ('\ubc84', '\ubc84', GC_LV), + ('\ubc85', '\ubc9f', GC_LVT), ('\ubca0', '\ubca0', GC_LV), ('\ubca1', '\ubcbb', GC_LVT), + ('\ubcbc', '\ubcbc', GC_LV), ('\ubcbd', '\ubcd7', GC_LVT), ('\ubcd8', '\ubcd8', GC_LV), + ('\ubcd9', '\ubcf3', GC_LVT), ('\ubcf4', '\ubcf4', GC_LV), ('\ubcf5', '\ubd0f', GC_LVT), + ('\ubd10', '\ubd10', GC_LV), ('\ubd11', '\ubd2b', GC_LVT), ('\ubd2c', '\ubd2c', GC_LV), + ('\ubd2d', '\ubd47', GC_LVT), ('\ubd48', '\ubd48', GC_LV), ('\ubd49', '\ubd63', GC_LVT), + ('\ubd64', '\ubd64', GC_LV), ('\ubd65', '\ubd7f', GC_LVT), ('\ubd80', '\ubd80', GC_LV), + ('\ubd81', '\ubd9b', GC_LVT), ('\ubd9c', '\ubd9c', GC_LV), ('\ubd9d', '\ubdb7', GC_LVT), + ('\ubdb8', '\ubdb8', GC_LV), ('\ubdb9', '\ubdd3', GC_LVT), ('\ubdd4', '\ubdd4', GC_LV), + ('\ubdd5', '\ubdef', GC_LVT), ('\ubdf0', '\ubdf0', GC_LV), ('\ubdf1', '\ube0b', GC_LVT), + ('\ube0c', '\ube0c', GC_LV), ('\ube0d', '\ube27', GC_LVT), ('\ube28', '\ube28', GC_LV), + ('\ube29', '\ube43', GC_LVT), ('\ube44', '\ube44', GC_LV), ('\ube45', '\ube5f', GC_LVT), + ('\ube60', '\ube60', GC_LV), ('\ube61', '\ube7b', GC_LVT), ('\ube7c', '\ube7c', GC_LV), + ('\ube7d', '\ube97', GC_LVT), ('\ube98', '\ube98', GC_LV), ('\ube99', '\ubeb3', GC_LVT), + ('\ubeb4', '\ubeb4', GC_LV), ('\ubeb5', '\ubecf', GC_LVT), ('\ubed0', '\ubed0', GC_LV), + ('\ubed1', '\ubeeb', GC_LVT), ('\ubeec', '\ubeec', GC_LV), ('\ubeed', '\ubf07', GC_LVT), + ('\ubf08', '\ubf08', GC_LV), ('\ubf09', '\ubf23', GC_LVT), ('\ubf24', '\ubf24', GC_LV), + ('\ubf25', '\ubf3f', GC_LVT), ('\ubf40', '\ubf40', GC_LV), ('\ubf41', '\ubf5b', GC_LVT), + ('\ubf5c', '\ubf5c', GC_LV), ('\ubf5d', '\ubf77', GC_LVT), ('\ubf78', '\ubf78', GC_LV), + ('\ubf79', '\ubf93', GC_LVT), ('\ubf94', '\ubf94', GC_LV), ('\ubf95', '\ubfaf', GC_LVT), + ('\ubfb0', '\ubfb0', GC_LV), ('\ubfb1', '\ubfcb', GC_LVT), ('\ubfcc', '\ubfcc', GC_LV), + ('\ubfcd', '\ubfe7', GC_LVT), ('\ubfe8', '\ubfe8', GC_LV), ('\ubfe9', '\uc003', GC_LVT), + ('\uc004', 
'\uc004', GC_LV), ('\uc005', '\uc01f', GC_LVT), ('\uc020', '\uc020', GC_LV), + ('\uc021', '\uc03b', GC_LVT), ('\uc03c', '\uc03c', GC_LV), ('\uc03d', '\uc057', GC_LVT), + ('\uc058', '\uc058', GC_LV), ('\uc059', '\uc073', GC_LVT), ('\uc074', '\uc074', GC_LV), + ('\uc075', '\uc08f', GC_LVT), ('\uc090', '\uc090', GC_LV), ('\uc091', '\uc0ab', GC_LVT), + ('\uc0ac', '\uc0ac', GC_LV), ('\uc0ad', '\uc0c7', GC_LVT), ('\uc0c8', '\uc0c8', GC_LV), + ('\uc0c9', '\uc0e3', GC_LVT), ('\uc0e4', '\uc0e4', GC_LV), ('\uc0e5', '\uc0ff', GC_LVT), + ('\uc100', '\uc100', GC_LV), ('\uc101', '\uc11b', GC_LVT), ('\uc11c', '\uc11c', GC_LV), + ('\uc11d', '\uc137', GC_LVT), ('\uc138', '\uc138', GC_LV), ('\uc139', '\uc153', GC_LVT), + ('\uc154', '\uc154', GC_LV), ('\uc155', '\uc16f', GC_LVT), ('\uc170', '\uc170', GC_LV), + ('\uc171', '\uc18b', GC_LVT), ('\uc18c', '\uc18c', GC_LV), ('\uc18d', '\uc1a7', GC_LVT), + ('\uc1a8', '\uc1a8', GC_LV), ('\uc1a9', '\uc1c3', GC_LVT), ('\uc1c4', '\uc1c4', GC_LV), + ('\uc1c5', '\uc1df', GC_LVT), ('\uc1e0', '\uc1e0', GC_LV), ('\uc1e1', '\uc1fb', GC_LVT), + ('\uc1fc', '\uc1fc', GC_LV), ('\uc1fd', '\uc217', GC_LVT), ('\uc218', '\uc218', GC_LV), + ('\uc219', '\uc233', GC_LVT), ('\uc234', '\uc234', GC_LV), ('\uc235', '\uc24f', GC_LVT), + ('\uc250', '\uc250', GC_LV), ('\uc251', '\uc26b', GC_LVT), ('\uc26c', '\uc26c', GC_LV), + ('\uc26d', '\uc287', GC_LVT), ('\uc288', '\uc288', GC_LV), ('\uc289', '\uc2a3', GC_LVT), + ('\uc2a4', '\uc2a4', GC_LV), ('\uc2a5', '\uc2bf', GC_LVT), ('\uc2c0', '\uc2c0', GC_LV), + ('\uc2c1', '\uc2db', GC_LVT), ('\uc2dc', '\uc2dc', GC_LV), ('\uc2dd', '\uc2f7', GC_LVT), + ('\uc2f8', '\uc2f8', GC_LV), ('\uc2f9', '\uc313', GC_LVT), ('\uc314', '\uc314', GC_LV), + ('\uc315', '\uc32f', GC_LVT), ('\uc330', '\uc330', GC_LV), ('\uc331', '\uc34b', GC_LVT), + ('\uc34c', '\uc34c', GC_LV), ('\uc34d', '\uc367', GC_LVT), ('\uc368', '\uc368', GC_LV), + ('\uc369', '\uc383', GC_LVT), ('\uc384', '\uc384', GC_LV), ('\uc385', '\uc39f', GC_LVT), + ('\uc3a0', 
'\uc3a0', GC_LV), ('\uc3a1', '\uc3bb', GC_LVT), ('\uc3bc', '\uc3bc', GC_LV), + ('\uc3bd', '\uc3d7', GC_LVT), ('\uc3d8', '\uc3d8', GC_LV), ('\uc3d9', '\uc3f3', GC_LVT), + ('\uc3f4', '\uc3f4', GC_LV), ('\uc3f5', '\uc40f', GC_LVT), ('\uc410', '\uc410', GC_LV), + ('\uc411', '\uc42b', GC_LVT), ('\uc42c', '\uc42c', GC_LV), ('\uc42d', '\uc447', GC_LVT), + ('\uc448', '\uc448', GC_LV), ('\uc449', '\uc463', GC_LVT), ('\uc464', '\uc464', GC_LV), + ('\uc465', '\uc47f', GC_LVT), ('\uc480', '\uc480', GC_LV), ('\uc481', '\uc49b', GC_LVT), + ('\uc49c', '\uc49c', GC_LV), ('\uc49d', '\uc4b7', GC_LVT), ('\uc4b8', '\uc4b8', GC_LV), + ('\uc4b9', '\uc4d3', GC_LVT), ('\uc4d4', '\uc4d4', GC_LV), ('\uc4d5', '\uc4ef', GC_LVT), + ('\uc4f0', '\uc4f0', GC_LV), ('\uc4f1', '\uc50b', GC_LVT), ('\uc50c', '\uc50c', GC_LV), + ('\uc50d', '\uc527', GC_LVT), ('\uc528', '\uc528', GC_LV), ('\uc529', '\uc543', GC_LVT), + ('\uc544', '\uc544', GC_LV), ('\uc545', '\uc55f', GC_LVT), ('\uc560', '\uc560', GC_LV), + ('\uc561', '\uc57b', GC_LVT), ('\uc57c', '\uc57c', GC_LV), ('\uc57d', '\uc597', GC_LVT), + ('\uc598', '\uc598', GC_LV), ('\uc599', '\uc5b3', GC_LVT), ('\uc5b4', '\uc5b4', GC_LV), + ('\uc5b5', '\uc5cf', GC_LVT), ('\uc5d0', '\uc5d0', GC_LV), ('\uc5d1', '\uc5eb', GC_LVT), + ('\uc5ec', '\uc5ec', GC_LV), ('\uc5ed', '\uc607', GC_LVT), ('\uc608', '\uc608', GC_LV), + ('\uc609', '\uc623', GC_LVT), ('\uc624', '\uc624', GC_LV), ('\uc625', '\uc63f', GC_LVT), + ('\uc640', '\uc640', GC_LV), ('\uc641', '\uc65b', GC_LVT), ('\uc65c', '\uc65c', GC_LV), + ('\uc65d', '\uc677', GC_LVT), ('\uc678', '\uc678', GC_LV), ('\uc679', '\uc693', GC_LVT), + ('\uc694', '\uc694', GC_LV), ('\uc695', '\uc6af', GC_LVT), ('\uc6b0', '\uc6b0', GC_LV), + ('\uc6b1', '\uc6cb', GC_LVT), ('\uc6cc', '\uc6cc', GC_LV), ('\uc6cd', '\uc6e7', GC_LVT), + ('\uc6e8', '\uc6e8', GC_LV), ('\uc6e9', '\uc703', GC_LVT), ('\uc704', '\uc704', GC_LV), + ('\uc705', '\uc71f', GC_LVT), ('\uc720', '\uc720', GC_LV), ('\uc721', '\uc73b', GC_LVT), + ('\uc73c', 
'\uc73c', GC_LV), ('\uc73d', '\uc757', GC_LVT), ('\uc758', '\uc758', GC_LV), + ('\uc759', '\uc773', GC_LVT), ('\uc774', '\uc774', GC_LV), ('\uc775', '\uc78f', GC_LVT), + ('\uc790', '\uc790', GC_LV), ('\uc791', '\uc7ab', GC_LVT), ('\uc7ac', '\uc7ac', GC_LV), + ('\uc7ad', '\uc7c7', GC_LVT), ('\uc7c8', '\uc7c8', GC_LV), ('\uc7c9', '\uc7e3', GC_LVT), + ('\uc7e4', '\uc7e4', GC_LV), ('\uc7e5', '\uc7ff', GC_LVT), ('\uc800', '\uc800', GC_LV), + ('\uc801', '\uc81b', GC_LVT), ('\uc81c', '\uc81c', GC_LV), ('\uc81d', '\uc837', GC_LVT), + ('\uc838', '\uc838', GC_LV), ('\uc839', '\uc853', GC_LVT), ('\uc854', '\uc854', GC_LV), + ('\uc855', '\uc86f', GC_LVT), ('\uc870', '\uc870', GC_LV), ('\uc871', '\uc88b', GC_LVT), + ('\uc88c', '\uc88c', GC_LV), ('\uc88d', '\uc8a7', GC_LVT), ('\uc8a8', '\uc8a8', GC_LV), + ('\uc8a9', '\uc8c3', GC_LVT), ('\uc8c4', '\uc8c4', GC_LV), ('\uc8c5', '\uc8df', GC_LVT), + ('\uc8e0', '\uc8e0', GC_LV), ('\uc8e1', '\uc8fb', GC_LVT), ('\uc8fc', '\uc8fc', GC_LV), + ('\uc8fd', '\uc917', GC_LVT), ('\uc918', '\uc918', GC_LV), ('\uc919', '\uc933', GC_LVT), + ('\uc934', '\uc934', GC_LV), ('\uc935', '\uc94f', GC_LVT), ('\uc950', '\uc950', GC_LV), + ('\uc951', '\uc96b', GC_LVT), ('\uc96c', '\uc96c', GC_LV), ('\uc96d', '\uc987', GC_LVT), + ('\uc988', '\uc988', GC_LV), ('\uc989', '\uc9a3', GC_LVT), ('\uc9a4', '\uc9a4', GC_LV), + ('\uc9a5', '\uc9bf', GC_LVT), ('\uc9c0', '\uc9c0', GC_LV), ('\uc9c1', '\uc9db', GC_LVT), + ('\uc9dc', '\uc9dc', GC_LV), ('\uc9dd', '\uc9f7', GC_LVT), ('\uc9f8', '\uc9f8', GC_LV), + ('\uc9f9', '\uca13', GC_LVT), ('\uca14', '\uca14', GC_LV), ('\uca15', '\uca2f', GC_LVT), + ('\uca30', '\uca30', GC_LV), ('\uca31', '\uca4b', GC_LVT), ('\uca4c', '\uca4c', GC_LV), + ('\uca4d', '\uca67', GC_LVT), ('\uca68', '\uca68', GC_LV), ('\uca69', '\uca83', GC_LVT), + ('\uca84', '\uca84', GC_LV), ('\uca85', '\uca9f', GC_LVT), ('\ucaa0', '\ucaa0', GC_LV), + ('\ucaa1', '\ucabb', GC_LVT), ('\ucabc', '\ucabc', GC_LV), ('\ucabd', '\ucad7', GC_LVT), + ('\ucad8', 
'\ucad8', GC_LV), ('\ucad9', '\ucaf3', GC_LVT), ('\ucaf4', '\ucaf4', GC_LV), + ('\ucaf5', '\ucb0f', GC_LVT), ('\ucb10', '\ucb10', GC_LV), ('\ucb11', '\ucb2b', GC_LVT), + ('\ucb2c', '\ucb2c', GC_LV), ('\ucb2d', '\ucb47', GC_LVT), ('\ucb48', '\ucb48', GC_LV), + ('\ucb49', '\ucb63', GC_LVT), ('\ucb64', '\ucb64', GC_LV), ('\ucb65', '\ucb7f', GC_LVT), + ('\ucb80', '\ucb80', GC_LV), ('\ucb81', '\ucb9b', GC_LVT), ('\ucb9c', '\ucb9c', GC_LV), + ('\ucb9d', '\ucbb7', GC_LVT), ('\ucbb8', '\ucbb8', GC_LV), ('\ucbb9', '\ucbd3', GC_LVT), + ('\ucbd4', '\ucbd4', GC_LV), ('\ucbd5', '\ucbef', GC_LVT), ('\ucbf0', '\ucbf0', GC_LV), + ('\ucbf1', '\ucc0b', GC_LVT), ('\ucc0c', '\ucc0c', GC_LV), ('\ucc0d', '\ucc27', GC_LVT), + ('\ucc28', '\ucc28', GC_LV), ('\ucc29', '\ucc43', GC_LVT), ('\ucc44', '\ucc44', GC_LV), + ('\ucc45', '\ucc5f', GC_LVT), ('\ucc60', '\ucc60', GC_LV), ('\ucc61', '\ucc7b', GC_LVT), + ('\ucc7c', '\ucc7c', GC_LV), ('\ucc7d', '\ucc97', GC_LVT), ('\ucc98', '\ucc98', GC_LV), + ('\ucc99', '\uccb3', GC_LVT), ('\uccb4', '\uccb4', GC_LV), ('\uccb5', '\ucccf', GC_LVT), + ('\uccd0', '\uccd0', GC_LV), ('\uccd1', '\ucceb', GC_LVT), ('\uccec', '\uccec', GC_LV), + ('\ucced', '\ucd07', GC_LVT), ('\ucd08', '\ucd08', GC_LV), ('\ucd09', '\ucd23', GC_LVT), + ('\ucd24', '\ucd24', GC_LV), ('\ucd25', '\ucd3f', GC_LVT), ('\ucd40', '\ucd40', GC_LV), + ('\ucd41', '\ucd5b', GC_LVT), ('\ucd5c', '\ucd5c', GC_LV), ('\ucd5d', '\ucd77', GC_LVT), + ('\ucd78', '\ucd78', GC_LV), ('\ucd79', '\ucd93', GC_LVT), ('\ucd94', '\ucd94', GC_LV), + ('\ucd95', '\ucdaf', GC_LVT), ('\ucdb0', '\ucdb0', GC_LV), ('\ucdb1', '\ucdcb', GC_LVT), + ('\ucdcc', '\ucdcc', GC_LV), ('\ucdcd', '\ucde7', GC_LVT), ('\ucde8', '\ucde8', GC_LV), + ('\ucde9', '\uce03', GC_LVT), ('\uce04', '\uce04', GC_LV), ('\uce05', '\uce1f', GC_LVT), + ('\uce20', '\uce20', GC_LV), ('\uce21', '\uce3b', GC_LVT), ('\uce3c', '\uce3c', GC_LV), + ('\uce3d', '\uce57', GC_LVT), ('\uce58', '\uce58', GC_LV), ('\uce59', '\uce73', GC_LVT), + ('\uce74', 
'\uce74', GC_LV), ('\uce75', '\uce8f', GC_LVT), ('\uce90', '\uce90', GC_LV), + ('\uce91', '\uceab', GC_LVT), ('\uceac', '\uceac', GC_LV), ('\ucead', '\ucec7', GC_LVT), + ('\ucec8', '\ucec8', GC_LV), ('\ucec9', '\ucee3', GC_LVT), ('\ucee4', '\ucee4', GC_LV), + ('\ucee5', '\uceff', GC_LVT), ('\ucf00', '\ucf00', GC_LV), ('\ucf01', '\ucf1b', GC_LVT), + ('\ucf1c', '\ucf1c', GC_LV), ('\ucf1d', '\ucf37', GC_LVT), ('\ucf38', '\ucf38', GC_LV), + ('\ucf39', '\ucf53', GC_LVT), ('\ucf54', '\ucf54', GC_LV), ('\ucf55', '\ucf6f', GC_LVT), + ('\ucf70', '\ucf70', GC_LV), ('\ucf71', '\ucf8b', GC_LVT), ('\ucf8c', '\ucf8c', GC_LV), + ('\ucf8d', '\ucfa7', GC_LVT), ('\ucfa8', '\ucfa8', GC_LV), ('\ucfa9', '\ucfc3', GC_LVT), + ('\ucfc4', '\ucfc4', GC_LV), ('\ucfc5', '\ucfdf', GC_LVT), ('\ucfe0', '\ucfe0', GC_LV), + ('\ucfe1', '\ucffb', GC_LVT), ('\ucffc', '\ucffc', GC_LV), ('\ucffd', '\ud017', GC_LVT), + ('\ud018', '\ud018', GC_LV), ('\ud019', '\ud033', GC_LVT), ('\ud034', '\ud034', GC_LV), + ('\ud035', '\ud04f', GC_LVT), ('\ud050', '\ud050', GC_LV), ('\ud051', '\ud06b', GC_LVT), + ('\ud06c', '\ud06c', GC_LV), ('\ud06d', '\ud087', GC_LVT), ('\ud088', '\ud088', GC_LV), + ('\ud089', '\ud0a3', GC_LVT), ('\ud0a4', '\ud0a4', GC_LV), ('\ud0a5', '\ud0bf', GC_LVT), + ('\ud0c0', '\ud0c0', GC_LV), ('\ud0c1', '\ud0db', GC_LVT), ('\ud0dc', '\ud0dc', GC_LV), + ('\ud0dd', '\ud0f7', GC_LVT), ('\ud0f8', '\ud0f8', GC_LV), ('\ud0f9', '\ud113', GC_LVT), + ('\ud114', '\ud114', GC_LV), ('\ud115', '\ud12f', GC_LVT), ('\ud130', '\ud130', GC_LV), + ('\ud131', '\ud14b', GC_LVT), ('\ud14c', '\ud14c', GC_LV), ('\ud14d', '\ud167', GC_LVT), + ('\ud168', '\ud168', GC_LV), ('\ud169', '\ud183', GC_LVT), ('\ud184', '\ud184', GC_LV), + ('\ud185', '\ud19f', GC_LVT), ('\ud1a0', '\ud1a0', GC_LV), ('\ud1a1', '\ud1bb', GC_LVT), + ('\ud1bc', '\ud1bc', GC_LV), ('\ud1bd', '\ud1d7', GC_LVT), ('\ud1d8', '\ud1d8', GC_LV), + ('\ud1d9', '\ud1f3', GC_LVT), ('\ud1f4', '\ud1f4', GC_LV), ('\ud1f5', '\ud20f', GC_LVT), + ('\ud210', 
'\ud210', GC_LV), ('\ud211', '\ud22b', GC_LVT), ('\ud22c', '\ud22c', GC_LV), + ('\ud22d', '\ud247', GC_LVT), ('\ud248', '\ud248', GC_LV), ('\ud249', '\ud263', GC_LVT), + ('\ud264', '\ud264', GC_LV), ('\ud265', '\ud27f', GC_LVT), ('\ud280', '\ud280', GC_LV), + ('\ud281', '\ud29b', GC_LVT), ('\ud29c', '\ud29c', GC_LV), ('\ud29d', '\ud2b7', GC_LVT), + ('\ud2b8', '\ud2b8', GC_LV), ('\ud2b9', '\ud2d3', GC_LVT), ('\ud2d4', '\ud2d4', GC_LV), + ('\ud2d5', '\ud2ef', GC_LVT), ('\ud2f0', '\ud2f0', GC_LV), ('\ud2f1', '\ud30b', GC_LVT), + ('\ud30c', '\ud30c', GC_LV), ('\ud30d', '\ud327', GC_LVT), ('\ud328', '\ud328', GC_LV), + ('\ud329', '\ud343', GC_LVT), ('\ud344', '\ud344', GC_LV), ('\ud345', '\ud35f', GC_LVT), + ('\ud360', '\ud360', GC_LV), ('\ud361', '\ud37b', GC_LVT), ('\ud37c', '\ud37c', GC_LV), + ('\ud37d', '\ud397', GC_LVT), ('\ud398', '\ud398', GC_LV), ('\ud399', '\ud3b3', GC_LVT), + ('\ud3b4', '\ud3b4', GC_LV), ('\ud3b5', '\ud3cf', GC_LVT), ('\ud3d0', '\ud3d0', GC_LV), + ('\ud3d1', '\ud3eb', GC_LVT), ('\ud3ec', '\ud3ec', GC_LV), ('\ud3ed', '\ud407', GC_LVT), + ('\ud408', '\ud408', GC_LV), ('\ud409', '\ud423', GC_LVT), ('\ud424', '\ud424', GC_LV), + ('\ud425', '\ud43f', GC_LVT), ('\ud440', '\ud440', GC_LV), ('\ud441', '\ud45b', GC_LVT), + ('\ud45c', '\ud45c', GC_LV), ('\ud45d', '\ud477', GC_LVT), ('\ud478', '\ud478', GC_LV), + ('\ud479', '\ud493', GC_LVT), ('\ud494', '\ud494', GC_LV), ('\ud495', '\ud4af', GC_LVT), + ('\ud4b0', '\ud4b0', GC_LV), ('\ud4b1', '\ud4cb', GC_LVT), ('\ud4cc', '\ud4cc', GC_LV), + ('\ud4cd', '\ud4e7', GC_LVT), ('\ud4e8', '\ud4e8', GC_LV), ('\ud4e9', '\ud503', GC_LVT), + ('\ud504', '\ud504', GC_LV), ('\ud505', '\ud51f', GC_LVT), ('\ud520', '\ud520', GC_LV), + ('\ud521', '\ud53b', GC_LVT), ('\ud53c', '\ud53c', GC_LV), ('\ud53d', '\ud557', GC_LVT), + ('\ud558', '\ud558', GC_LV), ('\ud559', '\ud573', GC_LVT), ('\ud574', '\ud574', GC_LV), + ('\ud575', '\ud58f', GC_LVT), ('\ud590', '\ud590', GC_LV), ('\ud591', '\ud5ab', GC_LVT), + ('\ud5ac', 
'\ud5ac', GC_LV), ('\ud5ad', '\ud5c7', GC_LVT), ('\ud5c8', '\ud5c8', GC_LV), + ('\ud5c9', '\ud5e3', GC_LVT), ('\ud5e4', '\ud5e4', GC_LV), ('\ud5e5', '\ud5ff', GC_LVT), + ('\ud600', '\ud600', GC_LV), ('\ud601', '\ud61b', GC_LVT), ('\ud61c', '\ud61c', GC_LV), + ('\ud61d', '\ud637', GC_LVT), ('\ud638', '\ud638', GC_LV), ('\ud639', '\ud653', GC_LVT), + ('\ud654', '\ud654', GC_LV), ('\ud655', '\ud66f', GC_LVT), ('\ud670', '\ud670', GC_LV), + ('\ud671', '\ud68b', GC_LVT), ('\ud68c', '\ud68c', GC_LV), ('\ud68d', '\ud6a7', GC_LVT), + ('\ud6a8', '\ud6a8', GC_LV), ('\ud6a9', '\ud6c3', GC_LVT), ('\ud6c4', '\ud6c4', GC_LV), + ('\ud6c5', '\ud6df', GC_LVT), ('\ud6e0', '\ud6e0', GC_LV), ('\ud6e1', '\ud6fb', GC_LVT), + ('\ud6fc', '\ud6fc', GC_LV), ('\ud6fd', '\ud717', GC_LVT), ('\ud718', '\ud718', GC_LV), + ('\ud719', '\ud733', GC_LVT), ('\ud734', '\ud734', GC_LV), ('\ud735', '\ud74f', GC_LVT), + ('\ud750', '\ud750', GC_LV), ('\ud751', '\ud76b', GC_LVT), ('\ud76c', '\ud76c', GC_LV), + ('\ud76d', '\ud787', GC_LVT), ('\ud788', '\ud788', GC_LV), ('\ud789', '\ud7a3', GC_LVT), + ('\ud7b0', '\ud7c6', GC_V), ('\ud7cb', '\ud7fb', GC_T), ('\ufb1e', '\ufb1e', GC_Extend), + ('\ufe00', '\ufe0f', GC_Extend), ('\ufe20', '\ufe2d', GC_Extend), ('\ufeff', '\ufeff', + GC_Control), ('\uff9e', '\uff9f', GC_Extend), ('\ufff0', '\ufffb', GC_Control), + ('\U000101fd', '\U000101fd', GC_Extend), ('\U000102e0', '\U000102e0', GC_Extend), + ('\U00010376', '\U0001037a', GC_Extend), ('\U00010a01', '\U00010a03', GC_Extend), + ('\U00010a05', '\U00010a06', GC_Extend), ('\U00010a0c', '\U00010a0f', GC_Extend), + ('\U00010a38', '\U00010a3a', GC_Extend), ('\U00010a3f', '\U00010a3f', GC_Extend), + ('\U00010ae5', '\U00010ae6', GC_Extend), ('\U00011000', '\U00011000', GC_SpacingMark), + ('\U00011001', '\U00011001', GC_Extend), ('\U00011002', '\U00011002', GC_SpacingMark), + ('\U00011038', '\U00011046', GC_Extend), ('\U0001107f', '\U00011081', GC_Extend), + ('\U00011082', '\U00011082', GC_SpacingMark), ('\U000110b0', 
'\U000110b2', GC_SpacingMark), + ('\U000110b3', '\U000110b6', GC_Extend), ('\U000110b7', '\U000110b8', GC_SpacingMark), + ('\U000110b9', '\U000110ba', GC_Extend), ('\U000110bd', '\U000110bd', GC_Control), + ('\U00011100', '\U00011102', GC_Extend), ('\U00011127', '\U0001112b', GC_Extend), + ('\U0001112c', '\U0001112c', GC_SpacingMark), ('\U0001112d', '\U00011134', GC_Extend), + ('\U00011173', '\U00011173', GC_Extend), ('\U00011180', '\U00011181', GC_Extend), + ('\U00011182', '\U00011182', GC_SpacingMark), ('\U000111b3', '\U000111b5', GC_SpacingMark), + ('\U000111b6', '\U000111be', GC_Extend), ('\U000111bf', '\U000111c0', GC_SpacingMark), + ('\U0001122c', '\U0001122e', GC_SpacingMark), ('\U0001122f', '\U00011231', GC_Extend), + ('\U00011232', '\U00011233', GC_SpacingMark), ('\U00011234', '\U00011234', GC_Extend), + ('\U00011235', '\U00011235', GC_SpacingMark), ('\U00011236', '\U00011237', GC_Extend), + ('\U000112df', '\U000112df', GC_Extend), ('\U000112e0', '\U000112e2', GC_SpacingMark), + ('\U000112e3', '\U000112ea', GC_Extend), ('\U00011301', '\U00011301', GC_Extend), + ('\U00011302', '\U00011303', GC_SpacingMark), ('\U0001133c', '\U0001133c', GC_Extend), + ('\U0001133e', '\U0001133e', GC_Extend), ('\U0001133f', '\U0001133f', GC_SpacingMark), + ('\U00011340', '\U00011340', GC_Extend), ('\U00011341', '\U00011344', GC_SpacingMark), + ('\U00011347', '\U00011348', GC_SpacingMark), ('\U0001134b', '\U0001134d', GC_SpacingMark), + ('\U00011357', '\U00011357', GC_Extend), ('\U00011362', '\U00011363', GC_SpacingMark), + ('\U00011366', '\U0001136c', GC_Extend), ('\U00011370', '\U00011374', GC_Extend), + ('\U000114b0', '\U000114b0', GC_Extend), ('\U000114b1', '\U000114b2', GC_SpacingMark), + ('\U000114b3', '\U000114b8', GC_Extend), ('\U000114b9', '\U000114b9', GC_SpacingMark), + ('\U000114ba', '\U000114ba', GC_Extend), ('\U000114bb', '\U000114bc', GC_SpacingMark), + ('\U000114bd', '\U000114bd', GC_Extend), ('\U000114be', '\U000114be', GC_SpacingMark), + ('\U000114bf', 
'\U000114c0', GC_Extend), ('\U000114c1', '\U000114c1', GC_SpacingMark), + ('\U000114c2', '\U000114c3', GC_Extend), ('\U000115af', '\U000115af', GC_Extend), + ('\U000115b0', '\U000115b1', GC_SpacingMark), ('\U000115b2', '\U000115b5', GC_Extend), + ('\U000115b8', '\U000115bb', GC_SpacingMark), ('\U000115bc', '\U000115bd', GC_Extend), + ('\U000115be', '\U000115be', GC_SpacingMark), ('\U000115bf', '\U000115c0', GC_Extend), + ('\U00011630', '\U00011632', GC_SpacingMark), ('\U00011633', '\U0001163a', GC_Extend), + ('\U0001163b', '\U0001163c', GC_SpacingMark), ('\U0001163d', '\U0001163d', GC_Extend), + ('\U0001163e', '\U0001163e', GC_SpacingMark), ('\U0001163f', '\U00011640', GC_Extend), + ('\U000116ab', '\U000116ab', GC_Extend), ('\U000116ac', '\U000116ac', GC_SpacingMark), + ('\U000116ad', '\U000116ad', GC_Extend), ('\U000116ae', '\U000116af', GC_SpacingMark), + ('\U000116b0', '\U000116b5', GC_Extend), ('\U000116b6', '\U000116b6', GC_SpacingMark), + ('\U000116b7', '\U000116b7', GC_Extend), ('\U00016af0', '\U00016af4', GC_Extend), + ('\U00016b30', '\U00016b36', GC_Extend), ('\U00016f51', '\U00016f7e', GC_SpacingMark), + ('\U00016f8f', '\U00016f92', GC_Extend), ('\U0001bc9d', '\U0001bc9e', GC_Extend), + ('\U0001bca0', '\U0001bca3', GC_Control), ('\U0001d165', '\U0001d165', GC_Extend), + ('\U0001d166', '\U0001d166', GC_SpacingMark), ('\U0001d167', '\U0001d169', GC_Extend), + ('\U0001d16d', '\U0001d16d', GC_SpacingMark), ('\U0001d16e', '\U0001d172', GC_Extend), + ('\U0001d173', '\U0001d17a', GC_Control), ('\U0001d17b', '\U0001d182', GC_Extend), + ('\U0001d185', '\U0001d18b', GC_Extend), ('\U0001d1aa', '\U0001d1ad', GC_Extend), + ('\U0001d242', '\U0001d244', GC_Extend), ('\U0001e8d0', '\U0001e8d6', GC_Extend), + ('\U0001f1e6', '\U0001f1ff', GC_RegionalIndicator), ('\U000e0000', '\U000e00ff', + GC_Control), ('\U000e0100', '\U000e01ef', GC_Extend), ('\U000e01f0', '\U000e0fff', + GC_Control) + ]; + +} diff --git a/src/libunicode/u_str.rs b/src/libunicode/u_str.rs index 
b0c40cdbcf9c5..263cf5a730a57 100644 --- a/src/libunicode/u_str.rs +++ b/src/libunicode/u_str.rs @@ -15,12 +15,15 @@ * methods provided by the UnicodeChar trait. */ +use core::clone::Clone; +use core::cmp; use core::collections::Collection; -use core::iter::{Filter, AdditiveIterator}; +use core::iter::{Filter, AdditiveIterator, Iterator, DoubleEndedIterator}; +use core::option::{Option, None, Some}; use core::str::{CharSplits, StrSlice}; -use core::iter::Iterator; use u_char; use u_char::UnicodeChar; +use tables::grapheme::GraphemeCat; /// An iterator over the words of a string, separated by a sequence of whitespace pub type Words<'a> = @@ -28,6 +31,36 @@ pub type Words<'a> = /// Methods for Unicode string slices pub trait UnicodeStrSlice<'a> { + /// Returns an iterator over the + /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) + /// of the string. + /// + /// If `is_extended` is true, the iterator is over the *extended grapheme clusters*; + /// otherwise, the iterator is over the *legacy grapheme clusters*. + /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) + /// recommends extended grapheme cluster boundaries for general processing. + /// + /// # Example + /// + /// ```rust + /// let gr1 = "a\u0310e\u0301o\u0308\u0332".graphemes(true).collect::<Vec<&str>>(); + /// assert_eq!(gr1.as_slice(), &["a\u0310", "e\u0301", "o\u0308\u0332"]); + /// let gr2 = "a\r\nb🇷🇺🇸🇹".graphemes(true).collect::<Vec<&str>>(); + /// assert_eq!(gr2.as_slice(), &["a", "\r\n", "b", "🇷🇺🇸🇹"]); + /// ``` + fn graphemes(&self, is_extended: bool) -> Graphemes<'a>; + + /// Returns an iterator over the grapheme clusters of self and their byte offsets. + /// See `graphemes()` method for more information.
+ /// + /// # Example + /// + /// ```rust + /// let gr_inds = "a̐éö̲\r\n".grapheme_indices(true).collect::>(); + /// assert_eq!(gr_inds.as_slice(), &[(0u, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]); + /// ``` + fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices<'a>; + /// An iterator over the words of a string (subsequences separated /// by any sequence of whitespace). Sequences of whitespace are /// collapsed, so empty "words" are not included. @@ -92,6 +125,16 @@ pub trait UnicodeStrSlice<'a> { } impl<'a> UnicodeStrSlice<'a> for &'a str { + #[inline] + fn graphemes(&self, is_extended: bool) -> Graphemes<'a> { + Graphemes { string: *self, extended: is_extended, cat: None, catb: None } + } + + #[inline] + fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices<'a> { + GraphemeIndices { start_offset: self.as_ptr() as uint, iter: self.graphemes(is_extended) } + } + #[inline] fn words(&self) -> Words<'a> { self.split(u_char::is_whitespace).filter(|s| !s.is_empty()) @@ -123,3 +166,257 @@ impl<'a> UnicodeStrSlice<'a> for &'a str { self.trim_right_chars(u_char::is_whitespace) } } + +/// External iterator for grapheme clusters and byte offsets. +#[deriving(Clone)] +pub struct GraphemeIndices<'a> { + start_offset: uint, + iter: Graphemes<'a>, +} + +impl<'a> Iterator<(uint, &'a str)> for GraphemeIndices<'a> { + #[inline] + fn next(&mut self) -> Option<(uint, &'a str)> { + self.iter.next().map(|s| (s.as_ptr() as uint - self.start_offset, s)) + } + + #[inline] + fn size_hint(&self) -> (uint, Option) { + self.iter.size_hint() + } +} + +impl<'a> DoubleEndedIterator<(uint, &'a str)> for GraphemeIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(uint, &'a str)> { + self.iter.next_back().map(|s| (s.as_ptr() as uint - self.start_offset, s)) + } +} + +/// External iterator for a string's +/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries). 
+#[deriving(Clone)] +pub struct Graphemes<'a> { + string: &'a str, + extended: bool, + cat: Option, + catb: Option, +} + +// state machine for cluster boundary rules +#[deriving(PartialEq,Eq)] +enum GraphemeState { + Start, + FindExtend, + HangulL, + HangulLV, + HangulLVT, + Regional, +} + +impl<'a> Iterator<&'a str> for Graphemes<'a> { + #[inline] + fn size_hint(&self) -> (uint, Option) { + let slen = self.string.len(); + (cmp::min(slen, 1u), Some(slen)) + } + + #[inline] + fn next(&mut self) -> Option<&'a str> { + use gr = tables::grapheme; + if self.string.len() == 0 { + return None; + } + + let mut take_curr = true; + let mut idx = 0; + let mut state = Start; + let mut cat = gr::GC_Any; + for (curr, ch) in self.string.char_indices() { + idx = curr; + + // retrieve cached category, if any + // We do this because most of the time we would end up + // looking up each character twice. + cat = match self.cat { + None => gr::grapheme_category(ch), + _ => self.cat.take_unwrap() + }; + + if match cat { + gr::GC_Extend => true, + gr::GC_SpacingMark if self.extended => true, + _ => false + } { + state = FindExtend; // rule GB9/GB9a + continue; + } + + state = match state { + Start if '\r' == ch => { + let slen = self.string.len(); + let nidx = idx + 1; + if nidx != slen && self.string.char_at(nidx) == '\n' { + idx = nidx; // rule GB3 + } + break; // rule GB4 + } + Start => match cat { + gr::GC_Control => break, + gr::GC_L => HangulL, + gr::GC_LV | gr::GC_V => HangulLV, + gr::GC_LVT | gr::GC_T => HangulLVT, + gr::GC_RegionalIndicator => Regional, + _ => FindExtend + }, + FindExtend => { // found non-extending when looking for extending + take_curr = false; + break; + }, + HangulL => match cat { // rule GB6: L x (L|V|LV|LVT) + gr::GC_L => continue, + gr::GC_LV | gr::GC_V => HangulLV, + gr::GC_LVT => HangulLVT, + _ => { + take_curr = false; + break; + } + }, + HangulLV => match cat { // rule GB7: (LV|V) x (V|T) + gr::GC_V => continue, + gr::GC_T => HangulLVT, + _ => { + 
take_curr = false; + break; + } + }, + HangulLVT => match cat { // rule GB8: (LVT|T) x T + gr::GC_T => continue, + _ => { + take_curr = false; + break; + } + }, + Regional => match cat { // rule GB8a + gr::GC_RegionalIndicator => continue, + _ => { + take_curr = false; + break; + } + } + } + } + + self.cat = if take_curr { + idx = self.string.char_range_at(idx).next; + None + } else { + Some(cat) + }; + + let retstr = self.string.slice_to(idx); + self.string = self.string.slice_from(idx); + Some(retstr) + } +} + +impl<'a> DoubleEndedIterator<&'a str> for Graphemes<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + use gr = tables::grapheme; + if self.string.len() == 0 { + return None; + } + + let mut take_curr = true; + let mut idx = self.string.len(); + let mut previdx = idx; + let mut state = Start; + let mut cat = gr::GC_Any; + for (curr, ch) in self.string.char_indices().rev() { + previdx = idx; + idx = curr; + + // cached category, if any + cat = match self.catb { + None => gr::grapheme_category(ch), + _ => self.catb.take_unwrap() + }; + + // a matching state machine that runs *backwards* across an input string + // note that this has some implications for the Hangul matching, since + // we now need to know what the rightward letter is: + // + // Right to left, we have: + // L x L + // V x (L|V|LV) + // T x (V|T|LV|LVT) + // HangulL means the letter to the right is L + // HangulLV means the letter to the right is V + // HangulLVT means the letter to the right is T + state = match state { + Start if '\n' == ch => { + if idx > 0 && '\r' == self.string.char_at_reverse(idx) { + idx -= 1; // rule GB3 + } + break; // rule GB4 + }, + Start | FindExtend => match cat { + gr::GC_Extend => FindExtend, + gr::GC_SpacingMark if self.extended => FindExtend, + gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL, + gr::GC_V => HangulLV, + gr::GC_T => HangulLVT, + gr::GC_RegionalIndicator => Regional, + gr::GC_Control => { + take_curr = Start == state; + break; + }, + 
_ => break + }, + HangulL => match cat { // char to right is an L + gr::GC_L => continue, // L x L is the only legal match + _ => { + take_curr = false; + break; + } + }, + HangulLV => match cat { // char to right is a V + gr::GC_V => continue, // V x V, right char is still V + gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L + _ => { + take_curr = false; + break; + } + }, + HangulLVT => match cat { // char to right is a T + gr::GC_T => continue, // T x T, right char is still T + gr::GC_V => HangulLV, // V x T, right char is now V + gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L + _ => { + take_curr = false; + break; + } + }, + Regional => match cat { // rule GB8a + gr::GC_RegionalIndicator => continue, + _ => { + take_curr = false; + break; + } + } + } + } + + self.catb = if take_curr { + None + } else { + idx = previdx; + Some(cat) + }; + + let retstr = self.string.slice_from(idx); + self.string = self.string.slice_to(idx); + Some(retstr) + } +}