diff --git a/scripts/unicode.py b/scripts/unicode.py index 580b3c0..617f8cd 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -280,7 +280,8 @@ def emit_break_module(f, break_table, break_cats, name): f.write((" %sC_" % Name[0]) + cat + ",\n") f.write(""" } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) { + use core; use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -288,14 +289,20 @@ def emit_break_module(f, break_table, break_cats, name): else { Greater } }) { Ok(idx) => { - let (_, _, cat) = r[idx]; - cat + let (lower, upper, cat) = r[idx]; + (lower as u32, upper as u32, cat) + } + Err(idx) => { + ( + if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + %sC_Any, + ) } - Err(_) => %sC_Any } } - pub fn %s_category(c: char) -> %sCat { + pub fn %s_category(c: char) -> (u32, u32, %sCat) { bsearch_range_value_table(c, %s_cat_table) } diff --git a/src/grapheme.rs b/src/grapheme.rs index cde6526..b66536e 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -178,6 +178,8 @@ pub struct GraphemeCursor { // Set if a call to `prev_boundary` or `next_boundary` was suspended due // to needing more input. resuming: bool, + // Cached grapheme category and associated scalar value range. + grapheme_cat_cache: (u32, u32, GraphemeCat), } /// An error return indicating that not enough content was available in the @@ -276,9 +278,20 @@ impl GraphemeCursor { pre_context_offset: None, ris_count: None, resuming: false, + grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control), } } + fn grapheme_category(&mut self, ch: char) -> GraphemeCat { + use tables::grapheme as gr; + // If this char isn't within the cached range, update the cache to the + // range that includes it. 
+ if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 { + self.grapheme_cat_cache = gr::grapheme_category(ch); + } + self.grapheme_cat_cache.2 + } + // Not sure I'm gonna keep this, the advantage over new() seems thin. /// Set the cursor to a new location in the same string. @@ -349,7 +362,7 @@ impl GraphemeCursor { self.pre_context_offset = None; if self.is_extended && chunk_start + chunk.len() == self.offset { let ch = chunk.chars().rev().next().unwrap(); - if gr::grapheme_category(ch) == gr::GC_Prepend { + if self.grapheme_category(ch) == gr::GC_Prepend { self.decide(false); // GB9b return; } @@ -359,7 +372,7 @@ impl GraphemeCursor { GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start), _ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start { let ch = chunk.chars().rev().next().unwrap(); - self.cat_before = Some(gr::grapheme_category(ch)); + self.cat_before = Some(self.grapheme_category(ch)); }, } } @@ -393,7 +406,7 @@ impl GraphemeCursor { use tables::grapheme as gr; let mut ris_count = self.ris_count.unwrap_or(0); for ch in chunk.chars().rev() { - if gr::grapheme_category(ch) != gr::GC_Regional_Indicator { + if self.grapheme_category(ch) != gr::GC_Regional_Indicator { self.ris_count = Some(ris_count); self.decide((ris_count % 2) == 0); return; @@ -413,13 +426,13 @@ impl GraphemeCursor { use tables::grapheme as gr; let mut iter = chunk.chars().rev(); if let Some(ch) = iter.next() { - if gr::grapheme_category(ch) != gr::GC_ZWJ { + if self.grapheme_category(ch) != gr::GC_ZWJ { self.decide(true); return; } } for ch in iter { - match gr::grapheme_category(ch) { + match self.grapheme_category(ch) { gr::GC_Extend => (), gr::GC_Extended_Pictographic => { self.decide(false); @@ -481,7 +494,7 @@ impl GraphemeCursor { let offset_in_chunk = self.offset - chunk_start; if self.cat_after.is_none() { let ch = chunk[offset_in_chunk..].chars().next().unwrap(); - self.cat_after = Some(gr::grapheme_category(ch)); + 
self.cat_after = Some(self.grapheme_category(ch)); } if self.offset == chunk_start { let mut need_pre_context = true; @@ -497,7 +510,7 @@ impl GraphemeCursor { } if self.cat_before.is_none() { let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap(); - self.cat_before = Some(gr::grapheme_category(ch)); + self.cat_before = Some(self.grapheme_category(ch)); } match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) { PairResult::NotBreak => return self.decision(false), @@ -553,7 +566,6 @@ impl GraphemeCursor { /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None)); /// ``` pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> { - use tables::grapheme as gr; if self.offset == self.len { return Ok(None); } @@ -562,14 +574,14 @@ impl GraphemeCursor { loop { if self.resuming { if self.cat_after.is_none() { - self.cat_after = Some(gr::grapheme_category(ch)); + self.cat_after = Some(self.grapheme_category(ch)); } } else { self.offset += ch.len_utf8(); self.state = GraphemeState::Unknown; self.cat_before = self.cat_after.take(); if self.cat_before.is_none() { - self.cat_before = Some(gr::grapheme_category(ch)); + self.cat_before = Some(self.grapheme_category(ch)); } if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator { self.ris_count = self.ris_count.map(|c| c + 1); @@ -578,7 +590,7 @@ impl GraphemeCursor { } if let Some(next_ch) = iter.next() { ch = next_ch; - self.cat_after = Some(gr::grapheme_category(ch)); + self.cat_after = Some(self.grapheme_category(ch)); } else if self.offset == self.len { self.decide(true); } else { @@ -629,7 +641,6 @@ impl GraphemeCursor { /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None)); /// ``` pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> { - use tables::grapheme as gr; if self.offset == 0 { return Ok(None); } @@ -644,7 +655,7 @@ impl GraphemeCursor { return Err(GraphemeIncomplete::PrevChunk); } if 
self.resuming { - self.cat_before = Some(gr::grapheme_category(ch)); + self.cat_before = Some(self.grapheme_category(ch)); } else { self.offset -= ch.len_utf8(); self.cat_after = self.cat_before.take(); @@ -654,12 +665,12 @@ impl GraphemeCursor { } if let Some(prev_ch) = iter.next() { ch = prev_ch; - self.cat_before = Some(gr::grapheme_category(ch)); + self.cat_before = Some(self.grapheme_category(ch)); } else if self.offset == 0 { self.decide(true); } else { self.resuming = true; - self.cat_after = Some(gr::grapheme_category(ch)); + self.cat_after = Some(self.grapheme_category(ch)); return Err(GraphemeIncomplete::PrevChunk); } } diff --git a/src/sentence.rs b/src/sentence.rs index c16c927..48cafba 100644 --- a/src/sentence.rs +++ b/src/sentence.rs @@ -115,7 +115,7 @@ mod fwd { for next_char in ahead.chars() { //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower - match se::sentence_category(next_char) { + match se::sentence_category(next_char).2 { se::SC_Lower => return true, se::SC_OLetter | se::SC_Upper | @@ -182,7 +182,7 @@ mod fwd { let position_before = self.pos; let state_before = self.state.clone(); - let next_cat = se::sentence_category(next_char); + let next_cat = se::sentence_category(next_char).2; self.pos += next_char.len_utf8(); self.state = self.state.next(next_cat); diff --git a/src/tables.rs b/src/tables.rs index bfd7290..8e98c9a 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -345,7 +345,8 @@ pub mod grapheme { GC_ZWJ, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) { + use core; use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -353,14 +354,20 @@ pub mod grapheme { else { Greater } }) { Ok(idx) => { - let (_, _, cat) = r[idx]; - cat + let (lower, upper, cat) = r[idx]; + (lower as u32, upper as u32, cat) + } 
+ Err(idx) => { + ( + if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + GC_Any, + ) } - Err(_) => GC_Any } } - pub fn grapheme_category(c: char) -> GraphemeCat { + pub fn grapheme_category(c: char) -> (u32, u32, GraphemeCat) { bsearch_range_value_table(c, grapheme_cat_table) } @@ -980,7 +987,8 @@ pub mod word { WC_ZWJ, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> WordCat { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) { + use core; use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -988,14 +996,20 @@ pub mod word { else { Greater } }) { Ok(idx) => { - let (_, _, cat) = r[idx]; - cat + let (lower, upper, cat) = r[idx]; + (lower as u32, upper as u32, cat) + } + Err(idx) => { + ( + if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + WC_Any, + ) } - Err(_) => WC_Any } } - pub fn word_category(c: char) -> WordCat { + pub fn word_category(c: char) -> (u32, u32, WordCat) { bsearch_range_value_table(c, word_cat_table) } @@ -1439,7 +1453,8 @@ pub mod emoji { EC_Extended_Pictographic, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> EmojiCat { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) { + use core; use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1447,14 +1462,20 @@ pub mod emoji { else { Greater } }) { Ok(idx) => { - let (_, _, cat) = r[idx]; - cat + let (lower, upper, cat) = r[idx]; + (lower as u32, upper as u32, cat) + } + Err(idx) => { + ( + if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + EC_Any, + ) } - Err(_) => EC_Any } } - pub fn emoji_category(c: 
char) -> EmojiCat { + pub fn emoji_category(c: char) -> (u32, u32, EmojiCat) { bsearch_range_value_table(c, emoji_cat_table) } @@ -1535,7 +1556,8 @@ pub mod sentence { SC_Upper, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> SentenceCat { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) { + use core; use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1543,14 +1565,20 @@ pub mod sentence { else { Greater } }) { Ok(idx) => { - let (_, _, cat) = r[idx]; - cat + let (lower, upper, cat) = r[idx]; + (lower as u32, upper as u32, cat) + } + Err(idx) => { + ( + if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + SC_Any, + ) } - Err(_) => SC_Any } } - pub fn sentence_category(c: char) -> SentenceCat { + pub fn sentence_category(c: char) -> (u32, u32, SentenceCat) { bsearch_range_value_table(c, sentence_cat_table) } diff --git a/src/word.rs b/src/word.rs index 6e9c049..5cf111a 100644 --- a/src/word.rs +++ b/src/word.rs @@ -125,7 +125,7 @@ enum RegionalState { fn is_emoji(ch: char) -> bool { use tables::emoji; - emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic + emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic } impl<'a> Iterator for UWordBounds<'a> { @@ -164,7 +164,7 @@ impl<'a> Iterator for UWordBounds<'a> { prev_zwj = cat == wd::WC_ZWJ; // if there's a category cached, grab it cat = match self.cat { - None => wd::word_category(ch), + None => wd::word_category(ch).2, _ => self.cat.take().unwrap() }; take_cat = true; @@ -391,7 +391,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> { // if there's a category cached, grab it cat = match self.catb { - None => wd::word_category(ch), + None => wd::word_category(ch).2, _ => self.catb.take().unwrap() }; take_cat = true; @@ -533,7 +533,7 @@ impl<'a> 
DoubleEndedIterator for UWordBounds<'a> { if regional_state == RegionalState::Unknown { let count = self.string[..previdx] .chars().rev() - .map(|c| wd::word_category(c)) + .map(|c| wd::word_category(c).2) .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)) .take_while(|&c| c == wd::WC_Regional_Indicator) .count(); @@ -624,7 +624,7 @@ impl<'a> UWordBounds<'a> { let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); if nidx < self.string.len() { let nch = self.string[nidx..].chars().next().unwrap(); - Some(wd::word_category(nch)) + Some(wd::word_category(nch).2) } else { None } @@ -635,7 +635,7 @@ impl<'a> UWordBounds<'a> { use tables::word as wd; if idx > 0 { let nch = self.string[..idx].chars().next_back().unwrap(); - Some(wd::word_category(nch)) + Some(wd::word_category(nch).2) } else { None }