diff --git a/scripts/unicode.py b/scripts/unicode.py
index 580b3c0..617f8cd 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -280,7 +280,8 @@ def emit_break_module(f, break_table, break_cats, name):
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
+ use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -288,14 +289,20 @@ def emit_break_module(f, break_table, break_cats, name):
else { Greater }
}) {
Ok(idx) => {
- let (_, _, cat) = r[idx];
- cat
+ let (lower, upper, cat) = r[idx];
+ (lower as u32, upper as u32, cat)
+ }
+ Err(idx) => {
+ (
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ %sC_Any,
+ )
}
- Err(_) => %sC_Any
}
}
- pub fn %s_category(c: char) -> %sCat {
+ pub fn %s_category(c: char) -> (u32, u32, %sCat) {
bsearch_range_value_table(c, %s_cat_table)
}
diff --git a/src/grapheme.rs b/src/grapheme.rs
index cde6526..b66536e 100644
--- a/src/grapheme.rs
+++ b/src/grapheme.rs
@@ -178,6 +178,8 @@ pub struct GraphemeCursor {
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
// to needing more input.
resuming: bool,
+ // Cached grapheme category and associated scalar value range.
+ grapheme_cat_cache: (u32, u32, GraphemeCat),
}
/// An error return indicating that not enough content was available in the
@@ -276,9 +278,20 @@ impl GraphemeCursor {
pre_context_offset: None,
ris_count: None,
resuming: false,
+ grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
}
}
+ fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
+ use tables::grapheme as gr;
+ // If this char isn't within the cached range, update the cache to the
+ // range that includes it.
+ if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
+ self.grapheme_cat_cache = gr::grapheme_category(ch);
+ }
+ self.grapheme_cat_cache.2
+ }
+
// Not sure I'm gonna keep this, the advantage over new() seems thin.
/// Set the cursor to a new location in the same string.
@@ -349,7 +362,7 @@ impl GraphemeCursor {
self.pre_context_offset = None;
if self.is_extended && chunk_start + chunk.len() == self.offset {
let ch = chunk.chars().rev().next().unwrap();
- if gr::grapheme_category(ch) == gr::GC_Prepend {
+ if self.grapheme_category(ch) == gr::GC_Prepend {
self.decide(false); // GB9b
return;
}
@@ -359,7 +372,7 @@ impl GraphemeCursor {
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
_ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
let ch = chunk.chars().rev().next().unwrap();
- self.cat_before = Some(gr::grapheme_category(ch));
+ self.cat_before = Some(self.grapheme_category(ch));
},
}
}
@@ -393,7 +406,7 @@ impl GraphemeCursor {
use tables::grapheme as gr;
let mut ris_count = self.ris_count.unwrap_or(0);
for ch in chunk.chars().rev() {
- if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
+ if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
self.ris_count = Some(ris_count);
self.decide((ris_count % 2) == 0);
return;
@@ -413,13 +426,13 @@ impl GraphemeCursor {
use tables::grapheme as gr;
let mut iter = chunk.chars().rev();
if let Some(ch) = iter.next() {
- if gr::grapheme_category(ch) != gr::GC_ZWJ {
+ if self.grapheme_category(ch) != gr::GC_ZWJ {
self.decide(true);
return;
}
}
for ch in iter {
- match gr::grapheme_category(ch) {
+ match self.grapheme_category(ch) {
gr::GC_Extend => (),
gr::GC_Extended_Pictographic => {
self.decide(false);
@@ -481,7 +494,7 @@ impl GraphemeCursor {
let offset_in_chunk = self.offset - chunk_start;
if self.cat_after.is_none() {
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
- self.cat_after = Some(gr::grapheme_category(ch));
+ self.cat_after = Some(self.grapheme_category(ch));
}
if self.offset == chunk_start {
let mut need_pre_context = true;
@@ -497,7 +510,7 @@ impl GraphemeCursor {
}
if self.cat_before.is_none() {
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
- self.cat_before = Some(gr::grapheme_category(ch));
+ self.cat_before = Some(self.grapheme_category(ch));
}
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
PairResult::NotBreak => return self.decision(false),
@@ -553,7 +566,6 @@ impl GraphemeCursor {
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result