Skip to content

Add unicode_word_indices #91

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ extern crate quickcheck;
pub use grapheme::{Graphemes, GraphemeIndices};
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices};
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};

mod grapheme;
Expand Down Expand Up @@ -146,6 +146,30 @@ pub trait UnicodeSegmentation {
/// ```
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;

/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
/// byte offsets.
///
/// Each item is a `(usize, &str)` pair: the offset at which the word starts
/// in `self`, followed by the word itself.
///
/// Here, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// Substrings made up only of other characters (whitespace, punctuation, ...)
/// are skipped, so consecutive offsets are generally not contiguous.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
///
/// assert_eq!(&uwi1[..], b);
/// ```
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
Expand Down Expand Up @@ -249,6 +273,11 @@ impl UnicodeSegmentation for str {
word::new_unicode_words(self)
}

// Thin forwarding wrapper: the iterator itself is built by the `word`
// module's constructor, this method only exposes it through the trait.
#[inline]
fn unicode_word_indices(&self) -> UnicodeWordIndices {
word::new_unicode_word_indices(self)
}

#[inline]
fn split_word_bounds(&self) -> UWordBounds {
word::new_word_bounds(self)
Expand Down
46 changes: 42 additions & 4 deletions src/word.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
}

/// An iterator over the words of a string together with their byte offsets.
///
/// More precisely, this is an iterator over the substrings of a string which,
/// after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring: every
/// item is an `(offset, substring)` pair of type `(usize, &str)`.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWordIndices<'a> {
// Word-boundary index iterator wrapped in a `Filter`; the predicate is
// stored as a plain `fn` pointer so the field type can be written out.
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}

/// Forward iteration: yields `(offset, word)` pairs in the order produced by
/// the wrapped, filtered word-boundary iterator.
impl<'a> Iterator for UnicodeWordIndices<'a> {
    type Item = (usize, &'a str);

    /// Hands back the next item of the inner filtered iterator, or `None`
    /// once it is exhausted.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.inner.next()
    }
}
/// Backward iteration: yields `(offset, word)` pairs starting from the end of
/// the wrapped, filtered word-boundary iterator.
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
    /// Hands back the last remaining item of the inner filtered iterator, or
    /// `None` once it is exhausted.
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        self.inner.next_back()
    }
}

/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
Expand Down Expand Up @@ -671,12 +699,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
}

#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
use super::UnicodeSegmentation;
fn has_alphanumeric(s: &&str) -> bool {
use tables::util::is_alphanumeric;

fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
s.chars().any(|c| is_alphanumeric(c))
}

/// Builds the `UnicodeWords` iterator for `s`: the string is split on word
/// bounds and only the pieces accepted by `has_alphanumeric` are kept.
#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
    use super::UnicodeSegmentation;

    let word_bounds = s.split_word_bounds();
    UnicodeWords {
        inner: word_bounds.filter(has_alphanumeric),
    }
}

/// Builds the `UnicodeWordIndices` iterator for `s`: the string is split on
/// word bounds with their indices, and only the `(index, piece)` pairs whose
/// piece is accepted by `has_alphanumeric` are kept.
#[inline]
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
    use super::UnicodeSegmentation;

    let bound_indices = s.split_word_bound_indices();
    UnicodeWordIndices {
        // The index is irrelevant to the predicate; only the text is inspected.
        inner: bound_indices.filter(|(_, piece)| has_alphanumeric(piece)),
    }
}