From 5f866f525e6a531830d1a7c636c67b8f65602466 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 4 May 2023 14:12:56 -0400 Subject: [PATCH] regex-automata: fix bug in DFA quit behavior It turns out that the way we were dealing with quit states in the DFA was not quite right. Basically, if we entered a quit state and a match had been found, then we were returning the match instead of the error. But the match might not be the correct leftmost-first match, and so, we really shouldn't return it. Otherwise a regex like '\B.*' could match much less than it should. This was caught by a differential fuzzer developed in #848. --- regex-automata/src/dfa/dense.rs | 6 +++--- regex-automata/src/dfa/search.rs | 12 ------------ regex-automata/src/hybrid/dfa.rs | 6 +++--- regex-automata/src/hybrid/search.rs | 12 ------------ regex-automata/src/meta/strategy.rs | 8 ++++---- testdata/regression.toml | 30 +++++++++++++++++++++++++++++ 6 files changed, 40 insertions(+), 34 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 38085cce6..3bd2fdcc3 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -562,7 +562,7 @@ impl Config { /// /// // The match occurs before the search ever observes the snowman /// // character, so no error occurs. - /// let haystack = "foo 123 ☃".as_bytes(); + /// let haystack = "foo 123 ☃".as_bytes(); /// let expected = Some(HalfMatch::must(0, 7)); /// let got = dfa.try_search_fwd(&Input::new(haystack))?; /// assert_eq!(expected, got); @@ -572,8 +572,8 @@ impl Config { /// // routines read one byte past the end of the search to account for /// // look-around, and indeed, this is required here to determine whether /// // the trailing \b matches. - /// let haystack = "foo 123☃".as_bytes(); - /// let expected = MatchError::quit(0xE2, 7); + /// let haystack = "foo 123 ☃".as_bytes(); + /// let expected = MatchError::quit(0xE2, 8); /// let got = dfa.try_search_fwd(&Input::new(haystack)); /// assert_eq!(Err(expected), got); /// diff --git a/regex-automata/src/dfa/search.rs b/regex-automata/src/dfa/search.rs index 6db745b1b..c975921c9 100644 --- a/regex-automata/src/dfa/search.rs +++ b/regex-automata/src/dfa/search.rs @@ -180,9 +180,6 @@ fn find_fwd_imp( // actually be tripped even if DFA::from_bytes succeeds and // returns a supposedly valid DFA. debug_assert!(dfa.is_quit_state(sid)); - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -307,9 +304,6 @@ fn find_rev_imp( return Ok(mat); } else { debug_assert!(dfa.is_quit_state(sid)); - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -611,9 +605,6 @@ fn eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } @@ -646,9 +637,6 @@ fn eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 33e48639d..246e9f55f 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -3226,7 +3226,7 @@ impl Config { /// /// // The match occurs before the search ever observes the snowman /// // character, so no error occurs. - /// let haystack = "foo 123 ☃"; + /// let haystack = "foo 123 ☃"; /// let expected = Some(HalfMatch::must(0, 7)); /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; /// assert_eq!(expected, got); @@ -3236,8 +3236,8 @@ impl Config { /// // routines read one byte past the end of the search to account for /// // look-around, and indeed, this is required here to determine whether /// // the trailing \b matches. - /// let haystack = "foo 123☃"; - /// let expected = MatchError::quit(0xE2, 7); + /// let haystack = "foo 123 ☃"; + /// let expected = MatchError::quit(0xE2, 8); /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack)); /// assert_eq!(Err(expected), got); /// diff --git a/regex-automata/src/hybrid/search.rs b/regex-automata/src/hybrid/search.rs index 26bec8d7f..aad5c25c3 100644 --- a/regex-automata/src/hybrid/search.rs +++ b/regex-automata/src/hybrid/search.rs @@ -282,9 +282,6 @@ fn find_fwd_imp( return Ok(mat); } else if sid.is_quit() { cache.search_finish(at); - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at)); } else { debug_assert!(sid.is_unknown()); @@ -432,9 +429,6 @@ fn find_rev_imp( return Ok(mat); } else if sid.is_quit() { cache.search_finish(at); - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at)); } else { debug_assert!(sid.is_unknown()); @@ -730,9 +724,6 @@ fn eoi_fwd( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } @@ -770,9 +761,6 @@ fn eoi_rev( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 8fc9ef994..69028d1b0 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -636,7 +636,7 @@ impl Strategy for Core { // We manually inline try_search_mayfail here because letting the // compiler do it seems to produce pretty crappy codegen. return if let Some(e) = self.dfa.get(input) { - trace!("using full DFA for search at {:?}", input.get_span()); + trace!("using full DFA for full search at {:?}", input.get_span()); match e.try_search(input) { Ok(x) => x, Err(_err) => { @@ -645,7 +645,7 @@ impl Strategy for Core { } } } else if let Some(e) = self.hybrid.get(input) { - trace!("using lazy DFA for search at {:?}", input.get_span()); + trace!("using lazy DFA for full search at {:?}", input.get_span()); match e.try_search(&mut cache.hybrid, input) { Ok(x) => x, Err(_err) => { @@ -668,7 +668,7 @@ impl Strategy for Core { // can use a single forward scan without needing to run the reverse // DFA. return if let Some(e) = self.dfa.get(input) { - trace!("using full DFA for search at {:?}", input.get_span()); + trace!("using full DFA for half search at {:?}", input.get_span()); match e.try_search_half_fwd(input) { Ok(x) => x, Err(_err) => { @@ -677,7 +677,7 @@ impl Strategy for Core { } } } else if let Some(e) = self.hybrid.get(input) { - trace!("using lazy DFA for search at {:?}", input.get_span()); + trace!("using lazy DFA for half search at {:?}", input.get_span()); match e.try_search_half_fwd(&mut cache.hybrid, input) { Ok(x) => x, Err(_err) => { diff --git a/testdata/regression.toml b/testdata/regression.toml index 9af3f1a0f..39d6ceabc 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -689,3 +689,33 @@ name = "captures-wrong-order" regex = '(a){0}(a)' haystack = 'a' matches = [[[0, 1], [], [0, 1]]] + +# This tests a bug in how quit states are handled in the DFA. At some point +# during development, the DFAs were tweaked slightly such that if they hit +# a quit state (which means, they hit a byte that the caller configured should +# stop the search), then it might not return an error necessarily. Namely, if a +# match had already been found, then it would be returned instead of an error. +# +# But this is actually wrong! Why? Because even though a match had been found, +# it wouldn't be fully correct to return it once a quit state has been seen +# because you can't determine whether the match offset returned is the correct +# greedy/leftmost-first match. Since you can't complete the search as requested +# by the caller, the DFA should just stop and return an error. +# +# Interestingly, this does seem to produce an unavoidable difference between +# 'try_is_match().unwrap()' and 'try_find().unwrap().is_some()' for the DFAs. +# The former will stop immediately once a match is known to occur and return +# 'Ok(true)', where as the latter could find the match but quit with an +# 'Err(..)' first. +# +# Thankfully, I believe this inconsistency between 'is_match()' and 'find()' +# cannot be observed in the higher level meta regex API because it specifically +# will try another engine that won't fail in the case of a DFA failing. +# +# This regression happened in the regex crate rewrite, but before anything got +# released. +[[test]] +name = "negated-unicode-word-boundary-dfa-fail" +regex = '\B.*' +haystack = "!\u02D7" +matches = [[0, 3]]