Skip to content

Commit

Permalink
feat: handle /(foo|bar|baz)/ as an alternation of literals.
Browse files Browse the repository at this point in the history
With this change a regexp like `/(foo|bar|baz)/` is handled as an alternation of literals like `/foo|bar|baz/`. Prior to this change the capture group was preventing both regular expressions from being handled in the same way.
  • Loading branch information
plusvic committed Jul 28, 2023
1 parent b36b684 commit 38c0990
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 108 deletions.
224 changes: 127 additions & 97 deletions yara-x/src/compiler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pub use crate::compiler::errors::*;
pub use crate::compiler::rules::*;
use crate::compiler::SubPattern::{Regexp, RegexpChainHead, RegexpChainTail};
use crate::re;
use crate::re::hir::TrailingPattern;
use crate::re::hir::ChainedPattern;

mod atoms;
mod context;
Expand Down Expand Up @@ -810,10 +810,77 @@ impl<'a> Compiler<'a> {
}

/// Processes a regexp pattern, dispatching it to one of three paths:
///
/// * [`Self::process_chain`] when the regexp was split into a head and one
///   or more chained pieces,
/// * [`Self::process_alternation_literal`] when the whole regexp is a
///   literal or an alternation of literals,
/// * otherwise the regexp is compiled with `compile_regexp` and registered
///   as a `Regexp` sub-pattern (once per requested encoding: wide and/or
///   ascii).
fn process_regexp_pattern(&mut self, pattern: ir::RegexpPattern) {
// NOTE(review): these four locals are not referenced again in the body
// shown here (the code below queries `pattern.flags` directly); they look
// like leftover lines from the previous version of this function.
let ascii = pattern.flags.contains(PatternFlags::Ascii);
let wide = pattern.flags.contains(PatternFlags::Wide);
let case_insensitive = pattern.flags.contains(PatternFlags::Nocase);
let full_word = pattern.flags.contains(PatternFlags::Fullword);
// Try splitting the regexp into multiple chained sub-patterns if it
// contains large gaps. For example, `{ 01 02 03 [-] 04 05 06 }` is
// split into `{ 01 02 03 }` and `{ 04 05 06 }`, where `{ 04 05 06 }`
// is chained to `{ 01 02 03 }`.
//
// If the regexp can't be split then `head` is the whole regexp and
// `tail` is empty.
let (head, tail) = pattern.hir.split_at_large_gaps();

if !tail.is_empty() {
// The pattern was split into multiple chained regexps.
self.process_chain(&head, &tail, pattern.flags);
return;
}

if head.is_alternation_literal() {
// The pattern is either a literal, or an alternation of literals.
// Examples:
// /foo/
// /foo|bar|baz/
// { 01 02 03 }
// { (01 02 03 | 04 05 06 ) }
self.process_alternation_literal(head, pattern.flags);
return;
}

// This is a standard regexp: a pattern that can't be split into
// multiple chained patterns, and is neither a literal nor an
// alternation of literals.
let mut flags = SubPatternFlagSet::none();

// Translate the pattern-level flags into sub-pattern flags.
if pattern.flags.contains(PatternFlags::Nocase) {
flags.set(SubPatternFlags::Nocase);
}

// `Fullword` applies to both ends of the match.
if pattern.flags.contains(PatternFlags::Fullword) {
flags.set(SubPatternFlags::FullwordLeft);
flags.set(SubPatternFlags::FullwordRight);
}

// Only mark the sub-pattern as greedy when `is_greedy` positively
// reports `Some(true)`; `Some(false)` and `None` leave the flag unset.
if matches!(head.is_greedy(), Some(true)) {
flags.set(SubPatternFlags::Greedy);
}

// The atoms are extracted once and shared by the wide and ascii variants.
let atoms = self.compile_regexp(&head);

// The wide variant borrows the atoms (`iter`) so that they remain
// available for the ascii variant below.
if pattern.flags.contains(PatternFlags::Wide) {
self.add_sub_pattern(
Regexp { flags: flags | SubPatternFlags::Wide },
atoms.iter(),
SubPatternAtom::from_regexp_atom_wide,
);
}

// The ascii variant is the last user of the atoms, so it consumes them.
if pattern.flags.contains(PatternFlags::Ascii) {
self.add_sub_pattern(
Regexp { flags },
atoms.into_iter(),
SubPatternAtom::from_regexp_atom,
);
}
}

fn process_alternation_literal(
&mut self,
hir: re::hir::Hir,
flags: PatternFlagSet,
) {
let ascii = flags.contains(PatternFlags::Ascii);
let wide = flags.contains(PatternFlags::Wide);
let case_insensitive = flags.contains(PatternFlags::Nocase);
let full_word = flags.contains(PatternFlags::Fullword);

let mut flags = SubPatternFlagSet::none();

Expand All @@ -826,115 +893,78 @@ impl<'a> Compiler<'a> {
flags.set(SubPatternFlags::FullwordRight);
}

// Try splitting the regexp into multiple chained sub-patterns if it
// contains large gaps. If the regexp can't be split the leading part
// is the whole regexp.
let (leading, trailing) = pattern.hir.split_at_large_gaps();

if trailing.is_empty() && leading.is_alternation_literal() {
// The pattern is either a literal, or an alternation of literals,
// examples:
// /foo/ literal
// /foo|bar|baz/ alternation of literals
// { 01 02 03 } literal
// { (01 02 03 | 04 05 06 )} alternation of literals
let mut process_literal = |literal: &hir::Literal, wide: bool| {
let pattern_lit_id =
self.intern_literal(literal.0.as_bytes(), wide);

let best_atom = best_atom_from_slice(
self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
if wide {
DESIRED_ATOM_SIZE * 2
} else {
DESIRED_ATOM_SIZE
},
);
let mut process_literal = |literal: &hir::Literal, wide: bool| {
let pattern_lit_id =
self.intern_literal(literal.0.as_bytes(), wide);

let sp = SubPattern::Literal {
pattern: pattern_lit_id,
flags: if wide {
flags | SubPatternFlags::Wide
} else {
flags
},
};
let best_atom = best_atom_from_slice(
self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
if wide { DESIRED_ATOM_SIZE * 2 } else { DESIRED_ATOM_SIZE },
);

if case_insensitive {
self.add_sub_pattern(
sp,
CaseGenerator::new(&best_atom),
SubPatternAtom::from_atom,
);
let sp = SubPattern::Literal {
pattern: pattern_lit_id,
flags: if wide {
flags | SubPatternFlags::Wide
} else {
self.add_sub_pattern(
sp,
iter::once(best_atom),
SubPatternAtom::from_atom,
);
}
flags
},
};

match leading.into_kind() {
hir::HirKind::Literal(literal) => {
if ascii {
process_literal(&literal, false);
}
if wide {
process_literal(&literal, true);
}
}
hir::HirKind::Alternation(literals) => {
let literals = literals.into_iter().map({
|l| cast!(l.into_kind(), hir::HirKind::Literal)
});
for literal in literals {
if ascii {
process_literal(&literal, false);
}
if wide {
process_literal(&literal, true);
}
}
}
_ => unreachable!(),
}
} else if trailing.is_empty() {
if matches!(leading.is_greedy(), Some(true)) {
flags.set(SubPatternFlags::Greedy);
}

// The pattern is a regexp that can't be converted into a literal
// or alternation of literals, and can't be split into multiple
// regexps.
let atoms = self.compile_regexp(&leading);

if wide {
if case_insensitive {
self.add_sub_pattern(
Regexp { flags: flags | SubPatternFlags::Wide },
atoms.iter(),
SubPatternAtom::from_regexp_atom_wide,
sp,
CaseGenerator::new(&best_atom),
SubPatternAtom::from_atom,
);
}

if ascii {
} else {
self.add_sub_pattern(
Regexp { flags },
atoms.into_iter(),
SubPatternAtom::from_regexp_atom,
sp,
iter::once(best_atom),
SubPatternAtom::from_atom,
);
}
};

let inner;

let hir = if let hir::HirKind::Capture(group) = hir.kind() {
group.sub.as_ref()
} else {
// The pattern is a regexp that was split into multiple chained
// regexps.
self.process_chain(&leading, &trailing, pattern.flags);
inner = hir.into_inner();
&inner
};

match hir.kind() {
hir::HirKind::Literal(literal) => {
if ascii {
process_literal(literal, false);
}
if wide {
process_literal(literal, true);
}
}
hir::HirKind::Alternation(literals) => {
let literals = literals
.iter()
.map(|l| cast!(l.kind(), hir::HirKind::Literal));
for literal in literals {
if ascii {
process_literal(literal, false);
}
if wide {
process_literal(literal, true);
}
}
}
_ => unreachable!(),
}
}

fn process_chain(
&mut self,
leading: &re::hir::Hir,
trailing: &[TrailingPattern],
trailing: &[ChainedPattern],
flags: PatternFlagSet,
) {
let ascii = flags.contains(PatternFlags::Ascii);
Expand Down
34 changes: 27 additions & 7 deletions yara-x/src/re/hir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::ops::RangeInclusive;
use yara_x_parser::ast::HexByte;

/// One of the pieces returned by [`Hir::split_at_large_gaps`] after the
/// leading piece, together with the gap associated with it.
#[derive(Debug, PartialEq)]
pub(crate) struct TrailingPattern {
pub(crate) struct ChainedPattern {
/// Inclusive range of allowed gap sizes; presumably the distance, in
/// bytes, between the previous piece and `hir` (TODO confirm).
pub gap: RangeInclusive<u32>,
/// The sub-pattern for this piece.
pub hir: Hir,
}
Expand Down Expand Up @@ -67,7 +67,7 @@ impl Hir {
/// If the pattern doesn't contain any gap that is long enough, the pattern
/// won't be split, and the leading piece will contain the whole pattern
/// while the vector will be empty.
pub fn split_at_large_gaps(self) -> (Self, Vec<TrailingPattern>) {
pub fn split_at_large_gaps(self) -> (Self, Vec<ChainedPattern>) {
if !matches!(self.kind(), HirKind::Concat(_)) {
return (self, vec![]);
}
Expand All @@ -78,7 +78,7 @@ impl Hir {

let mut push = |gap: Option<RangeInclusive<u32>>, fragment| {
if let Some(gap) = gap {
trailing.push(TrailingPattern {
trailing.push(ChainedPattern {
gap,
hir: Hir::from(regex_syntax::hir::Hir::concat(fragment))
.set_greedy(greedy),
Expand Down Expand Up @@ -143,9 +143,29 @@ impl Hir {
self.inner.into_kind()
}

/// Consumes this [`Hir`] and returns the underlying
/// `regex_syntax::hir::Hir` that it wraps.
#[inline]
pub fn into_inner(self) -> regex_syntax::hir::Hir {
self.inner
}

/// Returns true if this HIR is either a simple literal or an alternation
/// of simple literals.
///
/// For example, `f`, `foo`, `a|b|c` and `foo|bar|baz` are alternation
/// literals. A capture group that directly wraps a literal or an
/// alternation of literals also counts, like for example `(f)`, `(foo)`,
/// `(a|b|c)`, and `(foo|bar|baz)`.
#[inline]
pub fn is_alternation_literal(&self) -> bool {
self.inner.properties().is_alternation_literal()
if self.inner.properties().is_alternation_literal() {
return true;
}
match self.inner.kind() {
// Look through a single, outermost capture group and test the
// expression it wraps.
HirKind::Capture(cap) => {
cap.sub.properties().is_alternation_literal()
}
_ => false,
}
}
}

Expand Down Expand Up @@ -275,7 +295,7 @@ mod tests {

use super::Hir;
use crate::re::hir::{
class_to_hex_byte, hex_byte_to_class, TrailingPattern,
class_to_hex_byte, hex_byte_to_class, ChainedPattern,
};

#[test]
Expand Down Expand Up @@ -389,11 +409,11 @@ mod tests {
(
Hir::literal([0x01, 0x02, 0x03]),
vec![
TrailingPattern {
ChainedPattern {
gap: 0..=u32::MAX,
hir: Hir::literal([0x05])
},
TrailingPattern {
ChainedPattern {
gap: 10..=11 + Hir::PATTERN_CHAINING_THRESHOLD,
hir: Hir::literal([0x06, 0x07])
}
Expand Down
5 changes: 1 addition & 4 deletions yara-x/src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -772,10 +772,7 @@ fn regexp_patterns_2() {
pattern_match!(r#"/foo|bar|baz/"#, b"foo", b"foo");
pattern_match!(r#"/foo|bar|baz/"#, b"bar", b"bar");
pattern_match!(r#"/foo|bar|baz/"#, b"baz", b"baz");

// TODO: this should be equivalent to /foo|bar|baz/, and should be
// interpreted as an alternation of literals.
//pattern_true!(r#"/(foo|bar|baz)/"#, b"foo");
pattern_true!(r#"/(foo|bar|baz)/"#, b"foo");

pattern_false!(r#"/foo|bar|baz/"#, b"FOO");
pattern_false!(r#"/foo|bar|baz/"#, b"BAR");
Expand Down

0 comments on commit 38c0990

Please # to comment.