From 8c0a57b1da02397145e9855916c9ca7744c6fb3a Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 10 Nov 2023 11:16:56 +0100 Subject: [PATCH] feat: implement new API that allows invoking YARA modules directly (#52) With this new API you can use a YARA module as a file-parsing tool, and obtain the results produced by the module without any YARA rule involved in the process. This is useful for external tools that want to leverage YARA's file parsing capabilities for their own purposes. This required changing the arguments passed to the main function of each module, from &ScanContext to &[u8]. --- docs/Module Developer's Guide.md | 19 ++++--- yara-x-macros/src/module_main.rs | 4 +- yara-x/build.rs | 2 +- yara-x/src/lib.rs | 3 +- yara-x/src/modules/elf/mod.rs | 6 +-- yara-x/src/modules/hash/mod.rs | 2 +- yara-x/src/modules/lnk/mod.rs | 4 +- yara-x/src/modules/macho/mod.rs | 5 +- yara-x/src/modules/mod.rs | 76 +++++++++++++++++++++++++-- yara-x/src/modules/modules.rs | 18 +++---- yara-x/src/modules/string.rs | 2 +- yara-x/src/modules/test_proto2/mod.rs | 4 +- yara-x/src/modules/test_proto3/mod.rs | 2 +- yara-x/src/modules/tests.rs | 7 +++ yara-x/src/modules/text.rs | 5 +- yara-x/src/modules/time.rs | 2 +- yara-x/src/scanner/mod.rs | 2 +- 17 files changed, 116 insertions(+), 47 deletions(-) diff --git a/docs/Module Developer's Guide.md b/docs/Module Developer's Guide.md index 70f73cd5e..33af9ae42 100644 --- a/docs/Module Developer's Guide.md +++ b/docs/Module Developer's Guide.md @@ -244,9 +244,8 @@ use crate::modules::prelude::*; use crate::modules::protos::text::*; #[module_main] -fn main(ctx: &ScanContext) -> Text { +fn main(data: &[u8]) -> Text { let mut text_proto = Text::new(); - let data = ctx.scanned_data(); // TODO: parse the data and populate text_proto. @@ -284,18 +283,18 @@ Next comes the module's main function: ```rust #[module_main] -fn main(ctx: &ScanContext) -> Text { +fn main(data: &[u8]) -> Text { ... 
} ``` The module's main function is called for every file scanned by YARA. This -function receives a reference to a `ScanContext` structure that gives you access -to the scanned data. It must return the `Text` structure that was generated from -the `text.proto` file. The main function must have the `#[module_main]` attribute. -Notice that the module's main function doesn't need to be called `main`, it can -have any arbitrary name, as long as it has the `#[module_main]` attribute. Of -course, this attribute can't be used with more than one function per module. +function receives a byte slice with the content of the file being scanned. It +must return the `Text` structure that was generated from the `text.proto` file. +The main function must have the `#[module_main]` attribute. Notice that the +module's main function doesn't need to be called `main`, it can have any +arbitrary name, as long as it has the `#[module_main]` attribute. Of course, +this attribute can't be used with more than one function per module. The main function usually consists in creating an instance of the protobuf you previously defined, and populating the protobuf with information extracted from @@ -310,7 +309,7 @@ use std::io; use std::io::BufRead; #[module_main] -fn main(ctx: &ScanContext) -> Text { +fn main(data: &[u8]) -> Text { // Create an empty instance of the Text protobuf. let mut text_proto = Text::new(); diff --git a/yara-x-macros/src/module_main.rs b/yara-x-macros/src/module_main.rs index 7a469cf95..485c50fd1 100644 --- a/yara-x-macros/src/module_main.rs +++ b/yara-x-macros/src/module_main.rs @@ -11,8 +11,8 @@ pub(crate) fn impl_module_main_macro( let main_stub = quote! 
{ use protobuf::MessageDyn; - pub(crate) fn __main__(ctx: &ScanContext) -> Box { - Box::new(#fn_name(ctx)) + pub(crate) fn __main__(data: &[u8]) -> Box { + Box::new(#fn_name(data)) } }; diff --git a/yara-x/build.rs b/yara-x/build.rs index 6cc0d4cf0..2f603c418 100644 --- a/yara-x/build.rs +++ b/yara-x/build.rs @@ -105,7 +105,7 @@ fn main() { modules_rs, r#" #[cfg(feature = "{name}-module")] -pub mod {rust_mod};"#, +mod {rust_mod};"#, ) .unwrap(); } diff --git a/yara-x/src/lib.rs b/yara-x/src/lib.rs index 027cc5cbe..021ad2377 100644 --- a/yara-x/src/lib.rs +++ b/yara-x/src/lib.rs @@ -45,7 +45,6 @@ pub use compiler::compile; pub use compiler::CompileError; pub use compiler::CompileErrorInfo; pub use compiler::Compiler; -pub use compiler::EmitWasmError; pub use compiler::Error; pub use compiler::Rules; pub use compiler::SerializationError; @@ -62,6 +61,8 @@ pub use scanner::ScanError; pub use scanner::ScanResults; pub use scanner::Scanner; +pub use modules::mods; + pub use variables::Variable; pub use variables::VariableError; diff --git a/yara-x/src/modules/elf/mod.rs b/yara-x/src/modules/elf/mod.rs index f77014b12..ec1ffc1de 100644 --- a/yara-x/src/modules/elf/mod.rs +++ b/yara-x/src/modules/elf/mod.rs @@ -6,9 +6,7 @@ and sections information, exported symbols, target platform, etc. 
use itertools::Itertools; use lazy_static::lazy_static; -use md5; use rustc_hash::FxHashSet; -use tlsh; use crate::modules::prelude::*; use crate::modules::protos::elf::*; @@ -19,8 +17,8 @@ pub mod parser; mod tests; #[module_main] -fn main(ctx: &ScanContext) -> ELF { - match parser::ElfParser::new().parse(ctx.scanned_data()) { +fn main(data: &[u8]) -> ELF { + match parser::ElfParser::new().parse(data) { Ok(elf) => elf, Err(_) => ELF::new(), } diff --git a/yara-x/src/modules/hash/mod.rs b/yara-x/src/modules/hash/mod.rs index b1e66b4e3..4d0f7dd37 100644 --- a/yara-x/src/modules/hash/mod.rs +++ b/yara-x/src/modules/hash/mod.rs @@ -23,7 +23,7 @@ thread_local!( ); #[module_main] -fn main(_ctx: &ScanContext) -> Hash { +fn main(_data: &[u8]) -> Hash { // With every scanned file the cache must be cleared. SHA256_CACHE.with(|cache| cache.borrow_mut().clear()); SHA1_CACHE.with(|cache| cache.borrow_mut().clear()); diff --git a/yara-x/src/modules/lnk/mod.rs b/yara-x/src/modules/lnk/mod.rs index 7fd2d801f..a5f8e6684 100644 --- a/yara-x/src/modules/lnk/mod.rs +++ b/yara-x/src/modules/lnk/mod.rs @@ -17,8 +17,8 @@ use crate::modules::protos::lnk::*; pub mod parser; #[module_main] -fn main(ctx: &ScanContext) -> Lnk { - match parser::LnkParser::new().parse(ctx.scanned_data()) { +fn main(data: &[u8]) -> Lnk { + match parser::LnkParser::new().parse(data) { Ok(lnk) => lnk, Err(_) => { let mut lnk = Lnk::new(); diff --git a/yara-x/src/modules/macho/mod.rs b/yara-x/src/modules/macho/mod.rs index a55968621..126c5ccde 100644 --- a/yara-x/src/modules/macho/mod.rs +++ b/yara-x/src/modules/macho/mod.rs @@ -2574,13 +2574,10 @@ fn ep_for_arch_subtype( /// code isn’t interrupted by issues with individual files during bulk /// processing. #[module_main] -fn main(ctx: &ScanContext) -> Macho { +fn main(data: &[u8]) -> Macho { // Create an empty instance of the Mach-O protobuf let mut macho_proto = Macho::new(); - // Get a &[u8] slice with the content of the file being scanned. 
- let data = ctx.scanned_data(); - // If data is too short to be valid Mach-O file, return empty protobuf if data.len() < VALID_MACHO_LENGTH { #[cfg(feature = "logging")] diff --git a/yara-x/src/modules/mod.rs b/yara-x/src/modules/mod.rs index 1eb0d4fc7..ac42fc6c9 100644 --- a/yara-x/src/modules/mod.rs +++ b/yara-x/src/modules/mod.rs @@ -3,8 +3,6 @@ use protobuf::reflect::MessageDescriptor; use protobuf::MessageDyn; use rustc_hash::FxHashMap; -use crate::scanner::ScanContext; - pub mod protos { include!(concat!(env!("OUT_DIR"), "/protos/mod.rs")); } @@ -26,7 +24,7 @@ pub(crate) mod prelude { include!("modules.rs"); /// Type of module's main function. -type MainFn = fn(&ScanContext) -> Box; +type MainFn = fn(&[u8]) -> Box; /// Describes a YARA module. pub(crate) struct Module { @@ -120,3 +118,75 @@ lazy_static! { modules }; } + +pub mod mods { + /*! Utility functions and structures for invoking YARA modules directly. + + The utility functions [`invoke_mod`] and [`invoke_mod_dyn`] allow leveraging + YARA modules for parsing some file formats independently of any YARA rule. + With these functions you can pass arbitrary data to a YARA module and obtain + the same data structure that is accessible to YARA rules and which you use + in your rule conditions. + + This allows external projects to benefit from YARA's file-parsing + capabilities for their own purposes. + */ + + /// Data structure returned by the `elf` module. + pub use super::protos::elf::ELF; + /// Data structure returned by the `lnk` module. + pub use super::protos::lnk::Lnk; + /// Data structure returned by the `macho` module. + pub use super::protos::macho::Macho; + + /// Invoke a YARA module with arbitrary data. + /// + ///
+ /// + /// YARA modules typically parse specific file formats, returning structures + /// that contain information about the file. These structures are used in YARA + /// rules for expressing powerful and rich conditions. However, being able to + /// access this information outside of YARA rules can also be beneficial. + /// + ///
+ /// + /// This function allows the direct invocation of a YARA module for parsing + /// arbitrary data. It returns the structure produced by the module, which + /// depends upon the invoked module. The result will be [`None`] if the + /// module does not exist, or if it doesn't produce any information for + /// the input data. + /// + /// `T` must be one of the structure types returned by a YARA module, which + /// are defined in [`crate::mods`]. + /// + /// # Example + /// ```rust + /// # use yara_x; + /// # let data = &[]; + /// let elf_info = yara_x::mods::invoke_mod::<yara_x::mods::ELF>(data); + /// ``` + pub fn invoke_mod<T: protobuf::MessageFull>( + data: &[u8], + ) -> Option<Box<T>> { + let module_output = invoke_mod_dyn::<T>(data)?; + Some(<dyn protobuf::MessageDyn>::downcast_box(module_output).unwrap()) + } + + /// Invoke a YARA module with arbitrary data, but return a dynamic + /// structure. + /// + /// This function is similar to [`invoke_mod`] but its result is a dynamic- + /// dispatch version of the structure returned by the YARA module. + pub fn invoke_mod_dyn<T: protobuf::MessageFull>( + data: &[u8], + ) -> Option<Box<dyn protobuf::MessageDyn>> { + let descriptor = T::descriptor(); + let proto_name = descriptor.full_name(); + let (_, module) = + super::BUILTIN_MODULES.iter().find(|(_, module)| { + module.root_struct_descriptor.full_name() == proto_name + })?; + + Some(module.main_fn?(data)) + } +} diff --git a/yara-x/src/modules/modules.rs b/yara-x/src/modules/modules.rs index 0cc54cad5..b3ccd9077 100644 --- a/yara-x/src/modules/modules.rs +++ b/yara-x/src/modules/modules.rs @@ -1,19 +1,19 @@ // File generated automatically by build.rs. Do not edit. 
#[cfg(feature = "string-module")] -pub mod string; +mod string; #[cfg(feature = "macho-module")] -pub mod macho; +mod macho; #[cfg(feature = "elf-module")] -pub mod elf; +mod elf; #[cfg(feature = "text-module")] -pub mod text; +mod text; #[cfg(feature = "hash-module")] -pub mod hash; +mod hash; #[cfg(feature = "test_proto2-module")] -pub mod test_proto2; +mod test_proto2; #[cfg(feature = "lnk-module")] -pub mod lnk; +mod lnk; #[cfg(feature = "time-module")] -pub mod time; +mod time; #[cfg(feature = "test_proto3-module")] -pub mod test_proto3; \ No newline at end of file +mod test_proto3; \ No newline at end of file diff --git a/yara-x/src/modules/string.rs b/yara-x/src/modules/string.rs index 17b055b25..b4ac1b16d 100644 --- a/yara-x/src/modules/string.rs +++ b/yara-x/src/modules/string.rs @@ -2,7 +2,7 @@ use crate::modules::prelude::*; use crate::modules::protos::string::*; #[module_main] -fn main(_ctx: &ScanContext) -> String { +fn main(_data: &[u8]) -> String { // Nothing to do, but we have to return our protobuf String::new() } diff --git a/yara-x/src/modules/test_proto2/mod.rs b/yara-x/src/modules/test_proto2/mod.rs index 68dfd9a02..e41ef1b83 100644 --- a/yara-x/src/modules/test_proto2/mod.rs +++ b/yara-x/src/modules/test_proto2/mod.rs @@ -53,7 +53,7 @@ fn to_int(ctx: &ScanContext, string: RuntimeString) -> Option { } #[module_main] -fn main(ctx: &ScanContext) -> TestProto2 { +fn main(data: &[u8]) -> TestProto2 { let mut test = TestProto2::new(); test.set_int32_zero(0); @@ -131,7 +131,7 @@ fn main(ctx: &ScanContext) -> TestProto2 { test.set_bool_proto(true); - test.set_file_size(ctx.scanned_data().len() as u64); + test.set_file_size(data.len() as u64); test } diff --git a/yara-x/src/modules/test_proto3/mod.rs b/yara-x/src/modules/test_proto3/mod.rs index b4204a99b..6a63ab8db 100644 --- a/yara-x/src/modules/test_proto3/mod.rs +++ b/yara-x/src/modules/test_proto3/mod.rs @@ -2,7 +2,7 @@ use crate::modules::prelude::*; use 
crate::modules::protos::test_proto3::TestProto3; #[module_main] -fn main(_ctx: &ScanContext) -> TestProto3 { +fn main(_data: &[u8]) -> TestProto3 { let mut test = TestProto3::new(); test.int32_zero = 0; diff --git a/yara-x/src/modules/tests.rs b/yara-x/src/modules/tests.rs index dc8efb8b5..ec520c0b0 100644 --- a/yara-x/src/modules/tests.rs +++ b/yara-x/src/modules/tests.rs @@ -2,6 +2,13 @@ use std::fs; use std::io::Write; use std::path::Path; +/// Utility function that reads a file in [`Intel HEX`][1] (ihex) format and +/// returns the binary data contained in it. +/// +/// All test files in this repository are stored in ihex format in order to +/// avoid storing executable files (some of them malware) in binary form. +/// +/// [1]: https://en.wikipedia.org/wiki/Intel_HEX pub fn create_binary_from_ihex>( path: P, ) -> anyhow::Result> { diff --git a/yara-x/src/modules/text.rs b/yara-x/src/modules/text.rs index 79189a29c..3bed1252e 100644 --- a/yara-x/src/modules/text.rs +++ b/yara-x/src/modules/text.rs @@ -18,13 +18,10 @@ use lingua::{Language, LanguageDetectorBuilder}; /// This function must return an instance of the protobuf message indicated /// in the `root_message` option in `text.proto`. #[module_main] -fn main(ctx: &ScanContext) -> Text { +fn main(data: &[u8]) -> Text { // Create an empty instance of the Text protobuf. let mut text_proto = Text::new(); - // Get a &[u8] slice with the content of the file being scanned. 
- let data = ctx.scanned_data(); - let mut num_lines = 0; let mut num_words = 0; diff --git a/yara-x/src/modules/time.rs b/yara-x/src/modules/time.rs index e52330c20..84f1a5717 100644 --- a/yara-x/src/modules/time.rs +++ b/yara-x/src/modules/time.rs @@ -3,7 +3,7 @@ use crate::modules::protos::time::*; use std::time::{SystemTime, UNIX_EPOCH}; #[module_main] -fn main(_ctx: &ScanContext) -> Time { +fn main(_data: &[u8]) -> Time { // Nothing to do, but we have to return our protobuf Time::new() } diff --git a/yara-x/src/scanner/mod.rs b/yara-x/src/scanner/mod.rs index 484d2f814..69f42fa09 100644 --- a/yara-x/src/scanner/mod.rs +++ b/yara-x/src/scanner/mod.rs @@ -448,7 +448,7 @@ impl<'r> Scanner<'r> { // the data is specified by the .proto file associated to the // module. let module_output = if let Some(main_fn) = module.main_fn { - main_fn(ctx) + main_fn(data.as_ref()) } else { // Implement the case in which the module doesn't have a main // function and the serialized data should be provided by the