Skip to content

Commit f8b8dc5

Browse files
committed
perf(scanner): lazily materialize imported modules
1 parent c762e4a commit f8b8dc5

5 files changed

Lines changed: 325 additions & 105 deletions

File tree

lib/src/modules/mod.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,30 @@ pub(crate) static BUILTIN_MODULES: LazyLock<FxHashMap<&'static str, Module>> =
148148
modules
149149
});
150150

151+
pub(crate) fn module_name_from_root_struct(
152+
root_struct_name: &str,
153+
) -> Option<&'static str> {
154+
BUILTIN_MODULES.iter().find_map(|(module_name, module)| {
155+
(module.root_struct_descriptor.full_name() == root_struct_name)
156+
.then_some(*module_name)
157+
})
158+
}
159+
160+
pub(crate) fn module_name_from_rust_module_path(
161+
rust_module_path: &str,
162+
) -> Option<&'static str> {
163+
let module_path = rust_module_path.strip_prefix("yara_x::modules::")?;
164+
BUILTIN_MODULES.iter().find_map(|(module_name, module)| {
165+
module.rust_module_name.and_then(|name| {
166+
(module_path == name
167+
|| module_path
168+
.strip_prefix(name)
169+
.is_some_and(|suffix| suffix.starts_with("::")))
170+
.then_some(*module_name)
171+
})
172+
})
173+
}
174+
151175
pub mod mods {
152176
/*! Utility functions and structures that allow invoking YARA modules directly.
153177

lib/src/scanner/context.rs

Lines changed: 117 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ use crate::compiler::{
2727
SubPatternAtom, SubPatternFlags, SubPatternId,
2828
};
2929
use crate::errors::VariableError;
30+
use crate::modules::{self, BUILTIN_MODULES};
3031
use crate::re::Action;
3132
use crate::re::fast::FastVM;
3233
use crate::re::hir::ChainedPatternGap;
@@ -120,6 +121,14 @@ pub(crate) struct ScanContext<'r, 'd> {
120121
/// operation. Keys are the fully qualified protobuf message names, and
121122
/// values are the protobuf messages set with [`Scanner::set_module_output`].
122123
pub user_provided_module_outputs: FxHashMap<String, Box<dyn MessageDyn>>,
124+
/// Metadata passed to module main functions for the current scan.
125+
pub module_metadata: FxHashMap<String, Vec<u8>>,
126+
/// Modules that have been materialized during the current scan.
127+
pub materialized_modules: FxHashSet<String>,
128+
/// Whether imported modules should be materialized on first use.
129+
pub lazy_modules: bool,
130+
/// Error captured while materializing a module from a field lookup.
131+
pub module_materialization_error: Option<ScanError>,
123132
/// Hash map that tracks the matches occurred during a scan. The keys
124133
/// are the PatternId of the matching pattern, and values are a list
125134
/// of matches.
@@ -290,6 +299,98 @@ impl ScanContext<'_, '_> {
290299
/// use crate::modules::protos::my_module::MyModuleProto;
291300
/// let module_data: MyModuleProto = ctx.module_data::<MyModuleProto>()
292301
/// ```
302+
pub(crate) fn materialize_module_for_root_field(
303+
&mut self,
304+
field_index: usize,
305+
) {
306+
if self.module_materialization_error.is_some() {
307+
return;
308+
}
309+
310+
let module_name =
311+
self.root_struct.field_by_index(field_index).and_then(|field| {
312+
if let TypeValue::Struct(structure) = &field.type_value {
313+
structure
314+
.protobuf_type_name()
315+
.and_then(modules::module_name_from_root_struct)
316+
} else {
317+
None
318+
}
319+
});
320+
321+
if let Some(module_name) = module_name
322+
&& let Err(err) = self.materialize_module(module_name)
323+
{
324+
self.module_materialization_error = Some(err);
325+
}
326+
}
327+
328+
pub(crate) fn materialize_module(
329+
&mut self,
330+
module_name: &str,
331+
) -> Result<(), ScanError> {
332+
if !self.materialized_modules.insert(module_name.to_string()) {
333+
return Ok(());
334+
}
335+
336+
let module = BUILTIN_MODULES
337+
.get(module_name)
338+
.unwrap_or_else(|| panic!("module `{module_name}` not found"));
339+
340+
let root_struct_name = module.root_struct_descriptor.full_name();
341+
let root_struct_name = root_struct_name.to_string();
342+
343+
let module_output = if let Some(output) =
344+
self.user_provided_module_outputs.remove(root_struct_name.as_str())
345+
{
346+
Some(output)
347+
} else if let Some(main_fn) = module.main_fn
348+
&& let Some(data) = self.scanned_data()
349+
{
350+
let meta =
351+
self.module_metadata.get(module_name).map(Vec::as_slice);
352+
Some(main_fn(data, meta).map_err(|err| {
353+
ScanError::ModuleError { module: module_name.to_string(), err }
354+
})?)
355+
} else {
356+
None
357+
};
358+
359+
if let Some(module_output) = &module_output {
360+
debug_assert_eq!(
361+
module_output.descriptor_dyn().full_name(),
362+
module.root_struct_descriptor.full_name(),
363+
"main function of module `{}` must return `{}`, but returned `{}`",
364+
module_name,
365+
module.root_struct_descriptor.full_name(),
366+
module_output.descriptor_dyn().full_name(),
367+
);
368+
369+
debug_assert!(
370+
module_output.is_initialized_dyn(),
371+
"module `{}` returned a protobuf `{}` where some required fields are not initialized ",
372+
module_name,
373+
module.root_struct_descriptor.full_name()
374+
);
375+
}
376+
377+
let generate_fields_for_enums = !cfg!(feature = "constant-folding");
378+
let module_struct = Struct::from_proto_descriptor_and_msg(
379+
&module.root_struct_descriptor,
380+
module_output.as_deref(),
381+
generate_fields_for_enums,
382+
);
383+
384+
if let Some(module_output) = module_output {
385+
self.module_outputs.insert(root_struct_name, module_output);
386+
}
387+
388+
self.root_struct
389+
.add_field(module_name, TypeValue::Struct(module_struct));
390+
391+
Ok(())
392+
}
393+
293394
pub(crate) fn module_output<T: MessageFull>(&self) -> Option<&T> {
294395
let m = self.module_outputs.get(T::descriptor().full_name())?.as_ref();
295396
<dyn MessageDyn>::downcast_ref(m)
@@ -472,10 +573,15 @@ impl ScanContext<'_, '_> {
472573
// is running. In that case, the function returns `Ok(0)` but the
473574
// scan state is updated to `ScanState::Timeout`.
474575
match eval_result {
475-
Ok(0) => match self.scan_state {
476-
ScanState::Timeout => Err(ScanError::Timeout),
477-
_ => Ok(()),
478-
},
576+
Ok(0) => {
577+
if let Some(err) = self.module_materialization_error.take() {
578+
return Err(err);
579+
}
580+
match self.scan_state {
581+
ScanState::Timeout => Err(ScanError::Timeout),
582+
_ => Ok(()),
583+
}
584+
}
479585
Ok(v) => panic!("WASM main returned: {v}"),
480586
Err(err) if err.is::<ScanError>() => {
481587
Err(err.downcast::<ScanError>().unwrap())
@@ -514,6 +620,9 @@ impl ScanContext<'_, '_> {
514620

515621
// Clear module outputs from previous scans.
516622
self.module_outputs.clear();
623+
self.module_metadata.clear();
624+
self.materialized_modules.clear();
625+
self.module_materialization_error = None;
517626

518627
// Move the matching rules to the `matching_rules` vector, leaving the
519628
// `matching_rules_per_ns` map empty.
@@ -1804,6 +1913,10 @@ pub fn create_wasm_store_and_ctx<'r>(
18041913
wasm_pattern_search_done: None,
18051914
module_outputs: FxHashMap::default(),
18061915
user_provided_module_outputs: FxHashMap::default(),
1916+
module_metadata: FxHashMap::default(),
1917+
materialized_modules: FxHashSet::default(),
1918+
lazy_modules: false,
1919+
module_materialization_error: None,
18071920
pattern_matches: PatternMatches::new(),
18081921
unconfirmed_matches: FxHashMap::default(),
18091922
deadline: 0,

lib/src/scanner/mod.rs

Lines changed: 37 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,14 @@ use memmap2::{Mmap, MmapOptions};
2020
use protobuf::{CodedInputStream, MessageDyn};
2121
use thiserror::Error;
2222

23+
use crate::Variable;
2324
use crate::compiler::{RuleId, Rules};
2425
use crate::models::Rule;
2526
use crate::modules::{BUILTIN_MODULES, Module, ModuleError};
2627
use crate::scanner::context::create_wasm_store_and_ctx;
27-
use crate::types::{Struct, TypeValue};
2828
use crate::variables::VariableError;
2929
use crate::wasm::MATCHING_RULES_BITMAP_BASE;
3030
use crate::wasm::runtime::Store;
31-
use crate::{Variable, modules};
3231

3332
pub(crate) use crate::scanner::context::RuntimeObject;
3433
pub(crate) use crate::scanner::context::RuntimeObjectHandle;
@@ -152,6 +151,7 @@ pub struct ProfilingData<'r> {
152151
#[derive(Debug, Default)]
153152
pub struct ScanOptions<'a> {
154153
module_metadata: HashMap<&'a str, &'a [u8]>,
154+
lazy_modules: bool,
155155
}
156156

157157
impl<'a> ScanOptions<'a> {
@@ -160,7 +160,7 @@ impl<'a> ScanOptions<'a> {
160160
///
161161
/// Use other methods to add additional information.
162162
pub fn new() -> Self {
163-
Self { module_metadata: Default::default() }
163+
Self { module_metadata: Default::default(), lazy_modules: false }
164164
}
165165

166166
/// Adds metadata for a YARA module.
@@ -172,6 +172,18 @@ impl<'a> ScanOptions<'a> {
172172
self.module_metadata.insert(module_name, metadata);
173173
self
174174
}
175+
176+
/// Enables or disables lazy module execution for this scan.
177+
///
178+
/// When enabled, imported modules are executed only if rule condition
179+
/// evaluation actually accesses one of their fields or functions. This can
180+
/// avoid expensive parsers for rules that short-circuit on strings, but
181+
/// modules skipped this way won't appear in [`ScanResults::module_output`]
182+
/// or [`ScanResults::module_outputs`].
183+
pub fn lazy_modules(mut self, yes: bool) -> Self {
184+
self.lazy_modules = yes;
185+
self
186+
}
175187
}
176188

177189
/// Scans data with already compiled YARA rules.
@@ -502,111 +514,33 @@ impl<'r> Scanner<'r> {
502514
// Indicate that the scanner is currently scanning the given data.
503515
ctx.scan_state = ScanState::ScanningData(data);
504516

505-
for module_name in ctx.compiled_rules.imports() {
506-
// Lookup the module in the list of built-in modules.
507-
let module = modules::BUILTIN_MODULES
508-
.get(module_name)
509-
.unwrap_or_else(|| panic!("module `{module_name}` not found"));
510-
511-
let root_struct_name = module.root_struct_descriptor.full_name();
512-
513-
let module_output;
514-
// If the user already provided some output for the module by
515-
// calling `Scanner::set_module_output`, use that output. If not,
516-
// call the module's main function (if the module has a main
517-
// function) for getting its output.
518-
if let Some(output) =
519-
ctx.user_provided_module_outputs.remove(root_struct_name)
520-
{
521-
module_output = Some(output);
522-
} else {
523-
let meta: Option<&'opts [u8]> =
524-
options.as_ref().and_then(|options| {
525-
options.module_metadata.get(module_name).copied()
526-
});
527-
528-
if let Some(main_fn) = module.main_fn {
529-
module_output = Some(
530-
main_fn(ctx.scanned_data().unwrap(), meta).map_err(
531-
|err| ScanError::ModuleError {
532-
module: module_name.to_string(),
533-
err,
534-
},
535-
)?,
536-
);
537-
} else {
538-
module_output = None;
539-
}
540-
}
517+
ctx.lazy_modules =
518+
options.as_ref().is_some_and(|options| options.lazy_modules);
541519

542-
if let Some(module_output) = &module_output {
543-
// Make sure that the module is returning a protobuf message of
544-
// the expected type.
545-
debug_assert_eq!(
546-
module_output.descriptor_dyn().full_name(),
547-
module.root_struct_descriptor.full_name(),
548-
"main function of module `{}` must return `{}`, but returned `{}`",
549-
module_name,
550-
module.root_struct_descriptor.full_name(),
551-
module_output.descriptor_dyn().full_name(),
552-
);
553-
554-
// Make sure that the module is returning a protobuf message
555-
// where all required fields are initialized. This only applies
556-
// to proto2, proto3 doesn't have "required" fields, all fields
557-
// are optional.
558-
debug_assert!(
559-
module_output.is_initialized_dyn(),
560-
"module `{}` returned a protobuf `{}` where some required fields are not initialized ",
561-
module_name,
562-
module.root_struct_descriptor.full_name()
563-
);
520+
if let Some(options) = options.as_ref() {
521+
for (module_name, metadata) in &options.module_metadata {
522+
ctx.module_metadata
523+
.insert((*module_name).to_string(), (*metadata).to_vec());
564524
}
525+
}
565526

566-
// When constant folding is enabled we don't need to generate
567-
// structure fields for enums. This is because during the
568-
// optimization process symbols like MyEnum.ENUM_ITEM are resolved
569-
// to their constant values at compile time. In other words, the
570-
// compiler determines that MyEnum.ENUM_ITEM is equal to some value
571-
// X, and uses that value in the generated code.
572-
//
573-
// However, without constant folding, enums are treated as any
574-
// other field in a struct, and their values are determined at scan
575-
// time. For that reason these fields must be generated for enums
576-
// when constant folding is disabled.
577-
let generate_fields_for_enums =
578-
!cfg!(feature = "constant-folding");
579-
580-
let module_struct = Struct::from_proto_descriptor_and_msg(
581-
&module.root_struct_descriptor,
582-
module_output.as_deref(),
583-
generate_fields_for_enums,
584-
);
585-
586-
if let Some(module_output) = module_output {
587-
ctx.module_outputs
588-
.insert(root_struct_name.to_string(), module_output);
527+
if !ctx.lazy_modules {
528+
let imported_modules: Vec<String> =
529+
ctx.compiled_rules.imports().map(str::to_owned).collect();
530+
for module_name in imported_modules {
531+
ctx.materialize_module(module_name.as_str())?;
589532
}
590-
591-
// The data structure obtained from the module is added to the
592-
// root structure. Any data from previous scans will be replaced
593-
// with the new data structure.
594-
ctx.root_struct
595-
.add_field(module_name, TypeValue::Struct(module_struct));
596533
}
597534

598-
// The user provided module outputs are not needed anymore. Let's
599-
// clear any remaining entry in the hash map (which can happen if
600-
// the user has set outputs for modules that are not even imported
601-
// by the rules.
602-
ctx.user_provided_module_outputs.clear();
603-
604535
// Clear the flag that indicates that the search phase was done.
605536
ctx.set_pattern_search_done(false);
606537

607538
// Evaluate the conditions of every rule, this will call
608539
// `ScanContext::search_for_patterns` if necessary.
609-
ctx.eval_conditions()?;
540+
let eval_result = ctx.eval_conditions();
541+
ctx.module_metadata.clear();
542+
ctx.user_provided_module_outputs.clear();
543+
eval_result?;
610544

611545
let data = match ctx.scan_state.take() {
612546
ScanState::ScanningData(data) => data,
@@ -698,7 +632,9 @@ impl<'a, 'r> ScanResults<'a, 'r> {
698632
/// data.
699633
///
700634
/// The result will be `None` if the module doesn't exist or didn't
701-
/// produce any output.
635+
/// produce any output. When [`ScanOptions::lazy_modules`] is enabled,
636+
/// imported modules that were never accessed during condition evaluation
637+
/// are omitted as well.
702638
pub fn module_output(
703639
&self,
704640
module_name: &str,
@@ -715,7 +651,9 @@ impl<'a, 'r> ScanResults<'a, 'r> {
715651
/// Returns an iterator that yields tuples composed of a YARA module name
716652
/// and the protobuf produced by that module.
717653
///
718-
/// Only returns the modules that produced some output.
654+
/// Only returns the modules that produced some output. When
655+
/// [`ScanOptions::lazy_modules`] is enabled, imported modules that were
656+
/// never accessed during condition evaluation are not included.
719657
pub fn module_outputs(&self) -> ModuleOutputs<'a, 'r> {
720658
ModuleOutputs::new(self.ctx)
721659
}

0 commit comments

Comments
 (0)