Skip to content

Commit b0d3360

Browse files
authored
perf: optimize scans using header constraints (VirusTotal#676)
This change introduces a mechanism to detect and leverage "header constraints" derived from YARA rule conditions. These constraints, such as `uint32(0) == 0x464c457f` or `$a at 0`, specify required byte sequences or integer values at fixed offsets from the start of the file. During scanning, if the initial bytes of the data do not satisfy a pattern's header constraints, that pattern is disabled before any detailed matching attempts. This early pruning reduces redundant work and improves scan performance, particularly for files with well-defined magic bytes or headers that don't match specific rule conditions. The existing filesize bounds checks are also refactored to use this new pattern disabling mechanism for consistency and clearer logic.
1 parent ca7b8e7 commit b0d3360

7 files changed

Lines changed: 559 additions & 22 deletions

File tree

lib/src/compiler/ir/ast2ir.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ use crate::compiler::ir::{
3131
};
3232
use crate::compiler::report::{Level, ReportBuilder};
3333
use crate::compiler::{
34-
CompileContext, CompileError, FilesizeBounds, ForVars, PatternIdx,
35-
RegexId, RegexSetId, TextPatternAsHex, warnings,
34+
CompileContext, CompileError, FilesizeBounds, ForVars, HeaderConstraint,
35+
PatternIdx, RegexId, RegexSetId, TextPatternAsHex, warnings,
3636
};
3737
use crate::errors::CustomError;
3838
use crate::errors::{MethodNotAllowedInWith, PotentiallySlowLoop};
@@ -258,6 +258,7 @@ pub(in crate::compiler) fn text_pattern_from_ast<'src>(
258258
base64wide_alphabet,
259259
anchored_at: None,
260260
filesize_bounds: FilesizeBounds::default(),
261+
header_constraints: HeaderConstraint::default(),
261262
}),
262263
})
263264
}
@@ -312,6 +313,7 @@ pub(in crate::compiler) fn hex_pattern_from_ast<'src>(
312313
flags: PatternFlags::Ascii,
313314
anchored_at: None,
314315
filesize_bounds: FilesizeBounds::default(),
316+
header_constraints: HeaderConstraint::default(),
315317
}),
316318
})
317319
}
@@ -448,6 +450,7 @@ pub(in crate::compiler) fn regexp_pattern_from_ast<'src>(
448450
hir,
449451
anchored_at: None,
450452
filesize_bounds: FilesizeBounds::default(),
453+
header_constraints: HeaderConstraint::default(),
451454
}),
452455
})
453456
}

lib/src/compiler/ir/mod.rs

Lines changed: 267 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ allows using the same regex engine for matching both types of patterns.
2929
[Hir]: regex_syntax::hir::Hir
3030
*/
3131

32-
use std::collections::Bound;
32+
use std::collections::btree_map::Entry;
33+
use std::collections::{BTreeMap, Bound};
3334
use std::fmt::{Debug, Formatter};
3435
use std::hash::{Hash, Hasher};
3536
use std::mem;
@@ -51,7 +52,7 @@ use crate::compiler::ir::dfs::{
5152
DFSIter, DFSWithScopeIter, Event, EventContext, dfs_common,
5253
};
5354

54-
use crate::compiler::{FilesizeBounds, RegexSetId};
55+
use crate::compiler::{FilesizeBounds, HeaderConstraint, RegexSetId};
5556
use crate::re;
5657
use crate::symbols::Symbol;
5758
use crate::types::Value::Const;
@@ -310,6 +311,17 @@ impl Pattern {
310311
}
311312
}
312313
}
314+
315+
pub fn set_header_constraints(&mut self, constraints: &HeaderConstraint) {
316+
match self {
317+
Pattern::Text(literal) => {
318+
literal.header_constraints = constraints.clone();
319+
}
320+
Pattern::Regexp(regexp) | Pattern::Hex(regexp) => {
321+
regexp.header_constraints = constraints.clone();
322+
}
323+
}
324+
}
313325
}
314326

315327
#[derive(Clone, Eq, Hash, PartialEq)]
@@ -321,6 +333,7 @@ pub(crate) struct LiteralPattern {
321333
pub base64_alphabet: Option<String>,
322334
pub base64wide_alphabet: Option<String>,
323335
pub filesize_bounds: FilesizeBounds,
336+
pub header_constraints: HeaderConstraint,
324337
}
325338

326339
#[derive(Clone, Eq, Hash, PartialEq)]
@@ -329,6 +342,7 @@ pub(crate) struct RegexpPattern {
329342
pub hir: re::hir::Hir,
330343
pub anchored_at: Option<usize>,
331344
pub filesize_bounds: FilesizeBounds,
345+
pub header_constraints: HeaderConstraint,
332346
}
333347

334348
/// The index of a pattern in the rule that declares it.
@@ -992,6 +1006,251 @@ impl IR {
9921006

9931007
result
9941008
}
1009+
1010+
pub fn header_constraints(
1011+
&self,
1012+
pattern_prefix_lookup: impl Fn(PatternIdx) -> Option<Vec<u8>>,
1013+
) -> HeaderConstraint {
1014+
let mut constrained_bytes = BTreeMap::new();
1015+
let mut unsatisfiable = false;
1016+
let mut dfs = self.dfs_iter(self.root.unwrap());
1017+
1018+
while let Some(evt) = dfs.next() {
1019+
let expr = match evt {
1020+
Event::Enter((_, expr, _)) => expr,
1021+
_ => continue,
1022+
};
1023+
match expr {
1024+
Expr::Eq { lhs, rhs } => {
1025+
self.extract_header_constraints_from_eq(
1026+
*lhs,
1027+
*rhs,
1028+
&mut constrained_bytes,
1029+
&mut unsatisfiable,
1030+
);
1031+
}
1032+
Expr::PatternMatch { pattern, anchor } => {
1033+
if let MatchAnchor::At(offset_expr) = anchor
1034+
&& let Some(0) =
1035+
self.get(*offset_expr).try_as_const_integer()
1036+
&& let Some(prefix_bytes) =
1037+
pattern_prefix_lookup(*pattern)
1038+
{
1039+
for (i, &b) in prefix_bytes.iter().enumerate() {
1040+
match constrained_bytes.entry(i) {
1041+
Entry::Occupied(entry) => {
1042+
if *entry.get() != b {
1043+
unsatisfiable = true;
1044+
break;
1045+
}
1046+
}
1047+
Entry::Vacant(entry) => {
1048+
entry.insert(b);
1049+
}
1050+
}
1051+
}
1052+
}
1053+
}
1054+
_ => {}
1055+
}
1056+
if unsatisfiable {
1057+
break;
1058+
}
1059+
if !matches!(expr, Expr::And { .. }) {
1060+
dfs.prune();
1061+
}
1062+
}
1063+
1064+
if unsatisfiable {
1065+
return HeaderConstraint::Unsatisfiable;
1066+
}
1067+
1068+
// If the first byte in `constrained_bytes` is at offset 0, we can
1069+
// return HeaderConstraint::Constrained.
1070+
if let Some((0, _)) = constrained_bytes.first_key_value() {
1071+
HeaderConstraint::Constrained(
1072+
// Take only the bytes at consecutive offsets starting at 0.
1073+
constrained_bytes
1074+
.into_iter()
1075+
.enumerate()
1076+
.map_while(
1077+
|(i, (offset, byte))| {
1078+
if i == offset { Some(byte) } else { None }
1079+
},
1080+
)
1081+
.collect(),
1082+
)
1083+
} else {
1084+
HeaderConstraint::Unconstrained
1085+
}
1086+
}
1087+
1088+
fn extract_header_constraints_from_eq(
1089+
&self,
1090+
lhs: ExprId,
1091+
rhs: ExprId,
1092+
constrained_bytes: &mut BTreeMap<usize, u8>,
1093+
unsatisfiable: &mut bool,
1094+
) {
1095+
if let Some(val) = self.get(rhs).try_as_const_integer()
1096+
&& self.apply_int_read_constraint(
1097+
constrained_bytes,
1098+
unsatisfiable,
1099+
lhs,
1100+
val,
1101+
)
1102+
{
1103+
return;
1104+
}
1105+
if let Some(val) = self.get(lhs).try_as_const_integer() {
1106+
self.apply_int_read_constraint(
1107+
constrained_bytes,
1108+
unsatisfiable,
1109+
rhs,
1110+
val,
1111+
);
1112+
}
1113+
}
1114+
1115+
fn add_constraint(
1116+
&self,
1117+
constrained_bytes: &mut BTreeMap<usize, u8>,
1118+
unsatisfiable: &mut bool,
1119+
offset: usize,
1120+
value: u8,
1121+
) {
1122+
if *unsatisfiable {
1123+
return;
1124+
}
1125+
match constrained_bytes.entry(offset) {
1126+
Entry::Occupied(entry) => {
1127+
if *entry.get() != value {
1128+
*unsatisfiable = true;
1129+
}
1130+
}
1131+
Entry::Vacant(entry) => {
1132+
entry.insert(value);
1133+
}
1134+
}
1135+
}
1136+
1137+
fn apply_int_read_constraint(
1138+
&self,
1139+
constrained_bytes: &mut BTreeMap<usize, u8>,
1140+
unsatisfiable: &mut bool,
1141+
expr_id: ExprId,
1142+
val: i64,
1143+
) -> bool {
1144+
let func_call = match self.get(expr_id) {
1145+
Expr::FuncCall(func_call) => func_call,
1146+
_ => return false,
1147+
};
1148+
1149+
if let Some(offset) = func_call
1150+
.args
1151+
.first()
1152+
.and_then(|arg| self.get(*arg).try_as_const_integer())
1153+
&& offset >= 0
1154+
{
1155+
match func_call.plain_name() {
1156+
"uint8" | "int8" | "uint8be" | "int8be" => {
1157+
self.add_constraint(
1158+
constrained_bytes,
1159+
unsatisfiable,
1160+
offset as usize,
1161+
val as u8,
1162+
);
1163+
return true;
1164+
}
1165+
"uint16" | "int16" => {
1166+
self.add_constraint(
1167+
constrained_bytes,
1168+
unsatisfiable,
1169+
offset as usize,
1170+
(val as u16 & 0xff) as u8,
1171+
);
1172+
self.add_constraint(
1173+
constrained_bytes,
1174+
unsatisfiable,
1175+
offset as usize + 1,
1176+
((val as u16 >> 8) & 0xff) as u8,
1177+
);
1178+
return true;
1179+
}
1180+
"uint16be" | "int16be" => {
1181+
self.add_constraint(
1182+
constrained_bytes,
1183+
unsatisfiable,
1184+
offset as usize,
1185+
((val as u16 >> 8) & 0xff) as u8,
1186+
);
1187+
self.add_constraint(
1188+
constrained_bytes,
1189+
unsatisfiable,
1190+
offset as usize + 1,
1191+
(val as u16 & 0xff) as u8,
1192+
);
1193+
return true;
1194+
}
1195+
"uint32" | "int32" => {
1196+
self.add_constraint(
1197+
constrained_bytes,
1198+
unsatisfiable,
1199+
offset as usize,
1200+
(val as u32 & 0xff) as u8,
1201+
);
1202+
self.add_constraint(
1203+
constrained_bytes,
1204+
unsatisfiable,
1205+
offset as usize + 1,
1206+
((val as u32 >> 8) & 0xff) as u8,
1207+
);
1208+
self.add_constraint(
1209+
constrained_bytes,
1210+
unsatisfiable,
1211+
offset as usize + 2,
1212+
((val as u32 >> 16) & 0xff) as u8,
1213+
);
1214+
self.add_constraint(
1215+
constrained_bytes,
1216+
unsatisfiable,
1217+
offset as usize + 3,
1218+
((val as u32 >> 24) & 0xff) as u8,
1219+
);
1220+
return true;
1221+
}
1222+
"uint32be" | "int32be" => {
1223+
self.add_constraint(
1224+
constrained_bytes,
1225+
unsatisfiable,
1226+
offset as usize,
1227+
((val as u32 >> 24) & 0xff) as u8,
1228+
);
1229+
self.add_constraint(
1230+
constrained_bytes,
1231+
unsatisfiable,
1232+
offset as usize + 1,
1233+
((val as u32 >> 16) & 0xff) as u8,
1234+
);
1235+
self.add_constraint(
1236+
constrained_bytes,
1237+
unsatisfiable,
1238+
offset as usize + 2,
1239+
((val as u32 >> 8) & 0xff) as u8,
1240+
);
1241+
self.add_constraint(
1242+
constrained_bytes,
1243+
unsatisfiable,
1244+
offset as usize + 3,
1245+
(val as u32 & 0xff) as u8,
1246+
);
1247+
return true;
1248+
}
1249+
_ => {}
1250+
}
1251+
}
1252+
false
1253+
}
9951254
}
9961255

9971256
impl IR {
@@ -2367,6 +2626,12 @@ impl FuncCall {
23672626
pub fn mangled_name(&self) -> &str {
23682627
self.signature().mangled_name.as_str()
23692628
}
2629+
2630+
/// Returns the plain function name, without argument or return type
2631+
/// information (i.e: everything before the `@` in the name).
2632+
pub fn plain_name(&self) -> &str {
2633+
self.signature().mangled_name.plain_name()
2634+
}
23702635
}
23712636

23722637
/// An `of` expression with a tuple of expressions (e.g. `1 of (true, false)`).

0 commit comments

Comments
 (0)