perf: optimize PikeVM by matching multiple literal bytes in tight loop when possible (VirusTotal#678)

plusvic · web-flow · commit b5e93fc6ee3a · 2026-06-11T15:39:18.000+02:00
This PR introduces an optimization to PikeVM matching for contiguous runs of literal bytes, speeding up regex matching for patterns containing literal sequences.

Adds `Instr::Bytes(LiteralBytesIter)`, which groups contiguous literal bytes so that they are matched in a single fast loop, bypassing the VM thread scheduling, state transitions, and epsilon-closure overhead that would otherwise be paid for every byte. `LiteralBytesIter` is a lazy iterator that decodes bytecode literals on the fly, resolving escaped `OPCODE_PREFIX` (0xAA) sequences and stopping at the first unescaped control opcode.

The optimization is enabled only when there is exactly one active thread in the VM (`self.threads.len() == 1`). All threads read from the same input stream, so a run that consumes multiple bytes in one step would desynchronize any other threads, which might be at different branches of the pattern and would never see the skipped bytes.
diff --git a/lib/src/re/thompson/compiler.rs b/lib/src/re/thompson/compiler.rs
@@ -1723,6 +1723,10 @@ impl Display for InstrSeq {
                 Instr::Byte(byte) => {
                     writeln!(f, "{addr:05x}: LIT {byte:#04x}")?;
                 }
+                Instr::Bytes(iter) => {
+                    let bytes: Vec<u8> = iter.clone().collect();
+                    writeln!(f, "{addr:05x}: BYTES {:?}", bytes)?;
+                }
                 Instr::MaskedByte { byte, mask } => {
                     writeln!(
                         f,
diff --git a/lib/src/re/thompson/instr.rs b/lib/src/re/thompson/instr.rs
@@ -171,6 +171,9 @@ pub enum Instr<'a> {
     /// Matches a specific byte.
     Byte(u8),
 
+    /// Matches a run of two or more consecutive literal bytes.
+    Bytes(LiteralBytesIter<'a>),
+
     /// Matches a case-insensitive character. The value of `u8` is in the
     /// range a-z.
     CaseInsensitiveChar(u8),
@@ -320,7 +323,31 @@ impl<'a> InstrParser<'a> {
     }
 
     #[inline(always)]
-    pub fn decode_instr(code: &[u8]) -> (Instr<'_>, usize) {
+    fn is_bytes_run(code: &[u8]) -> bool {
+        match code {
+            [
+                OPCODE_PREFIX,
+                OPCODE_PREFIX,
+                OPCODE_PREFIX,
+                OPCODE_PREFIX,
+                ..,
+            ] => true,
+            [OPCODE_PREFIX, OPCODE_PREFIX, x, ..] if *x != OPCODE_PREFIX => {
+                true
+            }
+            [x, OPCODE_PREFIX, OPCODE_PREFIX, ..] if *x != OPCODE_PREFIX => {
+                true
+            }
+            [x, y, ..] if *x != OPCODE_PREFIX && *y != OPCODE_PREFIX => true,
+            _ => false,
+        }
+    }
+
+    #[inline(always)]
+    pub fn decode_instr(
+        code: &[u8],
+        decode_literal_runs: bool,
+    ) -> (Instr<'_>, usize) {
         match code[..] {
             [OPCODE_PREFIX, Instr::ANY_BYTE, ..] => (Instr::AnyByte, 2),
             [OPCODE_PREFIX, Instr::MASKED_BYTE, byte, mask, ..] => {
@@ -425,10 +452,15 @@ impl<'a> InstrParser<'a> {
             [OPCODE_PREFIX, Instr::WORD_START, ..] => (Instr::WordStart, 2),
             [OPCODE_PREFIX, Instr::WORD_END, ..] => (Instr::WordEnd, 2),
             [OPCODE_PREFIX, Instr::MATCH, ..] => (Instr::Match, 2),
-            [OPCODE_PREFIX, OPCODE_PREFIX, ..] => {
-                (Instr::Byte(OPCODE_PREFIX), 2)
+            [_, ..] => {
+                if decode_literal_runs && Self::is_bytes_run(code) {
+                    (Instr::Bytes(LiteralBytesIter::new(code)), 0)
+                } else if code[0] == OPCODE_PREFIX {
+                    (Instr::Byte(OPCODE_PREFIX), 2)
+                } else {
+                    (Instr::Byte(code[0]), 1)
+                }
             }
-            [b, ..] => (Instr::Byte(b), 1),
             _ => unreachable!(),
         }
     }
@@ -473,7 +505,7 @@ impl<'a> Iterator for InstrParser<'a> {
         if self.code.is_empty() {
             return None;
         }
-        let (instr, size) = InstrParser::decode_instr(self.code);
+        let (instr, size) = InstrParser::decode_instr(self.code, false);
         let addr = self.addr;
         self.addr += size;
         self.code = &self.code[size..];
@@ -595,3 +627,122 @@ pub fn literal_code_length(literal: &[u8]) -> usize {
     }
     length
 }
+
+#[derive(Clone, Debug)]
+pub struct LiteralBytesIter<'a> {
+    code: &'a [u8],
+    offset: usize,
+}
+
+impl<'a> LiteralBytesIter<'a> {
+    #[inline(always)]
+    pub fn new(code: &'a [u8]) -> Self {
+        Self { code, offset: 0 }
+    }
+
+    #[inline(always)]
+    pub fn consumed(&self) -> usize {
+        self.offset
+    }
+}
+
+impl<'a> Iterator for LiteralBytesIter<'a> {
+    type Item = u8;
+
+    #[inline(always)]
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.offset >= self.code.len() {
+            return None;
+        }
+        match self.code[self.offset..] {
+            [OPCODE_PREFIX, OPCODE_PREFIX, ..] => {
+                self.offset += 2;
+                Some(OPCODE_PREFIX)
+            }
+            [OPCODE_PREFIX, ..] => None,
+            [byte, ..] => {
+                self.offset += 1;
+                Some(byte)
+            }
+            _ => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_decode_instr() {
+        // Run of standard literal bytes
+        let code = [0x01, 0x02, OPCODE_PREFIX, Instr::SPLIT_B];
+        let (instr, size) = InstrParser::decode_instr(&code, true);
+        assert_eq!(size, 0);
+        if let Instr::Bytes(mut iter) = instr {
+            assert_eq!(iter.next(), Some(0x01));
+            assert_eq!(iter.next(), Some(0x02));
+            assert_eq!(iter.next(), None);
+            assert_eq!(iter.consumed(), 2);
+        } else {
+            panic!("Expected Instr::Bytes");
+        }
+
+        let (instr, size) = InstrParser::decode_instr(&code, false);
+        assert_eq!(size, 1);
+        assert!(matches!(instr, Instr::Byte(0x01)));
+
+        // Run of escaped OPCODE_PREFIX bytes
+        let code =
+            [OPCODE_PREFIX, OPCODE_PREFIX, OPCODE_PREFIX, OPCODE_PREFIX];
+        let (instr, size) = InstrParser::decode_instr(&code, true);
+        assert_eq!(size, 0);
+        if let Instr::Bytes(mut iter) = instr {
+            assert_eq!(iter.next(), Some(OPCODE_PREFIX));
+            assert_eq!(iter.next(), Some(OPCODE_PREFIX));
+            assert_eq!(iter.next(), None);
+            assert_eq!(iter.consumed(), 4);
+        } else {
+            panic!("Expected Instr::Bytes");
+        }
+
+        // Run with escaped OPCODE_PREFIX and standard literal byte
+        let code = [OPCODE_PREFIX, OPCODE_PREFIX, 0x01];
+        let (instr, size) = InstrParser::decode_instr(&code, true);
+        assert_eq!(size, 0);
+        if let Instr::Bytes(mut iter) = instr {
+            assert_eq!(iter.next(), Some(OPCODE_PREFIX));
+            assert_eq!(iter.next(), Some(0x01));
+            assert_eq!(iter.next(), None);
+            assert_eq!(iter.consumed(), 3);
+        } else {
+            panic!("Expected Instr::Bytes");
+        }
+
+        // Run with standard literal byte and escaped OPCODE_PREFIX
+        let code = [0x01, OPCODE_PREFIX, OPCODE_PREFIX];
+        let (instr, size) = InstrParser::decode_instr(&code, true);
+        assert_eq!(size, 0);
+        if let Instr::Bytes(mut iter) = instr {
+            assert_eq!(iter.next(), Some(0x01));
+            assert_eq!(iter.next(), Some(OPCODE_PREFIX));
+            assert_eq!(iter.next(), None);
+            assert_eq!(iter.consumed(), 3);
+        } else {
+            panic!("Expected Instr::Bytes");
+        }
+
+        // Single literal byte (not a run of >= 2)
+        let code = [0x01, OPCODE_PREFIX, Instr::SPLIT_B];
+        let (instr, size) = InstrParser::decode_instr(&code, true);
+        assert_eq!(size, 1);
+        assert!(matches!(instr, Instr::Byte(0x01)));
+
+        // Single escaped OPCODE_PREFIX (not a run of >= 2)
+        let code =
+            [OPCODE_PREFIX, OPCODE_PREFIX, OPCODE_PREFIX, Instr::SPLIT_B];
+        let (instr, size) = InstrParser::decode_instr(&code, true);
+        assert_eq!(size, 2);
+        assert!(matches!(instr, Instr::Byte(OPCODE_PREFIX)));
+    }
+}
diff --git a/lib/src/re/thompson/pikevm.rs b/lib/src/re/thompson/pikevm.rs
@@ -162,7 +162,6 @@ impl<'r> PikeVM<'r> {
         F: Iterator<Item = &'a u8>,
         B: Iterator<Item = &'a u8>,
     {
-        let step = 1;
         let mut current_pos = 0;
         let mut curr_byte = fwd_input.next();
 
@@ -181,18 +180,87 @@ impl<'r> PikeVM<'r> {
         );
 
         while !self.threads.is_empty() {
-            let next_byte = fwd_input.next();
+            let mut next_byte = fwd_input.next();
+            // When there is only a single active thread in the VM (that is,
+            // `self.threads.len() == 1`), we can optimize execution by
+            // decoding a contiguous run of literal bytes (`Instr::Bytes`) and
+            // matching them in a fast loop.
+            //
+            // This is only safe when there is one thread active. All threads
+            // share the same input cursor, so with multiple concurrent
+            // threads, matching a byte run would consume several input bytes
+            // in a single step and desynchronize the other threads, which
+            // would never see those bytes. Setting `decode_literal_runs` to
+            // false is always safe; it simply disables the optimization.
+            let decode_literal_runs = self.threads.len() == 1;
 
             for (ip, rep_count) in self.threads.iter() {
-                let (instr, instr_size) = InstrParser::decode_instr(unsafe {
-                    self.code.get_unchecked(*ip..)
-                });
+                let (instr, mut instr_size) = InstrParser::decode_instr(
+                    unsafe { self.code.get_unchecked(*ip..) },
+                    decode_literal_runs,
+                );
 
                 let is_match = match instr {
                     Instr::AnyByte => curr_byte.is_some(),
                     Instr::Byte(byte) => {
                         matches!(curr_byte, Some(b) if *b == byte)
                     }
+                    // `Instr::Bytes` matches a sequence of literal bytes in
+                    // a single VM step. This bypasses standard VM thread
+                    // scheduling and state updates, matching the sequence
+                    // directly against the input stream in a fast loop. It
+                    // is produced only when `decode_literal_runs` is true.
+                    Instr::Bytes(mut lit_bytes) => {
+                        let is_match = 'is_match: {
+                            let first = match lit_bytes.next() {
+                                Some(first) => first,
+                                None => break 'is_match false,
+                            };
+
+                            if !matches!(curr_byte, Some(b) if *b == first) {
+                                break 'is_match false;
+                            }
+
+                            let second = match lit_bytes.next() {
+                                Some(second) => second,
+                                None => break 'is_match true,
+                            };
+
+                            if !matches!(next_byte, Some(b) if *b == second) {
+                                break 'is_match false;
+                            }
+
+                            curr_byte = next_byte;
+                            current_pos += 1;
+
+                            // Match the remaining literal bytes in the
+                            // sequence by consuming bytes from the input
+                            // stream.
+                            for expected_byte in lit_bytes.by_ref() {
+                                curr_byte = fwd_input.next();
+                                match curr_byte {
+                                    Some(curr_byte) => {
+                                        current_pos += 1;
+                                        if *curr_byte != expected_byte {
+                                            break 'is_match false;
+                                        }
+                                    }
+                                    None => break 'is_match false,
+                                }
+                            }
+
+                            next_byte = fwd_input.next();
+                            break 'is_match true;
+                        };
+
+                        // Since the instruction size is not known
+                        // statically when decoding `Instr::Bytes`, we
+                        // retrieve the number of consumed bytecode bytes
+                        // from the iterator to advance the instruction
+                        // pointer correctly.
+                        instr_size = lit_bytes.consumed();
+                        is_match
+                    }
                     Instr::MaskedByte { byte, mask } => {
                         matches!(curr_byte, Some(b) if *b & mask == byte)
                     }
@@ -226,7 +294,7 @@ impl<'r> PikeVM<'r> {
             }
 
             curr_byte = next_byte;
-            current_pos += step;
+            current_pos += 1;
 
             mem::swap(&mut self.threads, &mut self.next_threads);
             self.next_threads.clear();
@@ -328,11 +396,14 @@ pub(crate) fn epsilon_closure<C: CodeLoc>(
     };
 
     while let Some((ip, mut rep_count)) = state.threads.pop() {
-        let (instr, instr_size) =
-            InstrParser::decode_instr(unsafe { code.get_unchecked(ip..) });
+        let (instr, instr_size) = InstrParser::decode_instr(
+            unsafe { code.get_unchecked(ip..) },
+            false,
+        );
         match instr {
             Instr::AnyByte
             | Instr::Byte(_)
+            | Instr::Bytes(_)
             | Instr::MaskedByte { .. }
             | Instr::CaseInsensitiveChar(_)
             | Instr::ClassBitmap(_)
diff --git a/lib/src/scanner/tests.rs b/lib/src/scanner/tests.rs
@@ -1053,3 +1053,49 @@ fn fast_scan_mode() {
         test_count.patterns().filter(|p| p.identifier() == "$c");
     assert_eq!(patterns_c.next().unwrap().matches().len(), 2);
 }
+
+#[test]
+fn test_pikevm_literal_run_optimization() {
+    let rules = crate::compile(
+        r#"
+        rule test_opt {
+            strings:
+                $a = /abcdefg.*hijk.*lmno/
+            condition:
+                $a
+        }
+        "#,
+    )
+    .unwrap();
+
+    let mut scanner = Scanner::new(&rules);
+
+    let results = scanner.scan(b"abcdefg_hijk_lmno").unwrap();
+    assert_eq!(results.matching_rules().count(), 1);
+
+    let results = scanner.scan(b"abcdefg_hijk_lmn").unwrap();
+    assert_eq!(results.matching_rules().count(), 0);
+
+    let results = scanner.scan(b"abcdef_hijk_lmno").unwrap();
+    assert_eq!(results.matching_rules().count(), 0);
+}
+
+#[test]
+fn test_slow_rule_hang() {
+    let rules = crate::compile(
+        r#"
+        rule test {
+            strings:
+                $zero_padding = /\x00{860,}/
+            condition:
+                $zero_padding
+        }
+        "#,
+    )
+    .unwrap();
+
+    let mut scanner = Scanner::new(&rules);
+    let data = vec![0u8; 2000];
+    let results = scanner.scan(&data).unwrap();
+    assert_eq!(results.matching_rules().count(), 1);
+}