Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 130 additions & 25 deletions packages/evm/core/src/compression.rs
Original file line number Diff line number Diff line change
@@ -1,53 +1,158 @@
use std::{borrow::Cow, ops::Deref};

// NOTE(review): version byte of the earlier `[version][orig_len: u32 LE][zstd payload]` layout.
// It has the same value as TAG_ZSTD, and that layout matches the TAG_ZSTD layout, so values
// written under it presumably still decode through the TAG_ZSTD path — confirm before removing.
const VERSION: u8 = 1;
// First byte of every stored value tags how the remainder is encoded:
//   TAG_RAW:  [0][raw bincode]                     (uncompressed)
//   TAG_ZSTD: [1][orig_len: u32 LE][zstd payload]  (compressed)
const TAG_RAW: u8 = 0;
const TAG_ZSTD: u8 = 1;

// Compression level handed to `zstd::bulk::compress`.
const ZSTD_LEVEL: i32 = 3;

// Compression is only attempted for serialized values at least this large. The frequently-written
// values are poor candidates for two independent reasons:
//   - small size (accounts ~70 B, proofs ~120 B): too little context, and zstd's frame overhead
//     plus the 4-byte length header outweigh any gain;
//   - high-entropy content: keccak code hashes and BLS signatures are effectively random, so there
//     is no redundancy to exploit at any size.
// The threshold skips the first case cheaply; the "keep raw unless actually smaller" check on the
// result handles the second. Larger structured values (headers with a sparse logs bloom,
// ABI-padded calldata, receipt logs, bytecode) still compress and are kept only when it shrinks them.
const MIN_COMPRESS_LEN: usize = 256;

#[derive(Debug)]
pub struct CompressedBincode<T>(pub T);
impl<'a, T: serde::Serialize + 'a> heed::BytesEncode<'a> for CompressedBincode<T> {
type EItem = CompressedBincode<&'a T>;
pub struct CompactBincode<T>(pub T);
impl<'a, T: serde::Serialize + 'a> heed::BytesEncode<'a> for CompactBincode<T> {
type EItem = CompactBincode<&'a T>;

fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, heed::BoxedError> {
let raw = bincode::serialize(&item.0)?;
let orig_len = raw.len();
let compressed = zstd::bulk::compress(&raw, ZSTD_LEVEL)?;

let mut out = Vec::with_capacity(1 + 4 + compressed.len());
if raw.len() >= MIN_COMPRESS_LEN {
let compressed = zstd::bulk::compress(&raw, ZSTD_LEVEL)?;

// [1 byte version][4 bytes orig_len LE][compressed...]
out.push(VERSION);
out.extend_from_slice(&(orig_len as u32).to_le_bytes());
out.extend_from_slice(&compressed);
// The compressed layout carries an extra 4-byte original-length header; only keep it
// when the result is genuinely smaller than storing raw (both forms share the 1-byte
// tag, so it cancels out of the comparison).
if compressed.len() + 4 < raw.len() {
let mut out = Vec::with_capacity(1 + 4 + compressed.len());
out.push(TAG_ZSTD);
out.extend_from_slice(&(raw.len() as u32).to_le_bytes());
out.extend_from_slice(&compressed);
return Ok(Cow::Owned(out));
}
}

// Store raw. A value is therefore never persisted larger than its bincode encoding plus the
// single tag byte.
let mut out = Vec::with_capacity(1 + raw.len());
out.push(TAG_RAW);
out.extend_from_slice(&raw);
Ok(Cow::Owned(out))
}
}

impl<'a, T: serde::de::DeserializeOwned + 'a> heed::BytesDecode<'a> for CompressedBincode<T> {
type DItem = CompressedBincode<T>;
impl<'a, T: serde::de::DeserializeOwned + 'a> heed::BytesDecode<'a> for CompactBincode<T> {
type DItem = CompactBincode<T>;

fn bytes_decode(bytes: &'_ [u8]) -> Result<Self::DItem, heed::BoxedError> {
let version = bytes[0];
assert_eq!(version, VERSION, "unsupported version");

let mut len_bytes = [0u8; 4];
len_bytes.copy_from_slice(&bytes[1..5]);
let orig_len = u32::from_le_bytes(len_bytes) as usize;
let (&tag, payload) = bytes
.split_first()
.ok_or("CompressedBincode: empty value")?;

let payload = &bytes[5..];
let decompressed = zstd::bulk::decompress(payload, orig_len)?;
let deserialized = match tag {
TAG_ZSTD => {
if payload.len() < 4 {
return Err("CompressedBincode: truncated zstd header".into());
}
let (len_bytes, compressed) = payload.split_at(4);
let orig_len = u32::from_le_bytes(len_bytes.try_into().unwrap()) as usize;
let decompressed = zstd::bulk::decompress(compressed, orig_len)?;
bincode::deserialize(&decompressed)?
}
TAG_RAW => bincode::deserialize(payload)?,
other => return Err(format!("CompressedBincode: unknown tag {other}").into()),
};

let deserialized = bincode::deserialize(&decompressed)?;

Ok(CompressedBincode(deserialized))
Ok(CompactBincode(deserialized))
}
}

impl<T> Deref for CompressedBincode<T> {
impl<T> Deref for CompactBincode<T> {
type Target = T;

fn deref(&self) -> &Self::Target {
&self.0
}
}

#[cfg(test)]
mod tests {
    use heed::{BytesDecode, BytesEncode};

    use super::*;

    /// The codec under test, pinned to byte-vector values.
    type Codec = CompactBincode<Vec<u8>>;

    /// Encodes `value` through the codec and returns an owned copy of the stored bytes.
    fn encode(value: &Vec<u8>) -> Vec<u8> {
        let wrapped = CompactBincode(value);
        let stored = <Codec as BytesEncode>::bytes_encode(&wrapped).unwrap();
        stored.into_owned()
    }

    /// Decodes stored bytes back into the byte vector, panicking on any decode error.
    fn decode(bytes: &[u8]) -> Vec<u8> {
        let CompactBincode(value) = <Codec as BytesDecode>::bytes_decode(bytes).unwrap();
        value
    }

    /// Values below the compression threshold are tagged raw and round-trip.
    #[test]
    fn small_value_is_stored_raw() {
        let original = vec![1u8, 2, 3, 4, 5];
        let stored = encode(&original);

        assert_eq!(stored[0], TAG_RAW);
        assert_eq!(decode(&stored), original);
    }

    /// A long run of identical bytes is tagged zstd, shrinks, and round-trips.
    #[test]
    fn large_compressible_value_is_stored_zstd_and_smaller() {
        let original = vec![7u8; 4096];
        let stored = encode(&original);

        assert_eq!(stored[0], TAG_ZSTD);
        assert!(stored.len() < original.len());
        assert_eq!(decode(&stored), original);
    }

    /// Whatever form is chosen, the stored value is at most the bincode encoding plus one byte.
    #[test]
    fn output_never_exceeds_raw_plus_tag() {
        // A large, hard-to-compress value: compression is attempted but must fall back to raw
        // rather than store something bigger.
        let mut original: Vec<u8> = Vec::with_capacity(2048);
        for i in 0..2048u32 {
            original.push((i.wrapping_mul(2_654_435_761) >> 13) as u8);
        }
        let raw_len = bincode::serialize(&original).unwrap().len();

        let stored = encode(&original);
        assert!(
            stored.len() <= raw_len + 1,
            "encoded {} raw {}",
            stored.len(),
            raw_len
        );
        assert_eq!(decode(&stored), original);
    }

    /// Zero-length input has no tag byte and must be rejected gracefully.
    #[test]
    fn empty_input_is_an_error_not_a_panic() {
        let outcome = <Codec as BytesDecode>::bytes_decode(&[]);
        assert!(outcome.is_err());
    }

    /// TAG_ZSTD with fewer than the four orig_len header bytes must error, not panic on the slice.
    #[test]
    fn truncated_zstd_header_is_an_error_not_a_panic() {
        let outcome = <Codec as BytesDecode>::bytes_decode(&[TAG_ZSTD, 1, 2]);
        assert!(outcome.is_err());
    }

    /// A leading byte that is neither TAG_RAW nor TAG_ZSTD must be rejected gracefully.
    #[test]
    fn unknown_tag_is_an_error_not_a_panic() {
        let outcome = <Codec as BytesDecode>::bytes_decode(&[9, 0, 0]);
        assert!(outcome.is_err());
    }
}
Loading
Loading