@@ -26,6 +26,10 @@ const MAX_POSITIONS_PER_PREFIX: usize = 32;
2626/// Trigger hash table compaction when entry count exceeds this
2727const MAX_HASH_TABLE_ENTRIES : usize = 50_000 ;
2828
29+ /// Compaction evicts down to this low watermark, so it runs at most once per
30+ /// `MAX_HASH_TABLE_ENTRIES - COMPACT_TARGET_ENTRIES` inserted prefixes
31+ const COMPACT_TARGET_ENTRIES : usize = MAX_HASH_TABLE_ENTRIES / 2 ;
32+
2933/// ZGFX compressor maintaining a 2.5 MB history buffer and prefix hash table.
3034pub struct Compressor {
3135 history : Vec < u8 > ,
@@ -133,7 +137,15 @@ impl Compressor {
133137 }
134138 }
135139
136- /// Halve stored positions per prefix to bound memory.
140+ /// Halve stored positions per prefix, then evict whole prefixes down to
141+ /// `COMPACT_TARGET_ENTRIES` when the table is over the cap.
142+ ///
143+ /// INVARIANT: the table holds at most `MAX_HASH_TABLE_ENTRIES` prefixes on
144+ /// return. Incompressible input yields a near-unique prefix per byte, so
145+ /// trimming position lists alone never lowers the prefix count; without
146+ /// evicting whole prefixes the table would stay above the threshold and the
147+ /// caller would re-run compaction on every literal byte at O(table) cost.
148+ /// Evicting to a lower watermark amortizes that cost to O(1) per byte.
137149 fn compact_hash_table ( & mut self ) {
138150 for positions in self . match_table . values_mut ( ) {
139151 if positions. len ( ) > MAX_POSITIONS_PER_PREFIX / 2 {
@@ -142,6 +154,25 @@ impl Compressor {
142154 }
143155 }
144156 self . match_table . retain ( |_, positions| !positions. is_empty ( ) ) ;
157+
158+ if self . match_table . len ( ) <= COMPACT_TARGET_ENTRIES {
159+ return ;
160+ }
161+
162+ // Keep the most-recently-seen prefixes; older ones point further back
163+ // than a fresh match can reach, and distance is capped at
164+ // MAX_MATCH_DISTANCE regardless. Each history position belongs to a
165+ // single prefix, so these newest positions are distinct and the cutoff
166+ // retains exactly COMPACT_TARGET_ENTRIES entries.
167+ let mut newest: Vec < usize > = self
168+ . match_table
169+ . values ( )
170+ . map ( |positions| positions. last ( ) . copied ( ) . unwrap_or ( 0 ) )
171+ . collect ( ) ;
172+ let cutoff_index = newest. len ( ) - COMPACT_TARGET_ENTRIES ;
173+ let cutoff = * newest. select_nth_unstable ( cutoff_index) . 1 ;
174+ self . match_table
175+ . retain ( |_, positions| positions. last ( ) . is_some_and ( |& pos| pos >= cutoff) ) ;
145176 }
146177
147178 /// Search hash table for the longest match at `input[pos..]`.
@@ -460,6 +491,38 @@ mod tests {
460491 assert_eq ! ( output, data) ;
461492 }
462493
494+ #[ test]
495+ fn compress_high_entropy_round_trips_and_bounds_table ( ) {
496+ use super :: super :: Decompressor ;
497+
498+ // A near-unique 3-byte prefix per byte is the compactor's worst case:
499+ // trimming per-prefix position lists frees nothing, so without evicting
500+ // whole prefixes the table grows past MAX_HASH_TABLE_ENTRIES and
501+ // compaction re-runs on every literal byte at O(table) cost. A
502+ // deterministic LCG makes the stream exceed the entry cap reproducibly.
503+ let mut state: u32 = 0x1234_5678 ;
504+ let data: Vec < u8 > = core:: iter:: repeat_with ( || {
505+ state = state. wrapping_mul ( 1_664_525 ) . wrapping_add ( 1_013_904_223 ) ;
506+ u8:: try_from ( state >> 24 ) . unwrap ( )
507+ } )
508+ . take ( 100_000 )
509+ . collect ( ) ;
510+
511+ let mut compressor = Compressor :: new ( ) ;
512+ let compressed = compressor. compress ( & data) . unwrap ( ) ;
513+
514+ let mut decompressor = Decompressor :: new ( ) ;
515+ let mut output = Vec :: new ( ) ;
516+ decompressor. decompress_segment ( & compressed, & mut output) . unwrap ( ) ;
517+ assert_eq ! ( output, data) ;
518+
519+ assert ! (
520+ compressor. match_table. len( ) <= MAX_HASH_TABLE_ENTRIES ,
521+ "hash table must stay bounded, got {} entries" ,
522+ compressor. match_table. len( )
523+ ) ;
524+ }
525+
463526 #[ test]
464527 fn bit_writer_basic ( ) {
465528 let mut writer = BitWriter :: new ( ) ;
0 commit comments