From 127250a19c189f735a7bf8e067ee788edb7c8bce Mon Sep 17 00:00:00 2001 From: brendanzab Date: Mon, 1 May 2023 14:31:24 +1000 Subject: [PATCH] Tweak input manipulation formats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This extracts the notion of “not consuming the input” from the input manipulation formats, and adjusts the names of these formats to work more nicely together. This was partly inspired by the paper “Interval Parsing Grammars for File Format Parsing” - except here we default to consuming bytes from the input as opposed to parsing in-place. --- src/lib.rs | 51 +++++++++++++++++--------- src/main.rs | 10 ++--- src/output/tree.rs | 17 ++++++--- tests/expected/decode/test.jpg.stdout | 18 ++++----- tests/expected/decode/test.png.stdout | 16 ++++---- tests/expected/decode/test.webp.stdout | 8 ++-- tests/expected/decode/test2.jpg.stdout | 22 +++++------ 7 files changed, 82 insertions(+), 60 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 523a046e..da260e95 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -210,10 +210,14 @@ pub enum Format { Repeat1(Box), /// Repeat a format an exact number of times RepeatCount(Expr, Box), - /// Restrict a format to a sub-stream of a given number of bytes - Slice(Expr, Box), - /// Matches a format at a byte offset relative to the current stream position - WithRelativeOffset(Expr, Box), + /// Matches a format without consuming the input + WithInput(Box), + /// Takes a given number of bytes from the start of the input, matching + /// the format against those bytes + TakeBytes(Expr, Box), + /// Drops a given number of bytes from the start of the input, matching + /// the format against the remaining input + DropBytes(Expr, Box), /// Transform a decoded value with a function Map(Func, Box), /// Pattern match on an expression @@ -261,8 +265,9 @@ pub enum Decoder { While(MatchTree, Box), Until(MatchTree, Box), RepeatCount(Expr, Box), - Slice(Expr, Box), - WithRelativeOffset(Expr, Box), + WithInput(Box), + TakeBytes(Expr, Box), + DropBytes(Expr, Box), Map(Func, Box), Match(Expr, Vec<(Pattern, Decoder)>), } @@ -408,8 +413,9 @@ impl Format { Format::Repeat(_a) => true, Format::Repeat1(_a) => false, Format::RepeatCount(_expr, _a) => true, - Format::Slice(_expr, _a) => true, - Format::WithRelativeOffset(_, _) => true, + Format::WithInput(_) => true, + Format::TakeBytes(_, _) => true, + Format::DropBytes(_, _) => true, Format::Map(_f, a) => a.is_nullable(), Format::Match(_, branches) => branches.iter().any(|(_, f)| f.is_nullable()), } @@ -523,10 +529,13 @@ impl<'a> MatchTreeLevel<'a> { Format::RepeatCount(_expr, _a) => { self.accept(index) // FIXME } - Format::Slice(_expr, _a) => { + Format::WithInput(_) => { self.accept(index) // FIXME } - Format::WithRelativeOffset(_expr, _a) => { + Format::TakeBytes(_, _) => { + self.accept(index) // FIXME + } + Format::DropBytes(_, _) => { self.accept(index) // FIXME } Format::Map(_f, a) => self.add(index, a, next), @@ -684,13 +693,17 @@ impl Decoder { let da = Box::new(Decoder::compile_next(a, next)?); Ok(Decoder::RepeatCount(expr.clone(), da)) } - Format::Slice(expr, a) => { + Format::WithInput(a) => { + let da = Box::new(Decoder::compile_next(a, Rc::new(Next::Empty))?); + Ok(Decoder::WithInput(da)) + } + Format::TakeBytes(expr, a) => { let da = Box::new(Decoder::compile_next(a, Rc::new(Next::Empty))?); - Ok(Decoder::Slice(expr.clone(), da)) + Ok(Decoder::TakeBytes(expr.clone(), da)) } - Format::WithRelativeOffset(expr, a) => { + Format::DropBytes(expr, a) => { let da = Box::new(Decoder::compile_next(a, Rc::new(Next::Empty))?); - Ok(Decoder::WithRelativeOffset(expr.clone(), da)) + Ok(Decoder::DropBytes(expr.clone(), da)) } Format::Map(f, a) => { let da = Box::new(Decoder::compile_next(a, next)?); @@ -791,7 +804,11 @@ impl Decoder { } Some((Value::Seq(v), input)) } - Decoder::Slice(expr, a) => { + Decoder::WithInput(a) => { + let (v, _) = a.parse(stack, input)?; + Some((v, input)) + } + Decoder::TakeBytes(expr, a) => { let size = expr.eval_usize(stack); if size <= input.len() { let (slice, input) = input.split_at(size); @@ -801,11 +818,11 @@ impl Decoder { None } } - Decoder::WithRelativeOffset(expr, a) => { + Decoder::DropBytes(expr, a) => { let offset = expr.eval_usize(stack); if offset <= input.len() { let (_, slice) = input.split_at(offset); - let (v, _) = a.parse(stack, slice)?; + let (v, input) = a.parse(stack, slice)?; Some((v, input)) } else { None diff --git a/src/main.rs b/src/main.rs index 0b235f6b..c2e57a63 100644 --- a/src/main.rs +++ b/src/main.rs @@ -393,7 +393,7 @@ fn jpeg_format() -> Format { ("length", u16be()), ( "data", - Format::Slice( + Format::TakeBytes( Expr::Sub( Box::new(Expr::Var(0)), // length Box::new(Expr::U16(2)), @@ -736,7 +736,7 @@ fn png_format() -> Format { record([ ("length", u32be()), // FIXME < 2^31 ("tag", tag), - ("data", Format::Slice(Expr::Var(1), Box::new(data))), + ("data", Format::TakeBytes(Expr::Var(1), Box::new(data))), ("crc", u32be()), // FIXME check this ]) } @@ -793,7 +793,7 @@ fn riff_format() -> Format { record([ ("tag", tag), ("length", u32le()), - ("data", Format::Slice(Expr::Var(0), Box::new(data))), + ("data", Format::TakeBytes(Expr::Var(0), Box::new(data))), ( "pad", if_then_else(is_even(Expr::Var(1)), Format::EMPTY, is_byte(0x00)), @@ -875,7 +875,7 @@ fn tiff_format() -> Format { ), ( "ifd", - Format::WithRelativeOffset( + Format::WithInput(Box::new(Format::DropBytes( // TODO: Offset from start of the TIFF header Expr::Sub(Box::new(Expr::Var(0)), Box::new(Expr::U32(8))), Box::new(Format::Match( @@ -885,7 +885,7 @@ fn tiff_format() -> Format { (Pattern::variant("be", Pattern::UNIT), ifd(true)), ], )), - ), + ))), ), ]) } diff --git a/src/output/tree.rs b/src/output/tree.rs index 41744464..18414653 100644 --- a/src/output/tree.rs +++ b/src/output/tree.rs @@ -71,8 +71,9 @@ impl Context { _ => panic!("expected sequence"), } } - Format::Slice(_, format) => self.write_decoded_value(value, format), - Format::WithRelativeOffset(_, format) => self.write_decoded_value(value, format), + Format::WithInput(format) => self.write_decoded_value(value, format), + Format::TakeBytes(_, format) => self.write_decoded_value(value, format), + Format::DropBytes(_, format) => self.write_decoded_value(value, format), Format::Map(Func::Expr(_), _) => self.write_value(value), Format::Map(Func::TupleProj(index), format) => match format.as_ref() { Format::Tuple(formats) => self.write_decoded_value(value, &formats[*index]), @@ -368,14 +369,18 @@ impl Context { write!(&mut self.writer, " ")?; self.write_atomic_format(format) } - Format::Slice(len, format) => { - write!(&mut self.writer, "slice ")?; + Format::WithInput(format) => { + write!(&mut self.writer, "with-input ")?; + self.write_atomic_format(format) + } + Format::TakeBytes(len, format) => { + write!(&mut self.writer, "take-bytes ")?; self.write_atomic_expr(len)?; write!(&mut self.writer, " ")?; self.write_atomic_format(format) } - Format::WithRelativeOffset(offset, format) => { - write!(&mut self.writer, "with-relative-offset ")?; + Format::DropBytes(offset, format) => { + write!(&mut self.writer, "drop-bytes ")?; self.write_atomic_expr(offset)?; write!(&mut self.writer, " ")?; self.write_atomic_format(format) diff --git a/tests/expected/decode/test.jpg.stdout b/tests/expected/decode/test.jpg.stdout index bde852d6..4e396ae6 100644 --- a/tests/expected/decode/test.jpg.stdout +++ b/tests/expected/decode/test.jpg.stdout @@ -5,7 +5,7 @@ │ │ └── app0 <- { ... } := │ │ ├── marker <- map _.1 (...) := 224 │ │ ├── length <- u16be := 16 - │ │ └── data <- slice (length - 2) { ... } := + │ │ └── data <- take-bytes (length - 2) { ... } := │ │ ├── identifier <- map _.string { ... } := │ │ │ ├── 0 <- [!= 0] := 74 │ │ │ ├── 1 <- [!= 0] := 70 @@ -25,7 +25,7 @@ │ │ │ └── dqt <- { ... } := │ │ │ ├── marker <- map _.1 (...) := 219 │ │ │ ├── length <- u16be := 67 - │ │ │ └── data <- slice (length - 2) { ... } := + │ │ │ └── data <- take-bytes (length - 2) { ... } := │ │ │ ├── precision-table-id <- u8 := 0 │ │ │ └── elements <- repeat u8 := │ │ │ ├── 0 <- u8 := 8 @@ -44,7 +44,7 @@ │ │ └── dqt <- { ... } := │ │ ├── marker <- map _.1 (...) := 219 │ │ ├── length <- u16be := 67 - │ │ └── data <- slice (length - 2) { ... } := + │ │ └── data <- take-bytes (length - 2) { ... } := │ │ ├── precision-table-id <- u8 := 1 │ │ └── elements <- repeat u8 := │ │ ├── 0 <- u8 := 9 @@ -63,7 +63,7 @@ │ │ └── sof0 <- { ... } := │ │ ├── marker <- map _.1 (...) := 192 │ │ ├── length <- u16be := 17 - │ │ └── data <- slice (length - 2) { ... } := + │ │ └── data <- take-bytes (length - 2) { ... } := │ │ ├── sample-precision <- u8 := 8 │ │ ├── num-lines <- u16be := 97 │ │ ├── num-samples-per-line <- u16be := 105 @@ -87,7 +87,7 @@ │ │ │ │ └── dht <- { ... } := │ │ │ │ ├── marker <- map _.1 (...) := 196 │ │ │ │ ├── length <- u16be := 27 - │ │ │ │ └── data <- slice (length - 2) { ... } := + │ │ │ │ └── data <- take-bytes (length - 2) { ... } := │ │ │ │ ├── class-table-id <- u8 := 0 │ │ │ │ ├── num-codes <- repeat-count 16 u8 := │ │ │ │ │ ├── 0 <- u8 := 0 @@ -115,7 +115,7 @@ │ │ │ │ └── dht <- { ... } := │ │ │ │ ├── marker <- map _.1 (...) := 196 │ │ │ │ ├── length <- u16be := 58 - │ │ │ │ └── data <- slice (length - 2) { ... } := + │ │ │ │ └── data <- take-bytes (length - 2) { ... } := │ │ │ │ ├── class-table-id <- u8 := 16 │ │ │ │ ├── num-codes <- repeat-count 16 u8 := │ │ │ │ │ ├── 0 <- u8 := 0 @@ -147,7 +147,7 @@ │ │ │ │ └── dht <- { ... } := │ │ │ │ ├── marker <- map _.1 (...) := 196 │ │ │ │ ├── length <- u16be := 26 - │ │ │ │ └── data <- slice (length - 2) { ... } := + │ │ │ │ └── data <- take-bytes (length - 2) { ... } := │ │ │ │ ├── class-table-id <- u8 := 1 │ │ │ │ ├── num-codes <- repeat-count 16 u8 := │ │ │ │ │ ├── 0 <- u8 := 0 @@ -174,7 +174,7 @@ │ │ │ └── dht <- { ... } := │ │ │ ├── marker <- map _.1 (...) := 196 │ │ │ ├── length <- u16be := 38 - │ │ │ └── data <- slice (length - 2) { ... } := + │ │ │ └── data <- take-bytes (length - 2) { ... } := │ │ │ ├── class-table-id <- u8 := 17 │ │ │ ├── num-codes <- repeat-count 16 u8 := │ │ │ │ ├── 0 <- u8 := 0 @@ -205,7 +205,7 @@ │ │ ├── sos <- { ... } := │ │ │ ├── marker <- map _.1 (...) := 218 │ │ │ ├── length <- u16be := 12 - │ │ │ └── data <- slice (length - 2) { ... } := + │ │ │ └── data <- take-bytes (length - 2) { ... } := │ │ │ ├── num-image-components <- u8 := 3 │ │ │ ├── image-components <- repeat-count num-image-components { ... } := │ │ │ │ ├── 0 <- { ... } := diff --git a/tests/expected/decode/test.png.stdout b/tests/expected/decode/test.png.stdout index b1d8a5dd..8df3cfe5 100644 --- a/tests/expected/decode/test.png.stdout +++ b/tests/expected/decode/test.png.stdout @@ -15,7 +15,7 @@ │ │ ├── 1 <- [= 72] := 72 │ │ ├── 2 <- [= 68] := 68 │ │ └── 3 <- [= 82] := 82 - │ ├── data <- slice length { ... } := + │ ├── data <- take-bytes length { ... } := │ │ ├── width <- u32be := 50 │ │ ├── height <- u32be := 50 │ │ ├── bit-depth <- u8 := 8 @@ -33,7 +33,7 @@ │ │ │ ├── 1 <- [= 76] := 76 │ │ │ ├── 2 <- [= 84] := 84 │ │ │ └── 3 <- [= 69] := 69 - │ │ ├── data <- slice length (repeat u8) := + │ │ ├── data <- take-bytes length (repeat u8) := │ │ │ ├── 0 <- u8 := 192 │ │ │ ├── 1 <- u8 := 192 │ │ │ ├── 2 <- u8 := 192 @@ -55,7 +55,7 @@ │ │ │ ├── 1 <- [= 82] := 82 │ │ │ ├── 2 <- [= 78] := 78 │ │ │ └── 3 <- [= 83] := 83 - │ │ ├── data <- slice length (repeat u8) := + │ │ ├── data <- take-bytes length (repeat u8) := │ │ │ └── 0 <- u8 := 0 │ │ └── crc <- u32be := 1088870502 │ ├── 2 <- { ... } := @@ -66,7 +66,7 @@ │ │ │ ├── 1 <- [= 75] := 75 │ │ │ ├── 2 <- [= 71] := 71 │ │ │ └── 3 <- [= 68] := 68 - │ │ ├── data <- slice length (repeat u8) := + │ │ ├── data <- take-bytes length (repeat u8) := │ │ │ └── 0 <- u8 := 0 │ │ └── crc <- u32be := 2282036552 │ ├── 3 <- { ... } := @@ -77,7 +77,7 @@ │ │ │ ├── 1 <- [= 72] := 72 │ │ │ ├── 2 <- [= 89] := 89 │ │ │ └── 3 <- [= 115] := 115 - │ │ ├── data <- slice length (repeat u8) := + │ │ ├── data <- take-bytes length (repeat u8) := │ │ │ ├── 0 <- u8 := 0 │ │ │ ├── 1 <- u8 := 0 │ │ │ ├── 2 <- u8 := 11 @@ -96,7 +96,7 @@ │ │ ├── 1 <- [= 73] := 73 │ │ ├── 2 <- [= 77] := 77 │ │ └── 3 <- [= 69] := 69 - │ ├── data <- slice length (repeat u8) := + │ ├── data <- take-bytes length (repeat u8) := │ │ ├── 0 <- u8 := 7 │ │ ├── 1 <- u8 := 213 │ │ ├── 2 <- u8 := 11 @@ -113,7 +113,7 @@ │ │ ├── 1 <- [= 68] := 68 │ │ ├── 2 <- [= 65] := 65 │ │ └── 3 <- [= 84] := 84 - │ ├── data <- slice length (repeat u8) := + │ ├── data <- take-bytes length (repeat u8) := │ │ ├── 0 <- u8 := 72 │ │ ├── 1 <- u8 := 199 │ │ ├── 2 <- u8 := 165 @@ -135,5 +135,5 @@ │ ├── 1 <- [= 69] := 69 │ ├── 2 <- [= 78] := 78 │ └── 3 <- [= 68] := 68 - ├── data <- slice length () := () + ├── data <- take-bytes length () := () └── crc <- u32be := 2923585666 diff --git a/tests/expected/decode/test.webp.stdout b/tests/expected/decode/test.webp.stdout index 8b4b3bb2..ff45d060 100644 --- a/tests/expected/decode/test.webp.stdout +++ b/tests/expected/decode/test.webp.stdout @@ -5,7 +5,7 @@ │ ├── 2 <- [= 70] := 70 │ └── 3 <- [= 70] := 70 ├── length <- u32le := 1140 - ├── data <- slice length { ... } := + ├── data <- take-bytes length { ... } := │ ├── tag <- (...) := │ │ ├── 0 <- u8 := 87 │ │ ├── 1 <- u8 := 69 @@ -19,7 +19,7 @@ │ │ │ ├── 2 <- u8 := 56 │ │ │ └── 3 <- u8 := 88 │ │ ├── length <- u32le := 10 - │ │ ├── data <- slice length (repeat u8) := + │ │ ├── data <- take-bytes length (repeat u8) := │ │ │ ├── 0 <- u8 := 8 │ │ │ ├── 1 <- u8 := 0 │ │ │ ├── 2 <- u8 := 0 @@ -38,7 +38,7 @@ │ │ │ ├── 2 <- u8 := 56 │ │ │ └── 3 <- u8 := 76 │ │ ├── length <- u32le := 963 - │ │ ├── data <- slice length (repeat u8) := + │ │ ├── data <- take-bytes length (repeat u8) := │ │ │ ├── 0 <- u8 := 47 │ │ │ ├── 1 <- u8 := 72 │ │ │ ├── 2 <- u8 := 128 @@ -59,7 +59,7 @@ │ │ ├── 2 <- u8 := 73 │ │ └── 3 <- u8 := 70 │ ├── length <- u32le := 138 - │ ├── data <- slice length (repeat u8) := + │ ├── data <- take-bytes length (repeat u8) := │ │ ├── 0 <- u8 := 69 │ │ ├── 1 <- u8 := 120 │ │ ├── 2 <- u8 := 105 diff --git a/tests/expected/decode/test2.jpg.stdout b/tests/expected/decode/test2.jpg.stdout index 838a248e..9988f18f 100644 --- a/tests/expected/decode/test2.jpg.stdout +++ b/tests/expected/decode/test2.jpg.stdout @@ -5,7 +5,7 @@ │ │ └── app1 <- { ... } := │ │ ├── marker <- map _.1 (...) := 225 │ │ ├── length <- u16be := 5426 - │ │ └── data <- slice (length - 2) { ... } := + │ │ └── data <- take-bytes (length - 2) { ... } := │ │ ├── identifier <- map _.string { ... } := │ │ │ ├── 0 <- [!= 0] := 69 │ │ │ ├── 1 <- [!= 0] := 120 @@ -17,7 +17,7 @@ │ │ ├── byte-order <- _ |...| _ := { be := () } │ │ ├── magic <- match byte-order { ... } := 42 │ │ ├── offset <- match byte-order { ... } := 8 - │ │ └── ifd <- with-relative-offset (offset - 8) (match byte-order { ... }) := + │ │ └── ifd <- with-input (drop-bytes (offset - 8) (match byte-order { ... })) := │ │ ├── num-fields <- u16be := 7 │ │ ├── fields <- repeat-count num-fields { ... } := │ │ │ ├── 0 <- { ... } := @@ -74,7 +74,7 @@ │ │ │ └── app13 <- { ... } := │ │ │ ├── marker <- map _.1 (...) := 237 │ │ │ ├── length <- u16be := 10600 - │ │ │ └── data <- slice (length - 2) (repeat u8) := + │ │ │ └── data <- take-bytes (length - 2) (repeat u8) := │ │ │ ├── 0 <- u8 := 80 │ │ │ ├── 1 <- u8 := 104 │ │ │ ├── 2 <- u8 := 111 @@ -91,7 +91,7 @@ │ │ │ └── app1 <- { ... } := │ │ │ ├── marker <- map _.1 (...) := 225 │ │ │ ├── length <- u16be := 4429 - │ │ │ └── data <- slice (length - 2) { ... } := + │ │ │ └── data <- take-bytes (length - 2) { ... } := │ │ │ ├── identifier <- map _.string { ... } := │ │ │ │ ├── 0 <- [!= 0] := 104 │ │ │ │ ├── 1 <- [!= 0] := 116 @@ -123,7 +123,7 @@ │ │ │ └── app2 <- { ... } := │ │ │ ├── marker <- map _.1 (...) := 226 │ │ │ ├── length <- u16be := 576 - │ │ │ └── data <- slice (length - 2) (repeat u8) := + │ │ │ └── data <- take-bytes (length - 2) (repeat u8) := │ │ │ ├── 0 <- u8 := 73 │ │ │ ├── 1 <- u8 := 67 │ │ │ ├── 2 <- u8 := 67 @@ -140,7 +140,7 @@ │ │ │ └── app14 <- { ... } := │ │ │ ├── marker <- map _.1 (...) := 238 │ │ │ ├── length <- u16be := 14 - │ │ │ └── data <- slice (length - 2) (repeat u8) := + │ │ │ └── data <- take-bytes (length - 2) (repeat u8) := │ │ │ ├── 0 <- u8 := 65 │ │ │ ├── 1 <- u8 := 100 │ │ │ ├── 2 <- u8 := 111 @@ -157,7 +157,7 @@ │ │ └── dqt <- { ... } := │ │ ├── marker <- map _.1 (...) := 219 │ │ ├── length <- u16be := 132 - │ │ └── data <- slice (length - 2) { ... } := + │ │ └── data <- take-bytes (length - 2) { ... } := │ │ ├── precision-table-id <- u8 := 0 │ │ └── elements <- repeat u8 := │ │ ├── 0 <- u8 := 2 @@ -176,7 +176,7 @@ │ │ └── sof0 <- { ... } := │ │ ├── marker <- map _.1 (...) := 192 │ │ ├── length <- u16be := 17 - │ │ └── data <- slice (length - 2) { ... } := + │ │ └── data <- take-bytes (length - 2) { ... } := │ │ ├── sample-precision <- u8 := 8 │ │ ├── num-lines <- u16be := 709 │ │ ├── num-samples-per-line <- u16be := 709 @@ -200,13 +200,13 @@ │ │ │ │ └── dri <- { ... } := │ │ │ │ ├── marker <- map _.1 (...) := 221 │ │ │ │ ├── length <- u16be := 4 - │ │ │ │ └── data <- slice (length - 2) { ... } := + │ │ │ │ └── data <- take-bytes (length - 2) { ... } := │ │ │ │ └── restart-interval <- u16be := 89 │ │ │ └── 1 <- _ |...| _ := │ │ │ └── dht <- { ... } := │ │ │ ├── marker <- map _.1 (...) := 196 │ │ │ ├── length <- u16be := 418 - │ │ │ └── data <- slice (length - 2) { ... } := + │ │ │ └── data <- take-bytes (length - 2) { ... } := │ │ │ ├── class-table-id <- u8 := 0 │ │ │ ├── num-codes <- repeat-count 16 u8 := │ │ │ │ ├── 0 <- u8 := 0 @@ -237,7 +237,7 @@ │ │ ├── sos <- { ... } := │ │ │ ├── marker <- map _.1 (...) := 218 │ │ │ ├── length <- u16be := 12 - │ │ │ └── data <- slice (length - 2) { ... } := + │ │ │ └── data <- take-bytes (length - 2) { ... } := │ │ │ ├── num-image-components <- u8 := 3 │ │ │ ├── image-components <- repeat-count num-image-components { ... } := │ │ │ │ ├── 0 <- { ... } :=