@@ -37,8 +37,9 @@ use futures_util::FutureExt;
3737use serde:: { Deserialize , Serialize } ;
3838use tokio:: runtime:: Runtime ;
3939use vespera_inprocess:: {
40- RequestChunk , RequestEnvelope , dispatch_bidirectional_streaming, dispatch_from_bytes,
41- dispatch_owned, dispatch_streaming_async, dispatch_typed, register_app,
40+ DirectWriteResult , RequestChunk , RequestEnvelope , dispatch_bidirectional_streaming,
41+ dispatch_from_bytes, dispatch_into, dispatch_owned, dispatch_streaming_async, dispatch_typed,
42+ register_app,
4243} ;
4344
4445// ── Test fixtures ────────────────────────────────────────────────────
@@ -277,6 +278,96 @@ fn bench_wire_path(c: &mut Criterion) {
277278 drop ( runtime) ;
278279}
279280
281+ /// Raw-byte isolation: `dispatch_from_bytes` against `/echo/bytes`,
282+ /// which echoes the request body unchanged. Comparing this group with
283+ /// `wire_path` (JSON `/echo`) isolates the `serde_json`
284+ /// deserialize+reserialize cost from vespera's pure dispatch/copy
285+ /// overhead at identical body sizes.
286+ fn bench_bytes_path ( c : & mut Criterion ) {
287+ install_bench_app ( ) ;
288+
289+ let runtime = Runtime :: new ( ) . expect ( "tokio runtime" ) ;
290+ let mut group = c. benchmark_group ( "bytes_path" ) ;
291+
292+ for & body_kb in & [ 1_usize , 64 , 1024 ] {
293+ let payload = vec ! [ 0xA5u8 ; body_kb * 1024 ] ;
294+ let wire = assemble_wire (
295+ "POST" ,
296+ "/echo/bytes" ,
297+ Some ( "application/octet-stream" ) ,
298+ & payload,
299+ ) ;
300+ group. throughput ( Throughput :: Bytes ( ( body_kb * 1024 ) as u64 ) ) ;
301+
302+ group. bench_with_input (
303+ BenchmarkId :: new ( "raw_bytes_dispatch_from_bytes" , body_kb) ,
304+ & body_kb,
305+ |b, _| {
306+ b. iter ( || dispatch_from_bytes ( wire. clone ( ) , & runtime) ) ;
307+ } ,
308+ ) ;
309+ }
310+
311+ group. finish ( ) ;
312+ drop ( runtime) ;
313+ }
314+
315+ /// Direct-write A/B: `dispatch_from_bytes` (materialises the wire
316+ /// response into a fresh `Vec` per call) vs `dispatch_into` (streams
317+ /// the wire response straight into a caller-owned, preallocated buffer
318+ /// — the JNI `dispatchDirect` path). Both echo a raw byte body via
319+ /// `/echo/bytes`, so the delta isolates the response `Vec` allocation +
320+ /// final body memcpy that the direct-write path removes.
321+ ///
322+ /// The `dispatch_into` buffer is sized exactly once (outside the timed
323+ /// loop) and reused across iterations, mirroring the pooled direct
324+ /// buffer the Java bridge hands in.
325+ fn bench_direct_write_path ( c : & mut Criterion ) {
326+ install_bench_app ( ) ;
327+
328+ let runtime = Runtime :: new ( ) . expect ( "tokio runtime" ) ;
329+ let mut group = c. benchmark_group ( "direct_write_path" ) ;
330+
331+ for & body_kb in & [ 64_usize , 1024 , 4096 ] {
332+ let payload = vec ! [ 0xA5u8 ; body_kb * 1024 ] ;
333+ let wire = assemble_wire (
334+ "POST" ,
335+ "/echo/bytes" ,
336+ Some ( "application/octet-stream" ) ,
337+ & payload,
338+ ) ;
339+ group. throughput ( Throughput :: Bytes ( ( body_kb * 1024 ) as u64 ) ) ;
340+
341+ // Exact response size: one untimed probe with a generous buffer.
342+ let required = {
343+ let mut probe = vec ! [ 0u8 ; payload. len( ) + 4096 ] ;
344+ match dispatch_into ( wire. clone ( ) , & mut probe, & runtime) {
345+ DirectWriteResult :: Complete ( n) | DirectWriteResult :: Overflow ( n) => n,
346+ }
347+ } ;
348+
349+ group. bench_with_input (
350+ BenchmarkId :: new ( "materialize_dispatch_from_bytes" , body_kb) ,
351+ & body_kb,
352+ |b, _| {
353+ b. iter ( || dispatch_from_bytes ( wire. clone ( ) , & runtime) ) ;
354+ } ,
355+ ) ;
356+
357+ group. bench_with_input (
358+ BenchmarkId :: new ( "direct_write_dispatch_into" , body_kb) ,
359+ & body_kb,
360+ |b, _| {
361+ let mut out = vec ! [ 0u8 ; required] ;
362+ b. iter ( || dispatch_into ( wire. clone ( ) , & mut out, & runtime) ) ;
363+ } ,
364+ ) ;
365+ }
366+
367+ group. finish ( ) ;
368+ drop ( runtime) ;
369+ }
370+
280371/// P2 isolation (within-run A/B): default-app resolution via the
281372/// lock-free `OnceLock` fast path vs named-app resolution through the
282373/// `RwLock<HashMap>` slow path. Identical router, identical wire
@@ -547,6 +638,8 @@ criterion_group!(
547638 bench_router_path,
548639 bench_dispatch_path,
549640 bench_wire_path,
641+ bench_bytes_path,
642+ bench_direct_write_path,
550643 bench_resolve_path,
551644 bench_contended_path,
552645 bench_headers_path,
0 commit comments