Skip to content

Commit dccfb24

Browse files
authored
Add tracing support to the compressor (#7385)
## Summary Tracking issue: #7216 We have very little observability into the compressor. When we are debugging, we don't really have any idea of what schemes the compressor is trying, how good or how bad estimates are, how reliable sampling is, how the cascading paths look, etc. This change adds structured `tracing` support to `vortex-compressor`. The compressor now emits a top-level `compress` span and decision/debug events on the `vortex_compressor::encode` target, so a normal tracing subscriber can see what the compressor sampled, selected, accepted/rejected, and where nested failures happened. The `scheme.compress_result` event reports `scheme`, `before_nbytes`, `after_nbytes`, `estimated_ratio` when available, `actual_ratio` when available, and `accepted`. Sampling is recorded through `sample.result`; compression failures are recorded through `scheme.compress_failed` / `sample.compress_failed` with `cascade_path` and `cascade_depth`. Zero-byte outputs intentionally omit ratio fields instead of logging infinities. This also adds JSON formatting to the benchmark logging setup via `--log-format json`, which makes `data-gen` / `compress-bench` output usable as JSONL. One useful workflow is to generate TPC-H data with compressor logs enabled and use `jq` to find over-optimistic estimates that were rejected. <details> <summary>Example jq query for rejected over-estimates</summary> ```fish set LOG data-gen.jsonl jq -R -s -r ' def rows: split("\n") | map(fromjson? // empty); def r: if type == "number" then ((. * 1000 | round) / 1000) else . end; ([ "estimated_over_actual", "scheme", "estimated_ratio", "actual_ratio", "before_nbytes", "after_nbytes", "extra_bytes", "span_dtype", "span_len" ] | @TSV), ( rows | map(select(.target == "vortex_compressor::encode")) | map(select(.fields.message == "scheme.compress_result")) | map(select(.fields.accepted == false)) | map(select(.fields.estimated_ratio != null and .fields.actual_ratio != null)) | map(select(.fields.estimated_ratio > .fields.actual_ratio)) | map(.fields as $f | { scheme: $f.scheme, estimated_ratio: $f.estimated_ratio, actual_ratio: $f.actual_ratio, estimated_over_actual: ($f.estimated_ratio / $f.actual_ratio), before_nbytes: $f.before_nbytes, after_nbytes: $f.after_nbytes, extra_bytes: ($f.after_nbytes - $f.before_nbytes), span_dtype: (.span.dtype // ""), span_len: (.span.len // "") }) | sort_by(.estimated_over_actual) | reverse | .[:50][] | [ (.estimated_over_actual | r), .scheme, (.estimated_ratio | r), (.actual_ratio | r), .before_nbytes, .after_nbytes, .extra_bytes, .span_dtype, .span_len ] | @TSV ) ' $LOG ``` ``` estimated_over_actual scheme estimated_ratio actual_ratio before_nbytes after_nbytes extra_bytes span_dtype span_len 512 vortex.int.for 1.6 0.003 2 640 638 utf8? 2 512 vortex.int.for 2.667 0.005 2 384 382 utf8? 2 512 vortex.int.for 5.333 0.01 8 768 760 decimal(15,2)? 2 512 vortex.int.for 4.571 0.009 8 896 888 decimal(15,2)? 2 512 vortex.int.for 4 0.008 8 1024 1016 decimal(15,2)? 2 512 vortex.int.for 1.6 0.003 2 640 638 utf8? 2 512 vortex.int.for 8 0.016 2 128 126 utf8? 2 512 vortex.int.for 2.56 0.005 16 3200 3184 i64? 2 512 vortex.int.for 5.818 0.011 16 1408 1392 i64? 2 256 vortex.int.for 2 0.008 4 512 508 utf8 4 256 vortex.int.for 2 0.008 4 512 508 utf8 4 256 vortex.int.for 2 0.008 4 512 508 utf8 8192 256 vortex.int.for 2 0.008 4 512 508 utf8 8192 256 vortex.int.for 2 0.008 4 512 508 utf8 8192 256 vortex.int.for 2 0.008 4 512 508 utf8 8192 256 vortex.int.for 2 0.008 4 512 508 utf8 8192 256 vortex.int.for 2 0.008 4 512 508 utf8 8192 204.8 vortex.int.for 4 0.02 5 256 251 utf8 5 204.8 vortex.int.for 4 0.02 5 256 251 utf8 8192 204.8 vortex.int.for 4 0.02 5 256 251 utf8 8192 204.8 vortex.int.for 2.667 0.013 5 384 379 utf8 5 204.8 vortex.int.for 2.667 0.013 5 384 379 utf8 5 146.286 vortex.int.for 1.333 0.009 7 768 761 utf8? 19 146.286 vortex.int.for 1.333 0.009 7 768 761 utf8? 19 128 vortex.int.for 2 0.016 8 512 504 utf8? 98 128 vortex.int.for 2 0.016 8 512 504 utf8? 98 113.778 vortex.int.for 1.6 0.014 18 1280 1262 i32? 184 113.778 vortex.int.for 1.6 0.014 18 1280 1262 i32? 184 113.778 vortex.int.for 32 0.281 36 128 92 i32? 184 113.778 vortex.int.for 32 0.281 36 128 92 i32? 184 64 vortex.int.for 2 0.031 16 512 496 utf8? 98 64 vortex.int.for 2 0.031 16 512 496 utf8? 98 64 vortex.int.for 2 0.031 16 512 496 utf8? 98 64 vortex.int.for 2 0.031 16 512 496 utf8? 98 53.895 vortex.int.for 1.6 0.03 19 640 621 utf8? 19 53.895 vortex.int.for 3.2 0.059 76 1280 1204 decimal(15,2)? 19 53.895 vortex.int.for 2.909 0.054 76 1408 1332 decimal(15,2)? 19 53.895 vortex.int.for 1.6 0.03 19 640 621 utf8? 19 53.895 vortex.int.for 1.6 0.03 19 640 621 utf8? 19 53.895 vortex.int.for 32 0.594 76 128 52 i32? 184 53.895 vortex.int.for 32 0.594 76 128 52 i32? 184 53.895 vortex.int.for 4 0.074 76 1024 948 decimal(15,2)? 19 51.2 vortex.int.for 2 0.039 20 512 492 utf8? 98 51.2 vortex.int.for 2 0.039 20 512 492 utf8? 98 51.2 vortex.int.for 2 0.039 20 512 492 utf8? 98 51.2 vortex.int.for 2 0.039 20 512 492 utf8? 98 51.2 vortex.int.for 2 0.039 20 512 492 utf8? 98 51.2 vortex.int.for 2 0.039 20 512 492 utf8? 98 40.96 vortex.int.for 2 0.049 25 512 487 utf8? 25 40.96 vortex.int.for 2 0.049 25 512 487 utf8? 25 ``` </details> ## Testing Some basic tracing tests (that was claude-generated). --------- Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent 91f1c2f commit dccfb24

13 files changed

Lines changed: 890 additions & 159 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

benchmarks/compress-bench/src/main.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use regex::Regex;
1515
use vortex::utils::aliases::hash_map::HashMap;
1616
use vortex_bench::Engine;
1717
use vortex_bench::Format;
18+
use vortex_bench::LogFormat;
1819
use vortex_bench::Target;
1920
use vortex_bench::compress::CompressMeasurements;
2021
use vortex_bench::compress::CompressOp;
@@ -39,7 +40,7 @@ use vortex_bench::public_bi::PBIDataset::CMSprovider;
3940
use vortex_bench::public_bi::PBIDataset::Euro2016;
4041
use vortex_bench::public_bi::PBIDataset::Food;
4142
use vortex_bench::public_bi::PBIDataset::HashTags;
42-
use vortex_bench::setup_logging_and_tracing;
43+
use vortex_bench::setup_logging_and_tracing_with_format;
4344

4445
#[derive(Parser, Debug)]
4546
#[command(version, about, long_about = None)]
@@ -69,13 +70,17 @@ struct Args {
6970
output_path: Option<PathBuf>,
7071
#[arg(long)]
7172
tracing: bool,
73+
/// Format for the primary stderr log sink. `text` is the default human-readable format;
74+
/// `json` emits one JSON object per event, suitable for piping into `jq`.
75+
#[arg(long, value_enum, default_value_t = LogFormat::Text)]
76+
log_format: LogFormat,
7277
}
7378

7479
#[tokio::main]
7580
async fn main() -> anyhow::Result<()> {
7681
let args = Args::parse();
7782

78-
setup_logging_and_tracing(args.verbose, args.tracing)?;
83+
setup_logging_and_tracing_with_format(args.verbose, args.tracing, args.log_format)?;
7984

8085
run_compress(
8186
args.iterations,

vortex-bench/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ tracing = { workspace = true }
6161
tracing-perfetto = { workspace = true }
6262
tracing-subscriber = { workspace = true, features = [
6363
"env-filter",
64+
"json",
6465
"tracing-log",
6566
] }
6667
url = { workspace = true }

vortex-bench/src/bin/data-gen.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@ use vortex_bench::Benchmark;
1717
use vortex_bench::BenchmarkArg;
1818
use vortex_bench::CompactionStrategy;
1919
use vortex_bench::Format;
20+
use vortex_bench::LogFormat;
2021
use vortex_bench::Opt;
2122
use vortex_bench::Opts;
2223
use vortex_bench::conversions::convert_parquet_directory_to_vortex;
2324
use vortex_bench::create_benchmark;
2425
use vortex_bench::generate_duckdb_registration_sql;
25-
use vortex_bench::setup_logging_and_tracing;
26+
use vortex_bench::setup_logging_and_tracing_with_format;
2627

2728
#[derive(Parser)]
2829
#[command(name = "bench-data-gen")]
@@ -37,6 +38,11 @@ struct Args {
3738
#[arg(long)]
3839
tracing: bool,
3940

41+
/// Format for the primary stderr log sink. `text` is the default human-readable format;
42+
/// `json` emits one JSON object per event, suitable for piping into `jq`.
43+
#[arg(long, value_enum, default_value_t = LogFormat::Text)]
44+
log_format: LogFormat,
45+
4046
#[arg(long, value_delimiter = ',', value_parser = value_parser!(Format))]
4147
formats: Vec<Format>,
4248

@@ -49,7 +55,7 @@ async fn main() -> anyhow::Result<()> {
4955
let args = Args::parse();
5056
let opts = Opts::from(args.options);
5157

52-
setup_logging_and_tracing(args.verbose, args.tracing)?;
58+
setup_logging_and_tracing_with_format(args.verbose, args.tracing, args.log_format)?;
5359

5460
let benchmark = create_benchmark(args.benchmark, &opts)?;
5561

vortex-bench/src/utils/logging.rs

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,34 +4,77 @@
44
use std::fs::File;
55
use std::io::IsTerminal;
66

7+
use clap::ValueEnum;
78
use tracing::level_filters::LevelFilter;
89
use tracing_perfetto::PerfettoLayer;
910
use tracing_subscriber::EnvFilter;
11+
use tracing_subscriber::Layer;
1012
use tracing_subscriber::prelude::*;
1113

12-
/// Initialize logging/tracing for a benchmark
13-
pub fn setup_logging_and_tracing(verbose: bool, tracing: bool) -> anyhow::Result<()> {
14+
/// Format for the primary stderr log sink.
15+
///
16+
/// `Text` is the default human-readable formatter matching the historical behavior of this crate.
17+
/// `Json` emits one newline-delimited JSON object per event, suitable for piping into `jq` or a log
18+
/// aggregator.
19+
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, ValueEnum)]
20+
pub enum LogFormat {
21+
#[default]
22+
Text,
23+
Json,
24+
}
25+
26+
/// Initialize logging/tracing for a benchmark, hardcoding [`LogFormat::Text`].
27+
///
28+
/// See [`setup_logging_and_tracing_with_format`] if you want to select JSON
29+
/// output from a CLI flag.
30+
pub fn setup_logging_and_tracing(verbose: bool, perfetto: bool) -> anyhow::Result<()> {
31+
setup_logging_and_tracing_with_format(verbose, perfetto, LogFormat::Text)
32+
}
33+
34+
/// Initialize logging/tracing for a benchmark with an explicit stderr format.
35+
///
36+
/// - `verbose`: when `RUST_LOG` is unset, raises the default filter from `INFO` to `TRACE`. Has no
37+
/// effect when `RUST_LOG` is set (the env var wins).
38+
/// - `perfetto`: when `true`, additionally attaches a [`tracing_perfetto::PerfettoLayer`] that
39+
/// writes span begin/end events to `trace.json` in the current directory. Intended to be loaded
40+
/// into the Perfetto UI for flamegraph visualization.
41+
/// - `format`: controls the primary stderr sink's formatting. See [`LogFormat`].
42+
pub fn setup_logging_and_tracing_with_format(
43+
verbose: bool,
44+
perfetto: bool,
45+
format: LogFormat,
46+
) -> anyhow::Result<()> {
1447
let filter = default_env_filter(verbose);
1548

16-
let fmt_layer = tracing_subscriber::fmt::layer()
17-
.with_writer(std::io::stderr)
18-
.with_level(true)
19-
.with_file(true)
20-
.with_line_number(true)
21-
.with_ansi(std::io::stderr().is_terminal());
49+
let perfetto_layer = perfetto
50+
.then(|| {
51+
Ok::<_, anyhow::Error>(
52+
PerfettoLayer::new(File::create("trace.json")?).with_debug_annotations(true),
53+
)
54+
})
55+
.transpose()?;
56+
57+
// `fmt::layer()` and `fmt::layer().json()` produce different concrete types,
58+
// so erase each to a `dyn Layer` via `.boxed()` and keep the registry uniform.
59+
let fmt_layer: Box<dyn Layer<_> + Send + Sync> = match format {
60+
LogFormat::Text => tracing_subscriber::fmt::layer()
61+
.with_writer(std::io::stderr)
62+
.with_level(true)
63+
.with_file(true)
64+
.with_line_number(true)
65+
.with_ansi(std::io::stderr().is_terminal())
66+
.boxed(),
67+
LogFormat::Json => tracing_subscriber::fmt::layer()
68+
.json()
69+
.with_writer(std::io::stderr)
70+
.with_current_span(true)
71+
.with_span_list(true)
72+
.boxed(),
73+
};
2274

2375
tracing_subscriber::registry()
2476
.with(filter)
25-
.with(
26-
tracing
27-
.then(|| {
28-
Ok::<_, anyhow::Error>(
29-
PerfettoLayer::new(File::create("trace.json")?)
30-
.with_debug_annotations(true),
31-
)
32-
})
33-
.transpose()?,
34-
)
77+
.with(perfetto_layer)
3578
.with(fmt_layer)
3679
.init();
3780

vortex-btrblocks/src/schemes/integer.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -307,8 +307,6 @@ impl Scheme for ZigZagScheme {
307307

308308
let compressed = compressor.compress_child(&encoded.into_array(), &ctx, self.id(), 0)?;
309309

310-
tracing::debug!("zigzag output: {}", compressed.encoding_id());
311-
312310
Ok(ZigZag::try_new(compressed)?.into_array())
313311
}
314312
}

vortex-compressor/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ num-traits = { workspace = true }
1919
parking_lot = { workspace = true }
2020
rand = { workspace = true }
2121
rustc-hash = { workspace = true }
22-
tracing = { workspace = true }
22+
tracing = { workspace = true, features = ["std", "attributes"] }
2323
vortex-array = { workspace = true }
2424
vortex-buffer = { workspace = true }
2525
vortex-error = { workspace = true }
@@ -29,6 +29,7 @@ vortex-utils = { workspace = true }
2929
[dev-dependencies]
3030
divan = { workspace = true }
3131
rstest = { workspace = true }
32+
tracing-subscriber = { workspace = true, features = ["env-filter"] }
3233
vortex-array = { workspace = true, features = ["_test-harness"] }
3334

3435
[lints]

0 commit comments

Comments
 (0)