Skip to content

Commit e6c30e6

Browse files
committed
[stats] compute cardinality for identifiers using hyperloglog
1 parent 0a2347f commit e6c30e6

26 files changed

Lines changed: 718 additions & 99 deletions

NEWS.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,16 @@ Features:
5757
the raw value to the base unit implied by `suffix`
5858
e.g. a field storing milliseconds with `"suffix": "s"`
5959
declares `"divisor": 1000`.
60+
* The details overlay now shows per-column statistics for
61+
the focused message. Numeric columns get a `min..max of
62+
N` range summary on the value line, plus a `p50/p90/p99`
63+
percentile sub-line for columns with enough samples to
64+
characterize the distribution shape. Identifier and
65+
metrics-text columns get an estimated distinct-value
66+
count (`~K distinct of N`). Distinct counts are computed
67+
from a HyperLogLog sketch (~4 KB per text column,
68+
~1.6% standard error). Stats render in the column's
69+
declared unit when one is set.
6070

6171
Interface Changes:
6272
* Moving horizontally now defaults to moving to the

src/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -838,6 +838,8 @@ add_library(
838838
third-party/date/include/date/tz.h
839839
third-party/date/include/date/islamic.h
840840

841+
third-party/cpp-HyperLogLog/include/hyperloglog.hpp
842+
841843
third-party/digestible/include/digestible/digestible.h
842844

843845
third-party/intervaltree/IntervalTree.h
@@ -872,6 +874,7 @@ set(lnav_SRCS lnav.cc file_vtab.cc all_ids_vtabs.cc breakpoint_vtab.cc metrics_v
872874
target_include_directories(diag PUBLIC . fmtlib ${CMAKE_CURRENT_BINARY_DIR}
873875
third-party
874876
third-party/base64/include
877+
third-party/cpp-HyperLogLog/include
875878
third-party/date/include
876879
third-party/digestible/include
877880
third-party/lnav-rs-ext

src/Makefile.am

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ AM_CPPFLAGS = \
181181
-I$(srcdir)/fmtlib \
182182
-I$(srcdir)/third-party \
183183
-I$(srcdir)/third-party/base64/include \
184+
-I$(srcdir)/third-party/cpp-HyperLogLog/include \
184185
-I$(srcdir)/third-party/date/include \
185186
-I$(srcdir)/third-party/digestible/include \
186187
-I$(srcdir)/third-party/notcurses/include \
@@ -459,6 +460,7 @@ THIRD_PARTY_SRCS = \
459460
third-party/CLI/Split.hpp \
460461
third-party/CLI/TypeTools.hpp \
461462
third-party/CLI/ConfigFwd.hpp \
463+
third-party/cpp-HyperLogLog/include/hyperloglog.hpp \
462464
third-party/digestible/include/digestible/digestible.h \
463465
third-party/doctest-root/doctest/doctest.h \
464466
third-party/intervaltree/IntervalTree.h \

src/field_overlay_source.cc

Lines changed: 63 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,7 @@ field_overlay_source::build_field_lines(const listview_curses& lv,
540540
auto* curr_elf = dynamic_cast<external_log_format*>(curr_format);
541541
const auto format_name = curr_format->get_name().to_string();
542542
attr_line_t al;
543-
auto value_str = lv.to_string();
543+
auto value_str = lv.to_humanized_string();
544544

545545
if (curr_format != last_format) {
546546
this->fos_lines.emplace_back(" Known message fields for table "
@@ -554,6 +554,7 @@ field_overlay_source::build_field_lines(const listview_curses& lv,
554554

555555
std::string field_name, orig_field_name;
556556
line_range hl_range;
557+
size_t prefix_len = 0;
557558
al.append(" ").append("|", VC_GRAPHIC.value(NCACS_LTEE)).append(" ");
558559
if (meta.lvm_struct_name.empty()) {
559560
if (curr_elf && curr_elf->elf_body_field == meta.lvm_name) {
@@ -585,11 +586,11 @@ field_overlay_source::build_field_lines(const listview_curses& lv,
585586
al.append(":bar_chart:"_emoji).append(" ");
586587
break;
587588
}
588-
auto prefix_len = al.column_width();
589+
prefix_len = al.column_width() + this->fos_known_key_size;
589590
hl_range.lr_start = al.get_string().length();
590591
al.append(field_name);
591592
hl_range.lr_end = al.get_string().length();
592-
al.pad_to(prefix_len + this->fos_known_key_size);
593+
al.pad_to(prefix_len);
593594

594595
this->fos_row_to_field_meta.emplace(this->fos_lines.size(),
595596
row_info{meta, value_str});
@@ -599,6 +600,7 @@ field_overlay_source::build_field_lines(const listview_curses& lv,
599600
meta.lvm_name.get());
600601
hl_range.lr_start = al.get_string().length();
601602
al.append(jget_str.in());
603+
prefix_len = al.column_width();
602604
hl_range.lr_end = al.get_string().length();
603605

604606
this->fos_row_to_field_meta.emplace(
@@ -607,29 +609,6 @@ field_overlay_source::build_field_lines(const listview_curses& lv,
607609
readline_sql_highlighter_int(
608610
al, lnav::sql::dialect::sqlite, std::nullopt, hl_range);
609611

610-
if (!meta.lvm_unit_suffix.empty()) {
611-
std::optional<double> numeric;
612-
switch (meta.lvm_kind) {
613-
case value_kind_t::VALUE_INTEGER:
614-
numeric = (double) lv.lv_value.i;
615-
break;
616-
case value_kind_t::VALUE_FLOAT:
617-
numeric = lv.lv_value.d;
618-
break;
619-
default:
620-
break;
621-
}
622-
if (numeric) {
623-
if (meta.lvm_unit_divisor != 0.0
624-
&& meta.lvm_unit_divisor != 1.0)
625-
{
626-
*numeric /= meta.lvm_unit_divisor;
627-
}
628-
value_str = humanize::format(
629-
*numeric, meta.lvm_unit_suffix.to_string_fragment());
630-
}
631-
}
632-
633612
if (meta.lvm_kind == value_kind_t::VALUE_TIMESTAMP) {
634613
auto dts = curr_format->build_time_scanner();
635614
exttm tm;
@@ -653,8 +632,66 @@ field_overlay_source::build_field_lines(const listview_curses& lv,
653632

654633
al.append(" = ").append(scrub_ws(value_str.c_str()));
655634

635+
// Per-column stats summary: numeric columns get a min..max
636+
// range and total count; text columns get an HLL-estimated
637+
// distinct count. The numeric range renders in the column's unit
638+
// if one is declared, mirroring the value's own formatting above.
639+
const logline_value_stats* stats = nullptr;
640+
const auto* curr_lf = this->fos_log_helper.ldh_file.get();
641+
if (curr_lf != nullptr) {
642+
stats = curr_lf->stats_for_value(meta.lvm_name);
643+
}
644+
645+
if (stats != nullptr) {
646+
std::string summary;
647+
if (stats->lvs_count > 0) {
648+
summary
649+
= fmt::format(FMT_STRING(" {}..{} of {}"),
650+
meta.to_humanized_value(stats->lvs_min_value),
651+
meta.to_humanized_value(stats->lvs_max_value),
652+
stats->lvs_count);
653+
} else if (auto est = stats->distinct_estimate(); est) {
654+
summary = fmt::format(FMT_STRING(" ~{:.0f} distinct of {}"),
655+
est.value(),
656+
stats->lvs_text_count);
657+
}
658+
if (!summary.empty()) {
659+
al.append(attr_line_t(summary).with_attr_for_all(
660+
VC_ROLE.value(role_t::VCR_COMMENT)));
661+
}
662+
}
663+
656664
this->fos_lines.emplace_back(al);
657665

666+
// Numeric percentile sub-line: typical / tail / extreme.
667+
// Suppressed when the sample is too small to be statistically
668+
// meaningful, when the distribution is degenerate (single
669+
// value), or when the upper percentiles all collapse to the
670+
// max — in those cases the inline `min..max of N` already
671+
// tells the whole story.
672+
if (stats != nullptr && stats->lvs_count >= 20
673+
&& stats->lvs_min_value < stats->lvs_max_value)
674+
{
675+
const auto p50 = stats->lvs_tdigest.quantile(50);
676+
const auto p90 = stats->lvs_tdigest.quantile(90);
677+
const auto p99 = stats->lvs_tdigest.quantile(99);
678+
if (!(p50 == p99 && p99 == stats->lvs_max_value)) {
679+
attr_line_t pct_line;
680+
pct_line.append(" ")
681+
.with_attr(string_attr(line_range{1, 2},
682+
VC_GRAPHIC.value(NCACS_VLINE)))
683+
.with_attr(string_attr(line_range{1, 2},
684+
VC_ROLE.value(role_t::VCR_COMMENT)))
685+
.pad_to(prefix_len + 5)
686+
.append(fmt::format(FMT_STRING("p50={} p90={} p99={}"),
687+
meta.to_humanized_value(p50),
688+
meta.to_humanized_value(p90),
689+
meta.to_humanized_value(p99)));
690+
pct_line.with_attr_for_all(VC_ROLE.value(role_t::VCR_COMMENT));
691+
this->fos_lines.emplace_back(pct_line);
692+
}
693+
}
694+
658695
if (meta.lvm_kind == value_kind_t::VALUE_STRUCT) {
659696
json_string js = extract(value_str.c_str());
660697

src/log_format.cc

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,30 @@ logline_value_meta::to_chart_type() const
390390
return retval;
391391
}
392392

393+
std::string
394+
logline_value_meta::to_humanized_value(int64_t i) const
395+
{
396+
if (!this->lvm_unit_suffix.empty()
397+
|| (this->lvm_unit_divisor != 0.0 && this->lvm_unit_divisor != 1.0))
398+
{
399+
double d = i;
400+
if (this->lvm_unit_divisor != 0.0 && this->lvm_unit_divisor != 1.0) {
401+
d /= this->lvm_unit_divisor;
402+
}
403+
return humanize::format(d, this->lvm_unit_suffix.to_string_fragment());
404+
}
405+
return fmt::to_string(i);
406+
}
407+
408+
std::string
409+
logline_value_meta::to_humanized_value(double d) const
410+
{
411+
if (this->lvm_unit_divisor != 0.0 && this->lvm_unit_divisor != 1.0) {
412+
d /= this->lvm_unit_divisor;
413+
}
414+
return humanize::format(d, this->lvm_unit_suffix.to_string_fragment());
415+
}
416+
393417
struct line_range
394418
logline_value::origin_in_full_msg(const char* msg, ssize_t len) const
395419
{
@@ -592,6 +616,19 @@ logline_value::to_string() const
592616
return {buffer};
593617
}
594618

619+
std::string
620+
logline_value::to_humanized_string() const
621+
{
622+
switch (this->lv_meta.lvm_kind) {
623+
case value_kind_t::VALUE_INTEGER:
624+
return this->lv_meta.to_humanized_value(this->lv_value.i);
625+
case value_kind_t::VALUE_FLOAT:
626+
return this->lv_meta.to_humanized_value(this->lv_value.d);
627+
default:
628+
return this->to_string();
629+
}
630+
}
631+
595632
string_fragment
596633
logline_value::to_string_fragment(ArenaAlloc::Alloc<char>& alloc) const
597634
{
@@ -2291,6 +2328,9 @@ external_log_format::scan_tabular(logfile& lf,
22912328
if (field_sf.length() > lvs.lvs_width) {
22922329
lvs.lvs_width = field_sf.length();
22932330
}
2331+
if (vd->vd_meta.lvm_identifier && !field_sf.empty()) {
2332+
lvs.add_text(field_sf);
2333+
}
22942334
}
22952335
// CSV cells may carry a `""`-escaped double-quote literal;
22962336
// collapse those before any downstream comparison or
@@ -2603,6 +2643,15 @@ external_log_format::scan(logfile& lf,
26032643
if (cap_size > lvs.lvs_width) {
26042644
lvs.lvs_width = cap_size;
26052645
}
2646+
// Identifier fields are explicitly excluded from numeric
2647+
// ingest in `ingest_numeric_value`; route them to the
2648+
// distinct-count estimator instead so columns like opid,
2649+
// hostname, request_id surface a useful cardinality.
2650+
if (ivd.ivd_value_def->vd_meta.lvm_identifier) {
2651+
if (auto cap = md[ivd.ivd_index]) {
2652+
lvs.add_text(*cap);
2653+
}
2654+
}
26062655
}
26072656

26082657
for (auto value_index : fpat->p_numeric_value_indexes) {
@@ -5967,6 +6016,11 @@ external_log_format::value_line_count(scan_batch_context& sbc,
59676016
}
59686017
if (val) {
59696018
lvs.add_value(val.value());
6019+
} else if (vd->vd_meta.lvm_identifier && str != nullptr && len > 0) {
6020+
// Identifier fields parsed as strings (no numeric `val`)
6021+
// contribute to the column's distinct-count estimate
6022+
// instead, mirroring the regex/tabular paths.
6023+
lvs.add_text(string_fragment::from_bytes(str, len));
59706024
}
59716025
}
59726026

@@ -6206,6 +6260,18 @@ logline_value_stats::merge(const logline_value_stats& other)
62066260
this->lvs_width = other.lvs_width;
62076261
}
62086262

6263+
// Distinct-count merge runs before the lvs_count == 0 short-circuit
6264+
// because a text-only column has count == 0 but may still carry an
6265+
// HLL whose registers we need to fold in.
6266+
this->lvs_text_count += other.lvs_text_count;
6267+
if (other.lvs_distinct) {
6268+
if (this->lvs_distinct) {
6269+
this->lvs_distinct->merge(other.lvs_distinct.value());
6270+
} else {
6271+
this->lvs_distinct = other.lvs_distinct;
6272+
}
6273+
}
6274+
62096275
if (other.lvs_count == 0) {
62106276
return;
62116277
}
@@ -6239,6 +6305,31 @@ logline_value_stats::add_value(double value)
62396305
this->lvs_tdigest.insert(value);
62406306
}
62416307

6308+
void
6309+
logline_value_stats::add_text(string_fragment sf)
6310+
{
6311+
if (!this->lvs_distinct) {
6312+
this->lvs_distinct.emplace(12);
6313+
}
6314+
this->lvs_distinct->add(sf.data(), static_cast<uint32_t>(sf.length()));
6315+
this->lvs_text_count += 1;
6316+
}
6317+
6318+
std::optional<double>
6319+
logline_value_stats::distinct_estimate() const
6320+
{
6321+
if (!this->lvs_distinct) {
6322+
return std::nullopt;
6323+
}
6324+
return this->lvs_distinct->estimate();
6325+
}
6326+
6327+
void
6328+
logline_value_stats::finalize()
6329+
{
6330+
this->lvs_tdigest.merge();
6331+
}
6332+
62426333
std::vector<logline_value_meta>
62436334
external_log_format::get_value_metadata() const
62446335
{

src/log_format_fwd.hh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
#include "base/time_util.hh"
5353
#include "byte_array.hh"
5454
#include "digestible/digestible.h"
55+
#include "hyperloglog.hpp"
5556
#include "log_level.hh"
5657
#include "pcrepp/pcre2pp.hh"
5758
#include "robin_hood/robin_hood.h"
@@ -181,12 +182,32 @@ struct logline_value_stats {
181182

182183
void add_value(double value);
183184

185+
// Feed a non-numeric cell into the per-column distinct-count
186+
// estimator. Lazy-initializes `lvs_distinct` on first call so
187+
// numeric-only columns don't pay the ~4KB register-vector cost.
188+
void add_text(string_fragment sf);
189+
190+
// HLL cardinality estimate for the column, or std::nullopt if no
191+
// text cells were ever fed in.
192+
std::optional<double> distinct_estimate() const;
193+
194+
// Compact the t-digest's pending buffer into ordered centroids so
195+
// `lvs_tdigest.quantile(p)` returns accurate percentiles. Call
196+
// once after the per-file scan completes; no-op for repeated
197+
// calls.
198+
void finalize();
199+
184200
int64_t lvs_width{0};
185201
int64_t lvs_count{0};
202+
int64_t lvs_text_count{0};
186203
double lvs_total{0};
187204
double lvs_min_value{std::numeric_limits<double>::max()};
188205
double lvs_max_value{-std::numeric_limits<double>::max()};
189206
digestible::tdigest<double> lvs_tdigest{200};
207+
// p=12 → ~4KB registers, ~1.6% standard error. Right size for
208+
// typical metrics text columns (HTTP methods, status codes,
209+
// hostnames, op IDs all ≪ 2^52 — the post-XXH3 ceiling).
210+
std::optional<hll::HyperLogLog> lvs_distinct;
190211
};
191212

192213
struct pattern_for_lines {
@@ -624,6 +645,10 @@ struct logline_value_meta {
624645

625646
chart_type_t to_chart_type() const;
626647

648+
std::string to_humanized_value(int64_t i) const;
649+
650+
std::string to_humanized_value(double d) const;
651+
627652
intern_string_t lvm_name;
628653
value_kind_t lvm_kind;
629654
column_t lvm_column{external_column{}};
@@ -695,6 +720,8 @@ public:
695720

696721
std::string to_string() const;
697722

723+
std::string to_humanized_string() const;
724+
698725
string_fragment to_string_fragment(ArenaAlloc::Alloc<char>& alloc) const;
699726

700727
const char* text_value() const;

0 commit comments

Comments
 (0)