Skip to content

Commit 7399129

Browse files
AIQnetLab and claude committed
fix: v25.3 — rate-govern sig-reject logs + bound RocksDB internal LOG
- consensus_crypto: per-claimed-identity log rate governor for the pre-PK-parse rejection sites (sig_all_zeros, sig_low_entropy, sig_too_short/small, sig_format_invalid, pk_size_invalid, sig_len_mismatch, sig_invalid). An external spoofer flooding garbage under a claimed genesis identity previously emitted ~13k [ERR][CONSENSUS]/[ERR][P2P] lines per node per 20h — those frames fail structural checks before the Dilithium PK is parsed so ATTACKER_PK_BLACKLIST (fingerprint-keyed) cannot suppress them. Governor keeps first 5 rejects per claimed node_id per 60s window fully visible (genuine transient faults still logged), emits one suppression notice, then stays silent with a per-window sig_reject_flood summary. REJECTION IS UNCONDITIONAL at every call site — only log volume is bounded; security semantics unchanged. Keyed per claimed node_id so a flood on one identity cannot starve another's reject budget. Bounded map (8192 cap + lazy 25% LRU).
- unified_p2p: route both [ERR][P2P] Invalid Dilithium signature sites through the same governor (shared per-identity window).
- storage: bound RocksDB's internal diagnostic LOG. Default is a single LOG file growing unbounded until DB reopen (observed ~454 MB after 27h continuous uptime, ~17 MB/h). set_max_log_file_size(64MB) + set_keep_log_file_num(10) → rolling window ~1.5 days / <=640 MB cap. This is RocksDB engine ops/diagnostics only — NOT chain data, NOT WAL, NOT consensus — verbosity (INFO) deliberately unchanged so engine forensics remain available; only unbounded growth is removed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 20d04cc commit 7399129

3 files changed

Lines changed: 198 additions & 20 deletions

File tree

core/qnet-consensus/src/consensus_crypto.rs

Lines changed: 163 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -255,6 +255,153 @@ const ATTACKER_PK_BLACKLIST_CAP: usize = 12_288;
255255
/// connection attempt because the Tier-2 check is always evaluated.
256256
const ATTACKER_PK_EVICT_FRACTION: usize = 4;
257257

258+
// ════════════════════════════════════════════════════════════════════════
259+
// SECURITY-REJECT LOG RATE GOVERNOR (v25.3)
260+
// ════════════════════════════════════════════════════════════════════════
261+
// An external attacker can flood garbage signatures that fail the cheap
262+
// structural checks (all-zero, low-entropy, bad length) which run BEFORE
263+
// the Dilithium3 public key is even parsed. Those failures cannot be
264+
// fingerprinted by `ATTACKER_PK_BLACKLIST` (no PK is extracted), so each
265+
// rejected garbage frame previously produced an unconditional
266+
// `[ERR][CONSENSUS] sig_*` line. Observed in production: ~13 000
267+
// reject-log lines / 20 h on every node a spoofer targeted — pure log
268+
// noise that can DoS the logging subsystem / fill disk while telling the
269+
// operator nothing new after the first occurrence.
270+
//
271+
// This governor rate-limits the LOG OUTPUT per claimed identity. The
272+
// REJECTION itself is unconditional at every call site — the security
273+
// boundary is unchanged. We keep the first `SIG_REJECT_LOG_PER_WINDOW`
274+
// rejections per claimed `node_id` per `SIG_REJECT_LOG_WINDOW_S` fully
275+
// visible (so a genuine transient fault on a real node is never hidden),
276+
// emit one explicit suppression notice at the threshold, then stay
277+
// silent until the window rolls — at which point a single
278+
// `sig_reject_flood` summary reports how many were suppressed and that
279+
// the flood is ongoing. Standard production practice for
280+
// attacker-controlled inputs.
281+
//
282+
// Keyed PER claimed `node_id`: a flood against one identity cannot
283+
// starve the reject-log budget of a different identity's genuine fault.
284+
// Bounded memory: soft cap + lazy 25 % LRU eviction, identical to the
285+
// `ATTACKER_PK_BLACKLIST` discipline.
286+
287+
lazy_static::lazy_static! {
    /// Process-wide reject-log governor table.
    ///
    /// Maps a claimed node id (as supplied by the caller of the governor)
    /// to the rolling-window counters kept for that identity.
    static ref SIG_REJECT_LOG_GOVERNOR: dashmap::DashMap<String, SigRejectLogState> = dashmap::DashMap::new();
}
292+
293+
/// Per-identity bookkeeping for the reject-log governor's active window.
#[derive(Debug, Clone)]
struct SigRejectLogState {
    /// Start of the active window, in seconds since the UNIX epoch.
    window_start_s: u64,
    /// Number of log lines let through so far in the active window. It is
    /// bumped once more past the threshold when the suppression notice is
    /// issued, which marks the notice as already emitted.
    logged_in_window: u32,
    /// Number of log lines withheld in the active window; reported when
    /// the window rolls.
    suppressed_in_window: u64,
}
302+
303+
/// Length, in seconds, of one reject-log governor window.
const SIG_REJECT_LOG_WINDOW_S: u64 = 60;
/// How many detailed reject lines a single claimed identity may emit per
/// window before suppression kicks in. Kept small so a flood collapses
/// quickly, yet non-zero so an occasional genuine fault stays visible.
const SIG_REJECT_LOG_PER_WINDOW: u32 = 5;
/// Soft upper bound on the number of identities tracked by the governor
/// map (roughly 1 MB resident, assuming short node-id keys — key length is
/// supplied by callers, TODO confirm it is bounded upstream).
const SIG_REJECT_GOVERNOR_CAP: usize = 8_192;
311+
312+
/// Outcome of a governor decision: what the caller should do with the log
/// line for a rejection. It never affects the rejection itself.
enum SigRejectLogAction {
    /// Still within the per-window budget: print the detailed reject line.
    Emit,
    /// Budget has just been exhausted: print a single suppression notice
    /// in place of the detailed line.
    EmitSuppressNotice,
    /// Budget exhausted and notice already printed: print nothing.
    Suppress,
}
320+
321+
fn sig_reject_log_decision(claimed_node_id: &str) -> SigRejectLogAction {
322+
let now = std::time::SystemTime::now()
323+
.duration_since(std::time::UNIX_EPOCH)
324+
.map(|d| d.as_secs())
325+
.unwrap_or(0);
326+
327+
// Lazy soft-eviction (only when a NEW identity would grow past cap).
328+
if !SIG_REJECT_LOG_GOVERNOR.contains_key(claimed_node_id)
329+
&& SIG_REJECT_LOG_GOVERNOR.len() >= SIG_REJECT_GOVERNOR_CAP
330+
{
331+
let mut entries: Vec<(String, u64)> = SIG_REJECT_LOG_GOVERNOR
332+
.iter()
333+
.map(|e| (e.key().clone(), e.value().window_start_s))
334+
.collect();
335+
entries.sort_by_key(|(_, w)| *w);
336+
let to_drop = entries.len() / 4;
337+
for (k, _) in entries.into_iter().take(to_drop) {
338+
SIG_REJECT_LOG_GOVERNOR.remove(&k);
339+
}
340+
}
341+
342+
let mut action = SigRejectLogAction::Emit;
343+
let mut flood_summary: Option<u64> = None;
344+
345+
SIG_REJECT_LOG_GOVERNOR
346+
.entry(claimed_node_id.to_string())
347+
.and_modify(|st| {
348+
if now.saturating_sub(st.window_start_s) >= SIG_REJECT_LOG_WINDOW_S {
349+
// Window rolled: report any suppression from the closed
350+
// window, then reopen counting this rejection as #1.
351+
if st.suppressed_in_window > 0 {
352+
flood_summary = Some(st.suppressed_in_window);
353+
}
354+
st.window_start_s = now;
355+
st.logged_in_window = 1;
356+
st.suppressed_in_window = 0;
357+
action = SigRejectLogAction::Emit;
358+
} else if st.logged_in_window < SIG_REJECT_LOG_PER_WINDOW {
359+
st.logged_in_window += 1;
360+
action = SigRejectLogAction::Emit;
361+
} else if st.logged_in_window == SIG_REJECT_LOG_PER_WINDOW {
362+
st.logged_in_window += 1; // mark the notice as emitted
363+
action = SigRejectLogAction::EmitSuppressNotice;
364+
} else {
365+
st.suppressed_in_window = st.suppressed_in_window.saturating_add(1);
366+
action = SigRejectLogAction::Suppress;
367+
}
368+
})
369+
.or_insert_with(|| SigRejectLogState {
370+
window_start_s: now,
371+
logged_in_window: 1,
372+
suppressed_in_window: 0,
373+
});
374+
375+
if let Some(n) = flood_summary {
376+
eprintln!(
377+
"[WARN][SECURITY] sig_reject_flood claimed_node={} window_s={} suppressed={} action=window_rolled_still_under_attack",
378+
claimed_node_id, SIG_REJECT_LOG_WINDOW_S, n
379+
);
380+
}
381+
action
382+
}
383+
384+
/// Rate-governed security-reject logger.
385+
///
386+
/// `full_line` is the exact `[ERR][...]` line the call site would have
387+
/// emitted unconditionally before v25.3. The rejection has ALREADY
388+
/// happened at the call site (the caller `return false`s immediately
389+
/// after) — this governs only whether the line reaches the log, so an
390+
/// attacker flooding pre-PK-parse garbage cannot DoS logging. First
391+
/// `SIG_REJECT_LOG_PER_WINDOW` per claimed identity per window pass
392+
/// through verbatim; then one suppression notice; then silence with a
393+
/// per-window flood summary. Security semantics are unchanged.
394+
pub fn log_sig_reject(claimed_node_id: &str, full_line: &str) {
395+
match sig_reject_log_decision(claimed_node_id) {
396+
SigRejectLogAction::Emit => eprintln!("{}", full_line),
397+
SigRejectLogAction::EmitSuppressNotice => eprintln!(
398+
"[WARN][SECURITY] sig_reject_log_suppressed claimed_node={} window_s={} threshold={} action=silencing_until_window_roll",
399+
claimed_node_id, SIG_REJECT_LOG_WINDOW_S, SIG_REJECT_LOG_PER_WINDOW
400+
),
401+
SigRejectLogAction::Suppress => { /* rejection already enforced at call site */ }
402+
}
403+
}
404+
258405
/// Compute the 32-byte SHA3-256 fingerprint of an extracted public key.
259406
/// Collision-resistant and post-quantum safe; fits as a DashMap key
260407
/// with no allocations on the lookup path.
@@ -1675,8 +1822,8 @@ async fn verify_dilithium_signature(
16751822
// Combined format: [sig_len(4)] + [SignedMessage(sig+msg)] + [pk_len(4)] + [pk(1952)]
16761823
// Minimum size: ML-DSA-65 signature (3309 bytes) + message + metadata
16771824
if signature_bytes.len() < 3309 {
1678-
eprintln!("[ERR][CONSENSUS] sig_too_small node={} size={} min=3309",
1679-
node_id, signature_bytes.len());
1825+
log_sig_reject(node_id, &format!("[ERR][CONSENSUS] sig_too_small node={} size={} min=3309",
1826+
node_id, signature_bytes.len()));
16801827
return false;
16811828
}
16821829

@@ -1686,9 +1833,13 @@ async fn verify_dilithium_signature(
16861833
if valid {
16871834
println!("[INFO][CONSENSUS] sig_verified node={}", node_id);
16881835
} else {
1689-
eprintln!("[ERR][CONSENSUS] sig_invalid node={}", node_id);
1836+
// Governed: a spoofer flooding garbage under a claimed identity
1837+
// would otherwise emit one of these per frame. Rejection is
1838+
// already final (the inner verify returned false); this only
1839+
// rate-limits the log line.
1840+
log_sig_reject(node_id, &format!("[ERR][CONSENSUS] sig_invalid node={}", node_id));
16901841
}
1691-
1842+
16921843
valid
16931844
}
16941845

@@ -1700,22 +1851,22 @@ async fn verify_with_real_dilithium(
17001851
) -> bool {
17011852
// Verify signature structure: all-zero is trivially invalid
17021853
if signature_bytes.iter().all(|&b| b == 0) {
1703-
eprintln!("[ERR][CONSENSUS] sig_all_zeros node={}", node_id);
1854+
log_sig_reject(node_id, &format!("[ERR][CONSENSUS] sig_all_zeros node={}", node_id));
17041855
return false;
17051856
}
17061857

17071858
// Entropy check on the ML-DSA-65 signature part (3309 bytes, CTILDEBYTES=48)
17081859
let sig_part = &signature_bytes[..std::cmp::min(3309, signature_bytes.len())];
17091860
let unique_bytes: std::collections::HashSet<_> = sig_part.iter().collect();
17101861
if unique_bytes.len() < 200 {
1711-
eprintln!("[ERR][CONSENSUS] sig_low_entropy node={} unique={} threshold=200",
1712-
node_id, unique_bytes.len());
1862+
log_sig_reject(node_id, &format!("[ERR][CONSENSUS] sig_low_entropy node={} unique={} threshold=200",
1863+
node_id, unique_bytes.len()));
17131864
return false;
17141865
}
17151866

17161867
// Parse combined format: [sig_len(4)] + [SignedMessage(sig+msg)] + [pk_len(4)] + [pk(1952)]
17171868
if signature_bytes.len() < 8 {
1718-
eprintln!("[ERR][CONSENSUS] sig_too_short node={} size={}", node_id, signature_bytes.len());
1869+
log_sig_reject(node_id, &format!("[ERR][CONSENSUS] sig_too_short node={} size={}", node_id, signature_bytes.len()));
17191870
return false;
17201871
}
17211872

@@ -1728,7 +1879,7 @@ async fn verify_with_real_dilithium(
17281879

17291880
// ML-DSA-65 SignedMessage must be at least 3309 bytes (sig) + 1 byte (msg) = 3310 minimum
17301881
if signed_len <= 3309 || 4 + signed_len >= signature_bytes.len() {
1731-
eprintln!("[ERR][CONSENSUS] sig_format_invalid node={} signed_len={}", node_id, signed_len);
1882+
log_sig_reject(node_id, &format!("[ERR][CONSENSUS] sig_format_invalid node={} signed_len={}", node_id, signed_len));
17321883
return false;
17331884
}
17341885

@@ -1751,13 +1902,13 @@ async fn verify_with_real_dilithium(
17511902
// CRITICAL: Dilithium3 public key MUST be exactly 1952 bytes (NIST standard)
17521903
use pqcrypto_mldsa::mldsa65 as dilithium3;
17531904
if pk_len != dilithium3::public_key_bytes() {
1754-
eprintln!("[ERR][CONSENSUS] pk_size_invalid node={} got={} expected={}",
1755-
node_id, pk_len, dilithium3::public_key_bytes());
1905+
log_sig_reject(node_id, &format!("[ERR][CONSENSUS] pk_size_invalid node={} got={} expected={}",
1906+
node_id, pk_len, dilithium3::public_key_bytes()));
17561907
return false;
17571908
}
17581909

17591910
if pk_start + pk_len != signature_bytes.len() {
1760-
eprintln!("[ERR][CONSENSUS] sig_len_mismatch node={}", node_id);
1911+
log_sig_reject(node_id, &format!("[ERR][CONSENSUS] sig_len_mismatch node={}", node_id));
17611912
return false;
17621913
}
17631914

development/qnet-integration/src/storage.rs

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -778,7 +778,27 @@ impl PersistentStorage {
778778
// With this setting, RocksDB force-flushes oldest CF memtables when
779779
// total WAL exceeds 64MB, enabling old WAL cleanup.
780780
opts.set_max_total_wal_size(67_108_864); // 64MB max WAL (was: unlimited)
781-
781+
782+
// v25.3: BOUND RocksDB's internal diagnostic LOG file.
783+
// Default RocksDB behaviour is a SINGLE `LOG` file that grows
784+
// without bound until the DB is reopened (only a node restart
785+
// archives it to LOG.old.<ts>). In production this was observed
786+
// at ~454 MB after 27 h continuous uptime (~17 MB/h ≈ 150 GB/yr
787+
// unbounded) on every node. This is RocksDB's own operational
788+
// log (compaction/flush/stats) — NOT chain data, NOT the WAL,
789+
// NOT consensus state — so bounding it is purely hygienic and
790+
// cannot affect blockchain integrity, recovery, or determinism.
791+
//
792+
// size + count bounding only: rotate the LOG at 64 MB and keep
793+
// at most 10 rotations → hard cap ≈ 640 MB rolling window
794+
// instead of one ever-growing file. Verbosity (INFO) is
795+
// deliberately UNCHANGED so RocksDB-internal forensics
796+
// (compaction stalls, write-stalls, corruption events) remain
797+
// fully available — we only stop the unbounded growth, we do
798+
// not trade away diagnostic detail.
799+
opts.set_max_log_file_size(67_108_864); // 64 MB → then rotate
800+
opts.set_keep_log_file_num(10); // keep ≤10 rotations (~640 MB cap)
801+
782802
// v3.19: AGGRESSIVE compaction settings
783803
opts.set_level_compaction_dynamic_level_bytes(true);
784804
opts.set_max_bytes_for_level_base(67108864); // 64MB base level

development/qnet-integration/src/unified_p2p.rs

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16724,9 +16724,12 @@ impl SimplifiedP2P {
1672416724
println!("[INFO][P2P] Dilithium signature verified for {}", node_id);
1672516725
}
1672616726
} else {
16727-
if crate::node::is_info() {
16728-
println!("[ERR][P2P] Invalid Dilithium signature for {}", node_id);
16729-
}
16727+
// v25.3: governed — collapses spoofer flood (shares the
16728+
// per-claimed-id window with the consensus-layer sites).
16729+
qnet_consensus::consensus_crypto::log_sig_reject(
16730+
node_id,
16731+
&format!("[ERR][P2P] Invalid Dilithium signature for {}", node_id),
16732+
);
1673016733
}
1673116734
valid
1673216735
}
@@ -16738,7 +16741,7 @@ impl SimplifiedP2P {
1673816741
}
1673916742
}
1674016743
}
16741-
16744+
1674216745
/// OPTIMIZED v2.24: Verify HYBRID P2P BINARY signature (bincode+zstd)
1674316746
async fn verify_hybrid_p2p_binary_async(&self, message: &str, signature: &str, node_id: &str) -> bool {
1674416747
use crate::hybrid_crypto::{CompactHybridSignature, HybridCrypto};
@@ -17080,9 +17083,13 @@ impl SimplifiedP2P {
1708017083
println!("[INFO][P2P] Dilithium signature verified for {}", node_id);
1708117084
}
1708217085
} else {
17083-
if crate::node::is_info() {
17084-
println!("[ERR][P2P] Invalid Dilithium signature for {}", node_id);
17085-
}
17086+
// v25.3: governed — collapses spoofer flood
17087+
// (shares the per-claimed-id window with the
17088+
// consensus-layer reject sites).
17089+
qnet_consensus::consensus_crypto::log_sig_reject(
17090+
&node_id,
17091+
&format!("[ERR][P2P] Invalid Dilithium signature for {}", node_id),
17092+
);
1708617093
}
1708717094
valid
1708817095
}

0 commit comments

Comments
 (0)