From 07d3d295ce6cc2aecd6f05c95394901816fa5641 Mon Sep 17 00:00:00 2001 From: Henry Rovnyak Date: Sat, 1 Nov 2025 00:58:36 -0400 Subject: [PATCH 1/4] Implement clearing the statistics data daily Gitignore the webring.log Implement clearing the statistics data daily --- Cargo.lock | 101 ++++++++++++++++++++++---------------- Cargo.toml | 3 +- src/routes.rs | 20 ++------ src/stats/mod.rs | 124 +++++++++++++++++++++++++++++++---------------- src/webring.rs | 29 +++-------- 5 files changed, 152 insertions(+), 125 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ffb7316..9396509 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -211,9 +211,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.0" +version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" [[package]] name = "bytes" @@ -226,9 +226,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.47" +version = "1.2.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd405d82c84ff7f35739f175f67d8b9fb7687a0e84ccdc78bd3568839827cf07" +checksum = "9f50d563227a1c37cc0a263f64eca3334388c01c5e4c4861a9def205c614383c" dependencies = [ "find-msvc-tools", "shlex", @@ -408,21 +408,22 @@ dependencies = [ [[package]] name = "derive_more" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "093242cf7570c207c83073cf82f79706fe7b8317e98620a47d5be7c3d8497678" +checksum = "10b768e943bed7bf2cab53df09f4bc34bfd217cdb57d971e769874c9a6710618" dependencies = [ "derive_more-impl", ] [[package]] name = "derive_more-impl" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda628edc44c4bb645fbe0f758797143e4e07926f7ebf4e9bdfbd3d2ce621df3" +checksum = "6d286bfdaf75e988b4a78e013ecd79c581e06399ab53fbacd2d916c2f904f30b" dependencies = [ "proc-macro2", "quote", + "rustc_version", "syn", ] @@ -906,9 +907,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.18" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52e9a2a24dc5c6821e71a7030e1e14b7b632acac55c40e9d2e082c621261bb56" +checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f" dependencies = [ "base64", "bytes", @@ -978,9 +979,9 @@ checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" [[package]] name = "icu_properties" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" dependencies = [ "icu_collections", "icu_locale_core", @@ -992,9 +993,9 @@ dependencies = [ [[package]] name = "icu_properties_data" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" [[package]] name = "icu_provider" @@ -1125,9 +1126,9 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "js-sys" -version = "0.3.82" +version = "0.3.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" +checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" dependencies = [ "once_cell", "wasm-bindgen", @@ -1161,9 +1162,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.177" +version = "0.2.178" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" [[package]] name = "linux-raw-sys" @@ -1179,9 +1180,9 @@ checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" [[package]] name = "log" -version = "0.4.28" +version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "lol_html" @@ -1231,9 +1232,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", "log", @@ -1465,6 +1466,7 @@ dependencies = [ "reqwest", "same-file", "sarlacc", + "seize", "serde", "serde_json", "tempfile", @@ -1749,6 +1751,15 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.2" @@ -1777,9 +1788,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.13.0" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94182ad936a0c91c324cd46c6511b9510ed16af436d7b5bab34beab0afd55f7a" +checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" dependencies = [ "zeroize", ] @@ -1886,6 +1897,12 @@ dependencies = [ "smallvec", ] +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + [[package]] name = "serde" version = "1.0.228" @@ -2590,9 +2607,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.105" +version = "0.2.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" +checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" dependencies = [ "cfg-if", "once_cell", @@ -2603,9 +2620,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.55" +version = "0.4.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0" +checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" dependencies = [ "cfg-if", "js-sys", @@ -2616,9 +2633,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.105" +version = "0.2.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" +checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2626,9 +2643,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.105" +version = "0.2.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" +checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" dependencies = [ "bumpalo", "proc-macro2", @@ -2639,9 +2656,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.105" +version = "0.2.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" +checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" dependencies = [ "unicode-ident", ] @@ -2661,9 +2678,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.82" +version = "0.3.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" +checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" dependencies = [ "js-sys", "wasm-bindgen", @@ -2871,9 +2888,9 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "0.7.13" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" +checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" [[package]] name = "wit-bindgen" @@ -2918,18 +2935,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.30" +version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ea879c944afe8a2b25fef16bb4ba234f47c694565e97383b36f3a878219065c" +checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.30" +version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf955aa904d6040f70dc8e9384444cb1030aed272ba3cb09bbc4ab9e7c1f34f5" +checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index a53fba3..0745106 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,7 @@ missing-docs-in-private-items = "warn" [dependencies] axum = { version = "0.8.4", default-features = false, features = [ "http1", "http2", "tokio", "tracing", "query" ] } -chrono = { version = "0.4.41", default-features = false, features = [ "serde", "now" ] } +chrono = { version = "0.4.42", default-features = false, features = [ "serde", "now" ] } clap = { version = "4.5.41", features = ["derive"] } eyre = "0.6.12" futures = { version = "0.3.31", default-features = false, features = [ "alloc" ] } @@ -50,6 +50,7 @@ rand = { version = "0.9.2", default-features = false, features = [ "thread_rng" reqwest = { version = "0.12.22", features = ["stream", "json"] } same-file = "1.0.6" sarlacc = "0.1.4" +seize = "0.5.1" # sarlacc = { path = "../sarlacc" } serde = { version = "1.0.219", features = ["derive"] } serde_json = "1.0.141" diff --git a/src/routes.rs b/src/routes.rs index 80e422c..e60c7c4 100644 --- a/src/routes.rs +++ b/src/routes.rs @@ -501,7 +501,7 @@ mod tests { http::{Request, Uri, header}, response::IntoResponse, }; - use chrono::Utc; + use eyre::eyre; use http_body_util::BodyExt; use indoc::indoc; @@ -512,10 +512,7 @@ mod tests { use tower::{Service, ServiceExt}; use tower_http::catch_panic::ResponseForPanic; - use crate::{ - stats::{TIMEZONE, UNKNOWN_ORIGIN}, - webring::Webring, - }; + use crate::{stats::UNKNOWN_ORIGIN, webring::Webring}; use super::{OriginUriLocation, PanicResponse, RouteError, create_router}; @@ -571,8 +568,6 @@ mod tests { async fn index() { let (router, webring, tmpfiles) = app().await; - let today = Utc::now().with_timezone(&TIMEZONE).date_naive(); - // Request `/` let res = router .oneshot( @@ -590,10 +585,7 @@ mod tests { .unwrap(); assert_eq!("Hello homepage!", text); assert_eq!(status, StatusCode::OK); - webring.assert_stat_entry( - (today, "kasad.com", "ring.purduehackers.com", "kasad.com"), - 1, - ); + webring.assert_stat_entry(("kasad.com", "ring.purduehackers.com", "kasad.com"), 1); drop(tmpfiles); } @@ -602,8 +594,6 @@ mod tests { async fn index_unknown_referer() { let (router, webring, tmpfiles) = app().await; - let today = Utc::now().with_timezone(&TIMEZONE).date_naive(); - let res = router .oneshot( Request::builder() @@ -622,7 +612,6 @@ mod tests { assert_eq!(status, StatusCode::OK); webring.assert_stat_entry( ( - today, UNKNOWN_ORIGIN.as_str(), "ring.purduehackers.com", UNKNOWN_ORIGIN.as_str(), @@ -637,8 +626,6 @@ mod tests { async fn visit() { let (router, webring, tmpfiles) = app().await; - let today = Utc::now().with_timezone(&TIMEZONE).date_naive(); - let res = router .oneshot( Request::builder() @@ -656,7 +643,6 @@ mod tests { assert_eq!(res.status(), StatusCode::SEE_OTHER); webring.assert_stat_entry( ( - today, "ring.purduehackers.com", "clementine.viridian.page", "ring.purduehackers.com", diff --git a/src/stats/mod.rs b/src/stats/mod.rs index bffbd6d..ca5b2e8 100644 --- a/src/stats/mod.rs +++ b/src/stats/mod.rs @@ -27,14 +27,15 @@ use std::{ net::IpAddr, sync::{ LazyLock, - atomic::{AtomicU64, Ordering}, + atomic::{AtomicI32, AtomicPtr, AtomicU64, Ordering}, }, }; use axum::http::uri::Authority; -use chrono::{DateTime, Duration, FixedOffset, NaiveDate, Utc}; -use papaya::HashMap; +use chrono::{DateTime, Duration, FixedOffset, Utc}; +use papaya::{Guard, HashMap}; use sarlacc::Intern; +use seize::Collector; use tracing::{info, instrument}; /// The TTL for IP tracking entries, after which they are considered stale and removed. @@ -55,22 +56,67 @@ struct IpInfo { started_from: Intern, } -#[derive(Debug, Default)] +type Counts = HashMap<(Intern, Intern, Intern), AtomicU64>; + +/// The counts for all of the possible webring redirects +#[derive(Debug)] struct AggregatedStats { - /// (Date (with timezone `TIMEZONE`), From, To, Started From) → Count - #[expect(clippy::type_complexity)] - counters: HashMap< - ( - NaiveDate, - Intern, - Intern, - Intern, - ), - AtomicU64, - >, + /// The collector for the atomic data that we're handling + collector: Collector, + /// The last date that a redirect was tracked (hopefully today) + today: AtomicI32, + /// (From, To, Started From) → Count + /// Invariant: This MUST ALWAYS be a valid pointer + counter: AtomicPtr, +} + +impl AggregatedStats { + /// Create a new `AggregatedStats` + fn new(now: DateTime) -> Self { + let counter: Counts = HashMap::default(); + + AggregatedStats { + today: AtomicI32::new(Self::mk_num(now)), + counter: AtomicPtr::new(Box::into_raw(Box::new(counter))), + collector: Collector::new(), + } + } + + /// Convert the current time into the days since the epoch + fn mk_num(time: DateTime) -> i32 { + time.date_naive().to_epoch_days() + } + + /// Retrieve the current counter from a guard + fn counter<'a>(&'a self, guard: &'a impl Guard) -> &'a Counts { + // SAFETY: The counter is guaranteed to be a valid pointer and we are using Acquire ordering to synchronize-with its initialization + unsafe { &*guard.protect(&self.counter, Ordering::Acquire) } + } + + /// Retrieve the current counter from a guard while updating it if the current time is a new calendar date + fn maybe_update_counter<'a>(&'a self, now: DateTime, guard: &'a impl Guard) -> &'a Counts { + let now = AggregatedStats::mk_num(now); + + let prev_day = self.today.swap(now, Ordering::Relaxed); + + if prev_day != now { + let new_counter: *mut Counts = Box::into_raw(Box::new(HashMap::new())); + + // Release to synchronize-with `counter`. We don't need Acquire because we won't read the previous pointer. + let prev = guard.swap(&self.counter, new_counter, Ordering::Release); + // SAFETY: The pointer can no longer be accessed now that it has been swapped into `prev`, and `Box::from_raw` is the correct way to drop the pointer. + unsafe { + self.collector + .retire(prev, |ptr, _| drop(Box::from_raw(ptr))); + } + } + + self.counter(guard) + } } -#[derive(Debug, Default)] +/// Statistics tracking for the webring +#[derive(Debug)] pub struct Stats { /// Aggregated statistics aggregated: AggregatedStats, @@ -80,9 +126,9 @@ pub struct Stats { impl Stats { /// Creates a new instance of `Stats`. - pub fn new() -> Stats { + pub fn new(now: DateTime) -> Stats { Stats { - aggregated: AggregatedStats::default(), + aggregated: AggregatedStats::new(now), ip_tracking: HashMap::new(), } } @@ -115,11 +161,9 @@ impl Stats { }, ); - let date = now.with_timezone(&TIMEZONE).date_naive(); - - let pinned_map = self.aggregated.counters.pin(); - let counter = - pinned_map.get_or_insert((date, from, to, ip_info.started_from), AtomicU64::new(0)); + let guard = self.aggregated.collector.enter(); + let pinned_map = self.aggregated.maybe_update_counter(now, &guard).pin(); + let counter = pinned_map.get_or_insert((from, to, ip_info.started_from), AtomicU64::new(0)); counter.fetch_add(1, Ordering::Relaxed); } @@ -144,17 +188,17 @@ impl Stats { } #[cfg(test)] - pub fn assert_stat_entry(&self, entry: (NaiveDate, &str, &str, &str), count: u64) { + pub fn assert_stat_entry(&self, entry: (&str, &str, &str), count: u64) { + let guard = self.aggregated.collector.enter(); assert_eq!( self.aggregated - .counters + .counter(&guard) .pin() .get(&( - entry.0, + Intern::new(entry.0.parse::().unwrap()), Intern::new(entry.1.parse::().unwrap()), Intern::new(entry.2.parse::().unwrap()), - Intern::new(entry.3.parse::().unwrap()), - )) + ),) .map_or(0, |v| v.load(Ordering::Relaxed)), count, "{self:#?}\n{entry:?}" @@ -192,7 +236,7 @@ mod tests { #[tokio::test] async fn test_stat_tracking() { - let stats = Stats::new(); + let stats = Stats::new(t(0)); stats.redirected_impl(a("0.0.0.0"), i("a.com"), i("b.com"), t(0)); stats.redirected_impl(a("0.0.0.0"), i("b.com"), i("c.com"), t(1)); @@ -201,11 +245,12 @@ mod tests { stats.redirected_impl(a("1.0.0.0"), i("b.com"), i("homepage.com"), t(2)); stats.redirected_impl(a("1.0.0.0"), i("homepage.com"), i("c.com"), t(3)); - assert_eq!(stats.aggregated.counters.len(), 4); - stats.assert_stat_entry((d(0), "a.com", "b.com", "a.com"), 2); - stats.assert_stat_entry((d(0), "b.com", "c.com", "a.com"), 1); - stats.assert_stat_entry((d(0), "b.com", "homepage.com", "a.com"), 1); - stats.assert_stat_entry((d(0), "homepage.com", "c.com", "a.com"), 1); + let guard = stats.aggregated.collector.enter(); + assert_eq!(stats.aggregated.counter(&guard).len(), 4); + stats.assert_stat_entry(("a.com", "b.com", "a.com"), 2); + stats.assert_stat_entry(("b.com", "c.com", "a.com"), 1); + stats.assert_stat_entry(("b.com", "homepage.com", "a.com"), 1); + stats.assert_stat_entry(("homepage.com", "c.com", "a.com"), 1); let tracking = stats.ip_tracking.pin(); assert_eq!(tracking.len(), 2); @@ -243,13 +288,8 @@ mod tests { stats.redirected_impl(a("0.0.0.0"), i("b.com"), i("c.com"), t(1) + day); stats.redirected_impl(a("0.0.0.0"), i("c.com"), i("a.com"), t(2) + day); - assert_eq!(stats.aggregated.counters.len(), 6); - stats.assert_stat_entry((d(0), "a.com", "b.com", "a.com"), 2); - stats.assert_stat_entry((d(0), "b.com", "c.com", "a.com"), 1); - stats.assert_stat_entry((d(0), "b.com", "homepage.com", "a.com"), 1); - stats.assert_stat_entry((d(0), "homepage.com", "c.com", "a.com"), 1); - stats.assert_stat_entry((d(day.num_seconds()), "b.com", "c.com", "b.com"), 1); - stats.assert_stat_entry((d(day.num_seconds()), "b.com", "c.com", "b.com"), 1); - stats.assert_stat_entry((d(day.num_seconds()), "c.com", "a.com", "b.com"), 1); + assert_eq!(stats.aggregated.counter(&guard).len(), 2); + stats.assert_stat_entry(("b.com", "c.com", "b.com"), 1); + stats.assert_stat_entry(("c.com", "a.com", "b.com"), 1); } } diff --git a/src/webring.rs b/src/webring.rs index 476a508..6908b91 100644 --- a/src/webring.rs +++ b/src/webring.rs @@ -29,7 +29,7 @@ use std::{ }; use axum::http::{Uri, uri::Authority}; -use chrono::TimeDelta; +use chrono::{TimeDelta, Utc}; use futures::{StreamExt, future::join, stream::FuturesUnordered}; use indexmap::IndexMap; use notify::{EventKind, RecommendedWatcher, RecursiveMode, Watcher as _}; @@ -228,7 +228,7 @@ impl Webring { members: RwLock::new(member_map_from_config_table(&config.members)), static_dir_path: config.webring.static_dir.clone(), homepage: AsyncRwLock::new(None), - stats: Arc::new(Stats::new()), + stats: Arc::new(Stats::new(Utc::now())), file_watcher: OnceLock::default(), base_address: config.webring.base_url(), notifier: config @@ -648,7 +648,7 @@ impl Webring { } #[cfg(test)] - pub fn assert_stat_entry(&self, entry: (chrono::NaiveDate, &str, &str, &str), count: u64) { + pub fn assert_stat_entry(&self, entry: (&str, &str, &str), count: u64) { self.stats.assert_stat_entry(entry, count); } } @@ -699,7 +699,7 @@ mod tests { use crate::{ config::{Config, MemberSpec}, discord::{DiscordNotifier, NOTIFICATION_DEBOUNCE_PERIOD, Snowflake}, - stats::{TIMEZONE, UNKNOWN_ORIGIN}, + stats::{Stats, UNKNOWN_ORIGIN}, webring::{CheckLevel, Webring}, }; @@ -716,7 +716,7 @@ mod tests { base_address: Intern::default(), base_authority: Intern::new("ring.purduehackers.com".parse().unwrap()), notifier: None, - stats: Arc::default(), + stats: Arc::new(Stats::new(Utc::now())), config: Arc::new(AsyncRwLock::new(None)), } } @@ -847,21 +847,11 @@ mod tests { assert_eq!(*inner, expected); } - let today = Utc::now().with_timezone(&TIMEZONE).date_naive(); - webring.assert_next( "https://hrovnyak.gitlab.io/bruh/bruh/bruh?bruh=bruh", Ok("kasad.com"), ); - webring.assert_stat_entry( - ( - today, - "hrovnyak.gitlab.io", - "kasad.com", - "hrovnyak.gitlab.io", - ), - 1, - ); + webring.assert_stat_entry(("hrovnyak.gitlab.io", "kasad.com", "hrovnyak.gitlab.io"), 1); webring.assert_prev( "https://hrovnyak.gitlab.io/bruh/bruh/bruh?bruh=bruh", @@ -869,7 +859,6 @@ mod tests { ); webring.assert_stat_entry( ( - today, "hrovnyak.gitlab.io", "refuse-the-r.ing", "hrovnyak.gitlab.io", @@ -1047,15 +1036,12 @@ mod tests { .unwrap(); let webring = Webring::new(&config); - let today = Utc::now().with_timezone(&TIMEZONE).date_naive(); - let uri = webring .random_page(None, "0.0.0.0".parse().unwrap()) .unwrap(); assert_eq!(uri, Intern::new("kasad.com".parse().unwrap())); webring.assert_stat_entry( ( - today, UNKNOWN_ORIGIN.as_str(), "kasad.com", UNKNOWN_ORIGIN.as_str(), @@ -1073,8 +1059,6 @@ mod tests { "# }).unwrap(); let webring = Webring::new(&config); - let today = Utc::now().with_timezone(&TIMEZONE).date_naive(); - let uri = webring .random_page( Some(&"clementine.viridian.page".parse().unwrap()), @@ -1084,7 +1068,6 @@ mod tests { assert_eq!(uri, Intern::new("kasad.com".parse().unwrap())); webring.assert_stat_entry( ( - today, "clementine.viridian.page", "kasad.com", "clementine.viridian.page", From 1661d68f45ccac23f5f7e77d31dab6b458117fc7 Mon Sep 17 00:00:00 2001 From: Henry Rovnyak Date: Sat, 1 Nov 2025 16:28:45 -0400 Subject: [PATCH 2/4] Implement writing statistics data to a writer --- src/stats/mod.rs | 146 +++++++++++++++++++++++++++++++++++++---------- src/webring.rs | 9 +-- 2 files changed, 121 insertions(+), 34 deletions(-) diff --git a/src/stats/mod.rs b/src/stats/mod.rs index ca5b2e8..6b8bf3f 100644 --- a/src/stats/mod.rs +++ b/src/stats/mod.rs @@ -23,10 +23,12 @@ with the Purdue Hackers webring. If not, see . #![allow(dead_code)] #![allow(unused_variables)] +use core::fmt::Debug; use std::{ + io::Write, net::IpAddr, sync::{ - LazyLock, + Arc, LazyLock, Mutex, atomic::{AtomicI32, AtomicPtr, AtomicU64, Ordering}, }, }; @@ -36,7 +38,7 @@ use chrono::{DateTime, Duration, FixedOffset, Utc}; use papaya::{Guard, HashMap}; use sarlacc::Intern; use seize::Collector; -use tracing::{info, instrument}; +use tracing::{error, info, instrument}; /// The TTL for IP tracking entries, after which they are considered stale and removed. const IP_TRACKING_TTL: chrono::TimeDelta = Duration::days(1); @@ -59,34 +61,44 @@ struct IpInfo { type Counts = HashMap<(Intern, Intern, Intern), AtomicU64>; /// The counts for all of the possible webring redirects -#[derive(Debug)] -struct AggregatedStats { +struct AggregatedStats { /// The collector for the atomic data that we're handling - collector: Collector, + collector: Arc, /// The last date that a redirect was tracked (hopefully today) today: AtomicI32, /// (From, To, Started From) → Count /// Invariant: This MUST ALWAYS be a valid pointer counter: AtomicPtr, + /// The writer for the statistics output file + output: Arc>, } -impl AggregatedStats { +impl Debug for AggregatedStats { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AggregatedStats") + .field("today", &self.today.load(Ordering::Relaxed)) + .finish_non_exhaustive() + } +} + +/// Convert the current time into the days since the epoch +fn mk_num(time: DateTime) -> i32 { + time.date_naive().to_epoch_days() +} + +impl AggregatedStats { /// Create a new `AggregatedStats` - fn new(now: DateTime) -> Self { + fn new(now: DateTime, writer: W) -> Self { let counter: Counts = HashMap::default(); AggregatedStats { - today: AtomicI32::new(Self::mk_num(now)), + today: AtomicI32::new(mk_num(now)), counter: AtomicPtr::new(Box::into_raw(Box::new(counter))), - collector: Collector::new(), + collector: Arc::new(Collector::new()), + output: Arc::new(Mutex::new(writer)), } } - /// Convert the current time into the days since the epoch - fn mk_num(time: DateTime) -> i32 { - time.date_naive().to_epoch_days() - } - /// Retrieve the current counter from a guard fn counter<'a>(&'a self, guard: &'a impl Guard) -> &'a Counts { // SAFETY: The counter is guaranteed to be a valid pointer and we are using Acquire ordering to synchronize-with its initialization @@ -95,20 +107,45 @@ impl AggregatedStats { /// Retrieve the current counter from a guard while updating it if the current time is a new calendar date fn maybe_update_counter<'a>(&'a self, now: DateTime, guard: &'a impl Guard) -> &'a Counts { - let now = AggregatedStats::mk_num(now); + let now = mk_num(now); let prev_day = self.today.swap(now, Ordering::Relaxed); if prev_day != now { let new_counter: *mut Counts = Box::into_raw(Box::new(HashMap::new())); - // Release to synchronize-with `counter`. We don't need Acquire because we won't read the previous pointer. - let prev = guard.swap(&self.counter, new_counter, Ordering::Release); - // SAFETY: The pointer can no longer be accessed now that it has been swapped into `prev`, and `Box::from_raw` is the correct way to drop the pointer. - unsafe { - self.collector - .retire(prev, |ptr, _| drop(Box::from_raw(ptr))); - } + // We need this guard to go into our task so it needs to be owned + let guard_owned = self.collector.enter(); + + // Release to synchronize-with `counter` and Acquire to ensure that we can see the initialization of the previous one so that we can properly access and write it. + let prev_ptr = guard_owned.swap(&self.counter, new_counter, Ordering::AcqRel); + + let output = Arc::clone(&self.output); + + // Allow it to be moved to our task + let prev_ptr = prev_ptr as usize; + + let this_collector = Arc::clone(&self.collector); + + tokio::task::spawn_blocking(move || { + let mut output = output.lock().unwrap(); + + let prev_ptr = prev_ptr as *mut Counts; + // SAFETY: Since this pointer hasn't been retired yet, we have access to it until we do retire it. + let prev = unsafe { &*prev_ptr }.pin(); + + for ((from, to, started_from), count) in &prev { + let count = count.load(Ordering::Relaxed); + if let Err(e) = output.write_fmt(format_args!( + "{prev_day},{from},{to},{started_from},{count}\n" + )) { + error!("Error writing statistics: {e}"); + } + } + + // SAFETY: The pointer can no longer be accessed from a new location since we previously overwrote the atomic pointer, and `Box::from_raw` is the correct way to drop the pointer. This task is also finished with its access to it. + unsafe { this_collector.retire(prev_ptr, |ptr, _| drop(Box::from_raw(ptr))) } + }); } self.counter(guard) @@ -116,19 +153,27 @@ impl AggregatedStats { } /// Statistics tracking for the webring -#[derive(Debug)] -pub struct Stats { +pub struct Stats { /// Aggregated statistics - aggregated: AggregatedStats, + aggregated: AggregatedStats, /// Map of IP information keyed by IP address ip_tracking: HashMap, } -impl Stats { +impl Debug for Stats { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Stats") + .field("aggregated", &self.aggregated) + .field("ip_tracking", &self.ip_tracking) + .finish() + } +} + +impl Stats { /// Creates a new instance of `Stats`. - pub fn new(now: DateTime) -> Stats { + pub fn new(now: DateTime, writer: W) -> Stats { Stats { - aggregated: AggregatedStats::new(now), + aggregated: AggregatedStats::new(now, writer), ip_tracking: HashMap::new(), } } @@ -208,11 +253,13 @@ impl Stats { #[cfg(test)] mod tests { - use std::net::IpAddr; + use std::{collections::HashSet, net::IpAddr, str::from_utf8}; use axum::http::uri::Authority; use chrono::{DateTime, Duration, NaiveDate, Utc}; + use indoc::indoc; use sarlacc::Intern; + use tokio::sync::mpsc::{self, UnboundedReceiver}; use crate::stats::IP_TRACKING_TTL; @@ -234,9 +281,37 @@ mod tests { t(timestamp).with_timezone(&TIMEZONE).date_naive() } + struct TestWriter(mpsc::UnboundedSender>); + + impl std::io::Write for TestWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.0.send(buf.to_owned()).unwrap(); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } + } + + async fn assert_same_data(rx: &mut UnboundedReceiver>, expected: &str) { + let mut data = Vec::new(); + while data.len() != expected.len() { + data.extend(rx.recv().await.unwrap()); + } + assert_eq!( + from_utf8(&data) + .unwrap() + .split('\n') + .collect::>(), + expected.split('\n').collect::>() + ); + } + #[tokio::test] async fn test_stat_tracking() { - let stats = Stats::new(t(0)); + let (tx, mut rx) = mpsc::unbounded_channel(); + let stats = Stats::new(t(0), TestWriter(tx)); stats.redirected_impl(a("0.0.0.0"), i("a.com"), i("b.com"), t(0)); stats.redirected_impl(a("0.0.0.0"), i("b.com"), i("c.com"), t(1)); @@ -285,7 +360,18 @@ mod tests { let day = Duration::days(1); + assert!(rx.is_empty()); stats.redirected_impl(a("0.0.0.0"), i("b.com"), i("c.com"), t(1) + day); + assert_same_data( + &mut rx, + indoc! {" + 0,a.com,b.com,a.com,2 + 0,b.com,c.com,a.com,1 + 0,b.com,homepage.com,a.com,1 + 0,homepage.com,c.com,a.com,1 + "}, + ) + .await; stats.redirected_impl(a("0.0.0.0"), i("c.com"), i("a.com"), t(2) + day); assert_eq!(stats.aggregated.counter(&guard).len(), 2); diff --git a/src/webring.rs b/src/webring.rs index 6908b91..e83ab87 100644 --- a/src/webring.rs +++ b/src/webring.rs @@ -20,6 +20,7 @@ with the Purdue Hackers webring. If not, see . //! Ring behavior and data structures use std::{ + io::{Empty, empty}, net::IpAddr, path::{Path, PathBuf}, sync::{ @@ -213,7 +214,7 @@ pub struct Webring { /// Discord notifier for notifying members of issues with their sites notifier: Option>, /// Statistics collected about the ring - stats: Arc, + stats: Arc>, /// Current configuration of the webring, used for detecting changes when reloading config: Arc>>, } @@ -228,7 +229,7 @@ impl Webring { members: RwLock::new(member_map_from_config_table(&config.members)), static_dir_path: config.webring.static_dir.clone(), homepage: AsyncRwLock::new(None), - stats: Arc::new(Stats::new(Utc::now())), + stats: Arc::new(Stats::new(Utc::now(), empty())), file_watcher: OnceLock::default(), base_address: config.webring.base_url(), notifier: config @@ -673,7 +674,7 @@ mod tests { use std::{ collections::HashSet, fs::{File, OpenOptions}, - io::Write as _, + io::{Write as _, empty}, path::PathBuf, sync::{ Arc, OnceLock, RwLock, @@ -716,7 +717,7 @@ mod tests { base_address: Intern::default(), base_authority: Intern::new("ring.purduehackers.com".parse().unwrap()), notifier: None, - stats: Arc::new(Stats::new(Utc::now())), + stats: Arc::new(Stats::new(Utc::now(), empty())), config: Arc::new(AsyncRwLock::new(None)), } } From 1236ad8a0ffb0d9ffdb263b4ed8644b711a53db0 Mon Sep 17 00:00:00 2001 From: Henry Rovnyak Date: Sat, 1 Nov 2025 18:32:12 -0400 Subject: [PATCH 3/4] Operate sensibly if the timing of requests isn't monotonic --- src/stats/mod.rs | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/stats/mod.rs b/src/stats/mod.rs index 6b8bf3f..f676a9f 100644 --- a/src/stats/mod.rs +++ b/src/stats/mod.rs @@ -109,9 +109,23 @@ impl AggregatedStats { fn maybe_update_counter<'a>(&'a self, now: DateTime, guard: &'a impl Guard) -> &'a Counts { let now = mk_num(now); - let prev_day = self.today.swap(now, Ordering::Relaxed); + let mut prev_day = self.today.load(Ordering::Relaxed); + + // If our "now" time is in the past (perhaps tasks got out of order or something), we want to count this redirect towards the most recent day rather than getting rid of the newest day and replacing it with data intended for the oldest day. + + while prev_day < now { + match self + .today + .compare_exchange(prev_day, now, Ordering::Relaxed, Ordering::Relaxed) + { + Ok(_) => break, + Err(new_prev_day) => { + prev_day = new_prev_day; + } + } + } - if prev_day != now { + if prev_day < now { let new_counter: *mut Counts = Box::into_raw(Box::new(HashMap::new())); // We need this guard to go into our task so it needs to be owned @@ -372,10 +386,24 @@ mod tests { "}, ) .await; + stats.redirected_impl(a("0.0.0.0"), i("b.com"), i("c.com"), t(4)); stats.redirected_impl(a("0.0.0.0"), i("c.com"), i("a.com"), t(2) + day); assert_eq!(stats.aggregated.counter(&guard).len(), 2); - stats.assert_stat_entry(("b.com", "c.com", "b.com"), 1); + stats.assert_stat_entry(("b.com", "c.com", "b.com"), 2); + stats.assert_stat_entry(("c.com", "a.com", "b.com"), 1); + + stats.redirected_impl(a("0.0.0.0"), i("c.com"), i("a.com"), t(2) + day + day); + assert_same_data( + &mut rx, + indoc! {" + 1,b.com,c.com,b.com,2 + 1,c.com,a.com,b.com,1 + "}, + ) + .await; + + assert_eq!(stats.aggregated.counter(&guard).len(), 1); stats.assert_stat_entry(("c.com", "a.com", "b.com"), 1); } } From b760b112f17e51fb392461100449158082192587 Mon Sep 17 00:00:00 2001 From: Henry Rovnyak Date: Tue, 18 Nov 2025 21:59:12 -0500 Subject: [PATCH 4/4] Improve comments --- src/stats/mod.rs | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/stats/mod.rs b/src/stats/mod.rs index f676a9f..2bdd289 100644 --- a/src/stats/mod.rs +++ b/src/stats/mod.rs @@ -111,7 +111,7 @@ impl AggregatedStats { let mut prev_day = self.today.load(Ordering::Relaxed); - // If our "now" time is in the past (perhaps tasks got out of order or something), we want to count this redirect towards the most recent day rather than getting rid of the newest day and replacing it with data intended for the oldest day. + // If our "now" time is in the past relative to "today" (perhaps tasks got out of order or something), we want to count this redirect towards the most recent day rather than replacing and writing the data. while prev_day < now { match self @@ -128,15 +128,27 @@ impl AggregatedStats { if prev_day < now { let new_counter: *mut Counts = Box::into_raw(Box::new(HashMap::new())); - // We need this guard to go into our task so it needs to be owned - let guard_owned = self.collector.enter(); - - // Release to synchronize-with `counter` and Acquire to ensure that we can see the initialization of the previous one so that we can properly access and write it. - let prev_ptr = guard_owned.swap(&self.counter, new_counter, Ordering::AcqRel); + // Release to synchronize-with `counter` and Acquire to ensure that we can see the initialization of the previous one so that we can properly read it. + // + // We do not need to guard the `prev_ptr` because it will not be retired by any other tasks in the meantime. Proof: + // 1. The only code path where we could retire a pointer stored in this location is this one, and this code path is guaranteed to retire the value of `prev_ptr`. + // 2. The only way for our `prev_ptr` to get retired while we are holding it is for another task in this code branch to also hold an identical `prev_ptr` at the same time as us. + // 3. Suppose for contradiction that there exists a task $x$ that is different from this one $y$ such that $x$'s `prev_ptr` equals our `prev_ptr`. + // 4. `prev_ptr` is valid by the invariant on `self.counter`. + // 5. For all tasks $t∈T$ on this code path, its value of `new_counter` is valid at the same time that its `prev_ptr` is valid, because they both need to be valid when `swap` is performed by the invariant. + // 6. $x$'s and $y$'s `prev_ptr` is valid during all swaps between $x$'s and $y$'s inclusive in `self.counter`'s total order, since we suppose that $x$ is still holding `prev_ptr` at least until $y$ calls `swap`. + // 7. For any task $s∈S⊆T$ where $s$'s call to `swap` is between $x$'s and $y$'s inclusive in the total order, by (5) and (6), $s$'s `new_counter` is valid at the same time that `prev_ptr` is. + // 8. By correctness of the memory allocator and (7), $s$'s `new_counter` != `prev_ptr`. + // 9. The task previous to this one wrote its `new_counter` to `self.counter`, which by (8) does not equal `prev_ptr` + // 10. That value is stored in `prev_ptr` in $y$ due to it being swapped out. + // 11. `prev_ptr` != `prev_ptr`. This is a contradiction, proving that the situation is impossible and we do not need to guard `prev_ptr`. + // + // Why not just guard it anyways for safety? Because we would need to transfer the guard into our `task::spawn_blocking`, and since the lifetime of a guard is tied to a lifetime of a collector, and since isn't implemented, we cannot move the guard into the task, therefore we cannot guard `prev_ptr`. + let prev_ptr = self.counter.swap(new_counter, Ordering::AcqRel); let output = Arc::clone(&self.output); - // Allow it to be moved to our task + // Allow it to be moved into our task let prev_ptr = prev_ptr as usize; let this_collector = Arc::clone(&self.collector);