Skip to content

Commit cd8d937

Browse files
epiphanyplx and claude authored
Fix downtime spam and speed up post-recovery detection (#1)
* Detect pre-release site downtime instead of spamming errors

  Wizards takes the Secret Lair site down for hours before a new drop, which used to produce one Discord error notification every 15 minutes per page. Now the first 2 consecutive failed store-page checks (or a maintenance holding page) flip the monitor into maintenance mode, fire a single "Secret Lair Releasing Soon" notification, and poll every 60s instead of every 3 minutes. When the site comes back, the next check runs immediately so the new drop is announced within a poll interval instead of 15+ minutes later.

  Also adds Cache-Control/Pragma no-cache headers plus a cache-busting query param so stale CDN responses don't delay detection of recovery.

* Burst-poll the top of the hour while in maintenance mode

  Historically Secret Lair drops have gone live at :00, so a flat 60s maintenance poll can still leave us up to a minute late. Within MAINTENANCE_BURST_WINDOW_BEFORE/AFTER minutes of :00, switch to a 15s poll with ±2s jitter (worst-case detection ~17s after the drop appears). Outside that window, stay on the slower 60s maintenance interval so we don't hammer the site for hours of every outage.

  Also drop the request timeout to 10s while in maintenance mode so a hung connection can't eat a whole burst interval.

* Ignore Python bytecode artifacts

---------

Co-authored-by: Claude <noreply@anthropic.com>
1 parent bae03a3 commit cd8d937

3 files changed

Lines changed: 208 additions & 18 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,4 @@
11
.env
22
data/
3+
__pycache__/
4+
*.pyc

README.md

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ Monitors the [Secret Lair](https://secretlair.wizards.com/us/) store and [Chaos
1212

1313
- Detects new product additions by tracking product IDs
1414
- Special "CHAOS VAULT IS OPEN" alert when the vault transitions from closed to active
15+
- Detects pre-release site downtime and fires a single "Releasing Soon" alert (instead of spamming for every failed poll), then polls faster so the new drop is announced quickly when the site returns
1516
- Rich Discord embeds with product name, price, and direct link
1617
- Persists state across restarts (won't re-notify on reboot)
1718
- First run silently catalogues existing products (no spam on initial deploy)
@@ -55,7 +56,14 @@ All config is via environment variables (set in `.env`):
5556
| Variable | Default | Description |
5657
|---|---|---|
5758
| `DISCORD_WEBHOOK_URL` | *(required)* | Discord webhook URL |
58-
| `POLL_INTERVAL_SECONDS` | `180` | Seconds between checks |
59+
| `POLL_INTERVAL_SECONDS` | `180` | Seconds between checks when the site is up |
60+
| `MAINTENANCE_POLL_INTERVAL_SECONDS` | `60` | Poll interval used while the site is down, *outside* the top-of-hour window |
61+
| `MAINTENANCE_BURST_INTERVAL_SECONDS` | `15` | Fast poll interval while in maintenance mode and near :00 (drops historically go live on the hour) |
62+
| `MAINTENANCE_BURST_JITTER_SECONDS` | `2` | ±jitter added to the burst interval to avoid looking like a cron job |
63+
| `MAINTENANCE_BURST_WINDOW_BEFORE` | `2` | Minutes before :00 to start burst-polling |
64+
| `MAINTENANCE_BURST_WINDOW_AFTER` | `2` | Minutes after :00 to keep burst-polling |
65+
| `MAINTENANCE_REQUEST_TIMEOUT_SECONDS` | `10` | HTTP timeout while in maintenance mode (so a hung request can't eat a burst interval) |
66+
| `MAINTENANCE_STRIKE_THRESHOLD` | `2` | Consecutive failed checks required before declaring maintenance mode (filters transient blips) |
5967
| `MONITOR_SHOP_ALL` | `true` | Also monitor `/us/shopall` |
6068
| `NOTIFY_ON_START` | `false` | Notify for existing products on first run |
6169
| `LOG_LEVEL` | `INFO` | Log verbosity |

monitor.py

Lines changed: 197 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
import json
99
import logging
1010
import os
11+
import random
1112
import re
1213
import sys
1314
import time
@@ -22,6 +23,35 @@
2223
# ---------------------------------------------------------------------------
2324
DISCORD_WEBHOOK_URL = os.environ.get("DISCORD_WEBHOOK_URL", "")
2425
POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "180")) # 3 min default
26+
# While the site is down (likely a pre-release maintenance window), poll faster
27+
# so we catch the new drop quickly once it's back.
28+
MAINTENANCE_POLL_INTERVAL_SECONDS = int(
29+
os.environ.get("MAINTENANCE_POLL_INTERVAL_SECONDS", "60")
30+
)
31+
# Historical Secret Lair drops have gone live on the hour, so while in
32+
# maintenance mode we ramp up to a near-realtime poll for a few minutes on
33+
# either side of :00 and back off to the slower maintenance interval otherwise.
34+
MAINTENANCE_BURST_INTERVAL_SECONDS = int(
35+
os.environ.get("MAINTENANCE_BURST_INTERVAL_SECONDS", "15")
36+
)
37+
MAINTENANCE_BURST_JITTER_SECONDS = int(
38+
os.environ.get("MAINTENANCE_BURST_JITTER_SECONDS", "2")
39+
)
40+
# How many minutes before / after :00 to burst-poll.
41+
MAINTENANCE_BURST_WINDOW_BEFORE = int(
42+
os.environ.get("MAINTENANCE_BURST_WINDOW_BEFORE", "2")
43+
)
44+
MAINTENANCE_BURST_WINDOW_AFTER = int(
45+
os.environ.get("MAINTENANCE_BURST_WINDOW_AFTER", "2")
46+
)
47+
# Shorter request timeout while the site is down so a hung request can't eat
48+
# the whole poll interval.
49+
MAINTENANCE_REQUEST_TIMEOUT_SECONDS = int(
50+
os.environ.get("MAINTENANCE_REQUEST_TIMEOUT_SECONDS", "10")
51+
)
52+
# How many consecutive failed checks before declaring maintenance mode. Filters
53+
# out transient network blips so we don't ping Discord for one-off failures.
54+
MAINTENANCE_STRIKE_THRESHOLD = int(os.environ.get("MAINTENANCE_STRIKE_THRESHOLD", "2"))
2555
STATE_FILE = os.environ.get("STATE_FILE", "/data/state.json")
2656
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO").upper()
2757
USER_AGENT = os.environ.get(
@@ -84,19 +114,52 @@ def save_state(state: dict) -> None:
84114
PRODUCT_LINK_RE = re.compile(r"/us/product/(\d+)(?:/([^\"'\s]*))?")
85115

86116

87-
def fetch_page(url: str) -> str | None:
88-
"""Fetch a page's HTML. Returns None on failure."""
89-
headers = {"User-Agent": USER_AGENT, "Accept-Language": "en-US,en;q=0.9"}
117+
def fetch_page(url: str, timeout: int = 30) -> str | None:
118+
"""Fetch a page's HTML. Returns None on failure.
119+
120+
Sends cache-busting headers so we don't get a stale 'we'll be right back'
121+
page from a CDN edge after Wizards brings the site back online.
122+
"""
123+
headers = {
124+
"User-Agent": USER_AGENT,
125+
"Accept-Language": "en-US,en;q=0.9",
126+
"Cache-Control": "no-cache",
127+
"Pragma": "no-cache",
128+
}
129+
sep = "&" if "?" in url else "?"
130+
bust_url = f"{url}{sep}_={int(time.time())}"
90131
try:
91-
resp = requests.get(url, headers=headers, timeout=30)
132+
resp = requests.get(bust_url, headers=headers, timeout=timeout)
92133
resp.raise_for_status()
93134
return resp.text
94135
except requests.RequestException as e:
95-
log.error("Failed to fetch %s: %s", url, e)
96-
send_discord_error_notification(str(e), context=f"Failed to fetch `{url}`")
136+
log.warning("Failed to fetch %s: %s", url, e)
97137
return None
98138

99139

140+
# Phrases that indicate Wizards has the site behind a maintenance/holding page
141+
# (HTTP 200, but the body says it's down). Matched case-insensitively.
142+
_MAINTENANCE_INDICATORS = (
143+
"we'll be right back",
144+
"we will be right back",
145+
"be right back",
146+
"under maintenance",
147+
"site is temporarily unavailable",
148+
"temporarily unavailable",
149+
"503 service unavailable",
150+
"service is unavailable",
151+
"site is currently down",
152+
)
153+
154+
155+
def is_maintenance_page(html: str | None) -> bool:
156+
"""Detect a 'site down' holding page that loaded as 200 OK."""
157+
if not html:
158+
return False
159+
lower = html.lower()
160+
return any(indicator in lower for indicator in _MAINTENANCE_INDICATORS)
161+
162+
100163
def parse_products(html: str) -> dict[str, dict]:
101164
"""
102165
Extract products from HTML.
@@ -233,6 +296,45 @@ def send_discord_notification(products: list[dict], source: str) -> bool:
233296
return True
234297

235298

299+
def send_release_soon_notification() -> bool:
    """Post the one-shot "Releasing Soon" alert to the Discord webhook.

    Sent when the monitor enters maintenance mode, in place of the per-fetch
    error notifications that used to fire throughout the pre-release window.

    Returns:
        True when the webhook request succeeded. False when no webhook URL is
        configured or the request failed (the failure is logged).
    """
    if not DISCORD_WEBHOOK_URL:
        return False

    embed = {
        "title": "Secret Lair site is down",
        "url": "https://secretlair.wizards.com/us/",
        "color": 0xF1C40F,  # Yellow
        "description": (
            "The Secret Lair site is currently unavailable, which usually "
            "means a new drop is about to go live. Monitoring more "
            "frequently — you'll get a notification as soon as the new "
            "products appear."
        ),
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    message = {
        "username": "Secret Lair Monitor",
        "avatar_url": "https://cdn-prod.scalefast.com/public/assets/img/resized/"
        "wizardsofthecoast-secret-lair/favicon-32.png",
        "content": "**🃏 Secret Lair Releasing Soon!**",
        "embeds": [embed],
    }

    try:
        response = requests.post(DISCORD_WEBHOOK_URL, json=message, timeout=15)
        response.raise_for_status()
    except requests.RequestException as exc:
        log.error("Failed to send 'Releasing Soon' notification: %s", exc)
        return False

    log.info("'Releasing Soon' notification sent")
    return True
336+
337+
236338
def send_chaos_vault_opened_notification() -> bool:
237339
"""Special notification when the Chaos Vault transitions from closed to open."""
238340
if not DISCORD_WEBHOOK_URL:
@@ -312,16 +414,83 @@ def send_discord_error_notification(error_msg: str, context: str = "") -> bool:
312414
# Main loop
313415
# ---------------------------------------------------------------------------
314416

315-
def run_check(state: dict) -> dict:
316-
"""Run one check cycle across all monitored pages."""
417+
def in_top_of_hour_burst_window() -> bool:
418+
"""True when the wall clock is in the configured top-of-hour burst window.
419+
420+
Minute-of-hour is the same across all whole-hour timezones, so we don't
421+
need to know which timezone Wizards releases in.
422+
"""
423+
minute = datetime.now().minute
424+
return (
425+
minute >= 60 - MAINTENANCE_BURST_WINDOW_BEFORE
426+
or minute <= MAINTENANCE_BURST_WINDOW_AFTER
427+
)
428+
429+
430+
def compute_sleep_interval(in_maintenance: bool) -> float:
    """Choose how long to sleep before the next check.

    Args:
        in_maintenance: Whether the monitor currently considers the site down.

    Returns:
        Seconds to sleep:
        - ``POLL_INTERVAL_SECONDS`` when not in maintenance mode.
        - In maintenance mode inside the top-of-hour burst window:
          ``MAINTENANCE_BURST_INTERVAL_SECONDS`` plus a uniform random offset
          of up to ±``MAINTENANCE_BURST_JITTER_SECONDS``, never below 1.0.
        - Otherwise in maintenance mode:
          ``MAINTENANCE_POLL_INTERVAL_SECONDS``.
    """
    if in_maintenance and in_top_of_hour_burst_window():
        offset = random.uniform(
            -MAINTENANCE_BURST_JITTER_SECONDS, MAINTENANCE_BURST_JITTER_SECONDS
        )
        # Floor at one second so an oversized jitter setting cannot produce
        # a zero or negative sleep.
        return max(1.0, MAINTENANCE_BURST_INTERVAL_SECONDS + offset)

    base_seconds = (
        MAINTENANCE_POLL_INTERVAL_SECONDS
        if in_maintenance
        else POLL_INTERVAL_SECONDS
    )
    return float(base_seconds)
440+
441+
442+
def run_check(state: dict) -> tuple[dict, bool]:
443+
"""Run one check cycle across all monitored pages.
444+
445+
Returns (state, in_maintenance). When the site is in maintenance mode we
446+
skip product parsing and let the caller use a faster poll interval.
447+
"""
317448
known = state.get("known_products", {})
318449
chaos_vault_was_active = state.get("chaos_vault_active", False)
450+
was_in_maintenance = state.get("maintenance_mode", False)
451+
strikes = state.get("maintenance_strikes", 0)
452+
453+
# While we were already in maintenance mode, use a tight request timeout so
454+
# one hung connection can't blow past the burst-poll interval.
455+
fetch_timeout = (
456+
MAINTENANCE_REQUEST_TIMEOUT_SECONDS if was_in_maintenance else 30
457+
)
458+
459+
# Use the main store page as the canonical signal for site health. The
460+
# chaos vault page is legitimately empty most of the time, and shop_all
461+
# can lag, so we don't want either to be the trigger.
462+
store_html = fetch_page(PAGES["store"], timeout=fetch_timeout)
463+
store_down = store_html is None or is_maintenance_page(store_html)
319464

465+
if store_down:
466+
strikes += 1
467+
log.info(
468+
"Secret Lair store appears down (strike %d/%d)",
469+
strikes,
470+
MAINTENANCE_STRIKE_THRESHOLD,
471+
)
472+
state["maintenance_strikes"] = strikes
473+
if strikes >= MAINTENANCE_STRIKE_THRESHOLD and not was_in_maintenance:
474+
send_release_soon_notification()
475+
state["maintenance_mode"] = True
476+
state["last_check"] = datetime.now(timezone.utc).isoformat()
477+
return state, state.get("maintenance_mode", False)
478+
479+
# Site is up — reset strike counter and announce recovery if needed.
480+
state["maintenance_strikes"] = 0
481+
if was_in_maintenance:
482+
log.info("Secret Lair site is back online — checking for new products")
483+
state["maintenance_mode"] = False
484+
485+
# Process the store page we already fetched, then the rest.
320486
for source, url in PAGES.items():
321-
log.debug("Checking %s: %s", source, url)
322-
html = fetch_page(url)
323-
if html is None:
324-
continue
487+
if source == "store":
488+
html = store_html
489+
else:
490+
log.debug("Checking %s: %s", source, url)
491+
html = fetch_page(url, timeout=fetch_timeout)
492+
if html is None:
493+
continue
325494

326495
# Special Chaos Vault open/close detection
327496
if source == "chaos_vault":
@@ -351,7 +520,7 @@ def run_check(state: dict) -> dict:
351520

352521
state["known_products"] = known
353522
state["last_check"] = datetime.now(timezone.utc).isoformat()
354-
return state
523+
return state, False
355524

356525

357526
def main() -> None:
@@ -381,7 +550,7 @@ def main() -> None:
381550
if not notify_on_start:
382551
globals()["DISCORD_WEBHOOK_URL"] = ""
383552

384-
state = run_check(state)
553+
state, _ = run_check(state)
385554
save_state(state)
386555

387556
if not notify_on_start:
@@ -392,19 +561,30 @@ def main() -> None:
392561
len(state.get("known_products", {})),
393562
)
394563

564+
in_maintenance = state.get("maintenance_mode", False)
395565
while True:
566+
was_in_maintenance = in_maintenance
396567
try:
397-
state = run_check(state)
568+
state, in_maintenance = run_check(state)
398569
save_state(state)
399570
except Exception as e:
400571
log.exception("Unhandled error during check cycle")
401572
send_discord_error_notification(
402573
f"{type(e).__name__}: {e}",
403574
context="Unhandled error during check cycle",
404575
)
576+
in_maintenance = state.get("maintenance_mode", False)
577+
578+
# If we just transitioned from maintenance → up, skip the sleep and
579+
# check again immediately so the new drop is announced as soon as it
580+
# appears rather than waiting another poll interval.
581+
if was_in_maintenance and not in_maintenance:
582+
log.info("Recovery detected — re-checking immediately")
583+
continue
405584

406-
log.debug("Sleeping %ds until next check", POLL_INTERVAL_SECONDS)
407-
time.sleep(POLL_INTERVAL_SECONDS)
585+
interval = compute_sleep_interval(in_maintenance)
586+
log.debug("Sleeping %.1fs until next check", interval)
587+
time.sleep(interval)
408588

409589

410590
if __name__ == "__main__":

0 commit comments

Comments
 (0)