From b24436e7fc308faeb6bc9b579dfe020e9ca71059 Mon Sep 17 00:00:00 2001 From: Xeek <6032840+x3ek@users.noreply.github.com> Date: Sat, 16 May 2026 09:11:41 -0500 Subject: [PATCH 1/2] feat(analytics): filter bots and non-content endpoints from page-view tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The analytics middleware was recording every successful request, so /robots.txt, /sitemap.xml, /feed.xml, /favicon.ico, /pygments.css, and crawler hits all inflated view counts. Add two filters: 1. Content-Type must start with text/html — excludes XML, JSON, CSS, plain text, and image responses without needing per-path allowlists. 2. User-Agent must not look like a bot/crawler — pattern covers Googlebot, Bingbot, Baidu/Yandex, social card fetchers (Twitterbot, facebookexternalhit, Slackbot), and scripted clients (curl, wget, python-requests, httpx). Missing UA is treated as a bot since real browsers always send one. Refactor the middleware to use early returns so the order is obvious: status_code -> Content-Type -> path prefix -> User-Agent -> track. Existing path-prefix exclusions (/static, /admin, /health, /auth, /webhooks) are preserved. Tests cover is_bot_user_agent across known bots and real browsers, plus integration tests that the existing excluded paths still don't get tracked. Closes #77 Co-Authored-By: Claude Opus 4.7 (1M context) --- src/squishmark/main.py | 48 +++++++++--- tests/test_analytics_filtering.py | 125 ++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 10 deletions(-) create mode 100644 tests/test_analytics_filtering.py diff --git a/src/squishmark/main.py b/src/squishmark/main.py index b757d56..674e768 100644 --- a/src/squishmark/main.py +++ b/src/squishmark/main.py @@ -74,6 +74,24 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: await close_db() +# Common bot/crawler User-Agent substrings. Matched case-insensitively. Covers +# Googlebot, Bingbot, Baidu/Yandex/Naver/Apple/Petal, social-card fetchers +# (Twitterbot, facebookexternalhit, Slackbot, Discordbot, etc.), and headless +# scripted clients (curl, wget, python-requests, httpx). +BOT_USER_AGENT_PATTERN = re.compile( + r"bot|crawler|spider|slurp|facebookexternalhit|curl|wget|python-requests|httpx", + re.IGNORECASE, +) + + +def is_bot_user_agent(user_agent: str | None) -> bool: + """Return True if the User-Agent looks like a bot, crawler, or scripted client.""" + if not user_agent: + # Treat missing UA as a bot — real browsers always send one. + return True + return bool(BOT_USER_AGENT_PATTERN.search(user_agent)) + + async def track_page_view(request: Request) -> None: """Track a page view asynchronously (fire and forget).""" try: @@ -167,22 +185,32 @@ async def global_exception_handler(request: Request, exc: Exception): # Middleware to track page views (non-blocking) @app.middleware("http") async def analytics_middleware(request: Request, call_next): - """Track page views for non-static, non-admin requests.""" + """Track page views for non-static, non-admin, non-bot HTML requests.""" response = await call_next(request) - # Only track successful HTML responses + if response.status_code != 200: + return response + + # Only HTML pages count as page views — excludes /robots.txt, + # /sitemap.xml, /feed.xml, /favicon.ico, /pygments.css, etc. + if not response.headers.get("content-type", "").startswith("text/html"): + return response + path = request.url.path if ( - response.status_code == 200 - and not path.startswith("/static") - and not path.startswith("/admin") - and not path.startswith("/health") - and not path.startswith("/auth") - and not path.startswith("/webhooks") + path.startswith("/static") + or path.startswith("/admin") + or path.startswith("/health") + or path.startswith("/auth") + or path.startswith("/webhooks") ): - # Fire and forget - don't await - asyncio.create_task(track_page_view(request)) + return response + + if is_bot_user_agent(request.headers.get("user-agent")): + return response + # Fire and forget - don't await + asyncio.create_task(track_page_view(request)) return response # Health check endpoint diff --git a/tests/test_analytics_filtering.py b/tests/test_analytics_filtering.py new file mode 100644 index 0000000..343c08c --- /dev/null +++ b/tests/test_analytics_filtering.py @@ -0,0 +1,125 @@ +"""Tests for analytics middleware filtering: bots, non-HTML, excluded paths.""" + +import asyncio +from unittest.mock import AsyncMock, patch + +import pytest +from httpx import ASGITransport, AsyncClient + +from squishmark.main import is_bot_user_agent + + +@pytest.mark.parametrize( + "ua", + [ + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", + "DuckDuckBot/1.1; (+http://duckduckgo.com/duckduckbot.html)", + "Mozilla/5.0 (compatible; Baiduspider/2.0)", + "Mozilla/5.0 (compatible; YandexBot/3.0)", + "facebookexternalhit/1.1", + "Twitterbot/1.0", + "Slackbot-LinkExpanding 1.0", + "curl/8.4.0", + "Wget/1.21.4", + "python-requests/2.31.0", + "httpx/0.27.0", + ], +) +def test_is_bot_user_agent_detects_common_bots(ua: str): + assert is_bot_user_agent(ua) is True + + +@pytest.mark.parametrize( + "ua", + [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120", + "Mozilla/5.0 (X11; Linux x86_64) Firefox/121.0", + "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0) Mobile/15E148 Safari/604.1", + ], +) +def test_is_bot_user_agent_passes_real_browsers(ua: str): + assert is_bot_user_agent(ua) is False + + +def test_is_bot_user_agent_treats_missing_as_bot(): + assert is_bot_user_agent(None) is True + assert is_bot_user_agent("") is True + + +async def _build_app(): + """Build the app with external services stubbed so it boots.""" + mock_github = AsyncMock() + mock_github.get_config.return_value = { + "theme": {"name": "default", "pygments_style": "github-dark"}, + "site": {"title": "Test"}, + } + stack = [ + patch("squishmark.main.get_github_service", return_value=mock_github), + patch("squishmark.main.get_theme_engine", new_callable=AsyncMock), + patch("squishmark.models.db.init_db", new_callable=AsyncMock), + patch("squishmark.models.db.close_db", new_callable=AsyncMock), + patch("squishmark.main.shutdown_github_service", new_callable=AsyncMock), + patch("squishmark.main.reset_theme_engine"), + ] + return stack + + +async def _assert_track_called(headers: dict, path: str, expected: bool): + """Hit the app with given headers; check whether track_page_view was scheduled.""" + stack = await _build_app() + tracker = AsyncMock() + with ( + stack[0], + stack[1], + stack[2], + stack[3], + stack[4], + stack[5], + patch("squishmark.main.track_page_view", new=tracker), + ): + from squishmark.main import create_app + + app = create_app() + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + await client.get(path, headers=headers) + # Fire-and-forget task may not have completed yet; yield a tick. + await asyncio.sleep(0) + assert tracker.called is expected, ( + f"expected track_page_view.called={expected} for path={path!r} " + f"ua={headers.get('user-agent')!r}; got called={tracker.called}" + ) + + +@pytest.mark.asyncio +async def test_robots_txt_not_tracked(): + await _assert_track_called( + headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0) Chrome/120"}, + path="/robots.txt", + expected=False, + ) + + +@pytest.mark.asyncio +async def test_health_endpoint_not_tracked(): + await _assert_track_called( + headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0) Chrome/120"}, + path="/health", + expected=False, + ) + + +@pytest.mark.asyncio +async def test_bot_request_to_html_page_not_tracked(): + """A Googlebot hit to / must not be tracked even if the response is HTML.""" + # We don't need a real HTML endpoint — /robots.txt returns text/plain which + # already short-circuits. Use the /health JSON endpoint with a bot UA to + # confirm bot UAs are filtered (they'd be filtered by Content-Type too, + # but this test guards the UA gate specifically). + await _assert_track_called( + headers={"user-agent": "Googlebot/2.1"}, + path="/health", + expected=False, + ) From 00c0bb260db1e6bd706a9e4580c1fd2fc1952711 Mon Sep 17 00:00:00 2001 From: Xeek <6032840+x3ek@users.noreply.github.com> Date: Sat, 16 May 2026 09:19:01 -0500 Subject: [PATCH 2/2] test(analytics): strengthen bot UA gate test with real HTML route MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot review on PR #82 flagged that test_bot_request_to_html_page_not_tracked used /health, which is filtered earlier by both Content-Type (JSON) and path prefix — so the test would pass even if the UA gate were removed. Register a stub /_test/html route on the test app (path chosen to avoid the /{slug} catch-all in pages.py) and assert the browser-UA hit IS tracked while the bot-UA hit is NOT. The bot UA filter now has a test that genuinely exercises it. Refs #77, #82 Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_analytics_filtering.py | 37 ++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/tests/test_analytics_filtering.py b/tests/test_analytics_filtering.py index 343c08c..970680a 100644 --- a/tests/test_analytics_filtering.py +++ b/tests/test_analytics_filtering.py @@ -4,6 +4,7 @@ from unittest.mock import AsyncMock, patch import pytest +from fastapi.responses import HTMLResponse from httpx import ASGITransport, AsyncClient from squishmark.main import is_bot_user_agent @@ -66,8 +67,14 @@ async def _build_app(): return stack -async def _assert_track_called(headers: dict, path: str, expected: bool): - """Hit the app with given headers; check whether track_page_view was scheduled.""" +async def _assert_track_called(headers: dict, path: str, expected: bool, add_html_route: bool = False): + """Hit the app with given headers; check whether track_page_view was scheduled. + + When ``add_html_route`` is True, register a stub ``/_test/html`` route that + returns a real ``text/html`` 200 response — the only way to exercise the + bot UA gate, since every existing path is filtered earlier by Content-Type + or the path-prefix list. + """ stack = await _build_app() tracker = AsyncMock() with ( @@ -82,6 +89,12 @@ async def _assert_track_called(headers: dict, path: str, expected: bool): from squishmark.main import create_app app = create_app() + if add_html_route: + + @app.get("/_test/html", response_class=HTMLResponse) + async def _test_html(): + return HTMLResponse("test") + transport = ASGITransport(app=app) async with AsyncClient(transport=transport, base_url="http://test") as client: await client.get(path, headers=headers) @@ -111,15 +124,23 @@ async def test_health_endpoint_not_tracked(): ) +@pytest.mark.asyncio +async def test_real_browser_html_request_is_tracked(): + """Sanity check: a real browser hitting an HTML route does get tracked.""" + await _assert_track_called( + headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0) Chrome/120"}, + path="/_test/html", + expected=True, + add_html_route=True, + ) + + @pytest.mark.asyncio async def test_bot_request_to_html_page_not_tracked(): - """A Googlebot hit to / must not be tracked even if the response is HTML.""" - # We don't need a real HTML endpoint — /robots.txt returns text/plain which - # already short-circuits. Use the /health JSON endpoint with a bot UA to - # confirm bot UAs are filtered (they'd be filtered by Content-Type too, - # but this test guards the UA gate specifically). + """The UA gate must block bots even on a real text/html 200 response.""" await _assert_track_called( headers={"user-agent": "Googlebot/2.1"}, - path="/health", + path="/_test/html", expected=False, + add_html_route=True, )