From b24436e7fc308faeb6bc9b579dfe020e9ca71059 Mon Sep 17 00:00:00 2001
From: Xeek <6032840+x3ek@users.noreply.github.com>
Date: Sat, 16 May 2026 09:11:41 -0500
Subject: [PATCH 1/2] feat(analytics): filter bots and non-content endpoints
 from page-view tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The analytics middleware was recording every successful request, so /robots.txt, /sitemap.xml, /feed.xml, /favicon.ico, /pygments.css, and crawler hits all inflated view counts. Add two filters:

1. Content-Type must start with text/html — excludes XML, JSON, CSS, plain text, and image responses without needing a per-path exclusion list.

2. User-Agent must not look like a bot/crawler — pattern covers Googlebot, Bingbot, Baidu/Yandex, social card fetchers (Twitterbot, facebookexternalhit, Slackbot), and scripted clients (curl, wget, python-requests, httpx). Missing UA is treated as a bot since real browsers always send one.

Refactor the middleware to use early returns so the order is obvious: status_code -> Content-Type -> path prefix -> User-Agent -> track. Existing path-prefix exclusions (/static, /admin, /health, /auth, /webhooks) are preserved.

Tests cover is_bot_user_agent across known bots, real browsers, and missing/empty User-Agent values, plus integration tests confirming that non-HTML responses (/robots.txt) and excluded path prefixes (/health) are not tracked.

Closes #77

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/squishmark/main.py            |  48 +++++++++---
 tests/test_analytics_filtering.py | 125 ++++++++++++++++++++++++++++++
 2 files changed, 163 insertions(+), 10 deletions(-)
 create mode 100644 tests/test_analytics_filtering.py

diff --git a/src/squishmark/main.py b/src/squishmark/main.py
index b757d56..674e768 100644
--- a/src/squishmark/main.py
+++ b/src/squishmark/main.py
@@ -74,6 +74,24 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
     await close_db()
 
 
+# Common bot/crawler User-Agent substrings. Matched case-insensitively. Covers
+# Googlebot, Bingbot, Baidu/Yandex/Naver/Apple/Petal, social-card fetchers
+# (Twitterbot, facebookexternalhit, Slackbot, Discordbot, etc.), and headless
+# scripted clients (curl, wget, python-requests, httpx).
+BOT_USER_AGENT_PATTERN = re.compile(
+    r"bot|crawler|spider|slurp|facebookexternalhit|curl|wget|python-requests|httpx",
+    re.IGNORECASE,
+)
+
+
+def is_bot_user_agent(user_agent: str | None) -> bool:
+    """Return True if the User-Agent looks like a bot, crawler, or scripted client."""
+    if not user_agent:
+        # Treat missing UA as a bot — real browsers always send one.
+        return True
+    return bool(BOT_USER_AGENT_PATTERN.search(user_agent))
+
+
 async def track_page_view(request: Request) -> None:
     """Track a page view asynchronously (fire and forget)."""
     try:
@@ -167,22 +185,32 @@ async def global_exception_handler(request: Request, exc: Exception):
     # Middleware to track page views (non-blocking)
     @app.middleware("http")
     async def analytics_middleware(request: Request, call_next):
-        """Track page views for non-static, non-admin requests."""
+        """Track page views for non-static, non-admin, non-bot HTML requests."""
         response = await call_next(request)
 
-        # Only track successful HTML responses
+        if response.status_code != 200:
+            return response
+
+        # Only HTML pages count as page views — excludes /robots.txt,
+        # /sitemap.xml, /feed.xml, /favicon.ico, /pygments.css, etc.
+        if not response.headers.get("content-type", "").startswith("text/html"):
+            return response
+
         path = request.url.path
         if (
-            response.status_code == 200
-            and not path.startswith("/static")
-            and not path.startswith("/admin")
-            and not path.startswith("/health")
-            and not path.startswith("/auth")
-            and not path.startswith("/webhooks")
+            path.startswith("/static")
+            or path.startswith("/admin")
+            or path.startswith("/health")
+            or path.startswith("/auth")
+            or path.startswith("/webhooks")
         ):
-            # Fire and forget - don't await
-            asyncio.create_task(track_page_view(request))
+            return response
+
+        if is_bot_user_agent(request.headers.get("user-agent")):
+            return response
 
+        # Fire and forget - don't await
+        asyncio.create_task(track_page_view(request))
         return response
 
     # Health check endpoint
diff --git a/tests/test_analytics_filtering.py b/tests/test_analytics_filtering.py
new file mode 100644
index 0000000..343c08c
--- /dev/null
+++ b/tests/test_analytics_filtering.py
@@ -0,0 +1,125 @@
+"""Tests for analytics middleware filtering: bots, non-HTML, excluded paths."""
+
+import asyncio
+from unittest.mock import AsyncMock, patch
+
+import pytest
+from httpx import ASGITransport, AsyncClient
+
+from squishmark.main import is_bot_user_agent
+
+
+@pytest.mark.parametrize(
+    "ua",
+    [
+        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
+        "DuckDuckBot/1.1; (+http://duckduckgo.com/duckduckbot.html)",
+        "Mozilla/5.0 (compatible; Baiduspider/2.0)",
+        "Mozilla/5.0 (compatible; YandexBot/3.0)",
+        "facebookexternalhit/1.1",
+        "Twitterbot/1.0",
+        "Slackbot-LinkExpanding 1.0",
+        "curl/8.4.0",
+        "Wget/1.21.4",
+        "python-requests/2.31.0",
+        "httpx/0.27.0",
+    ],
+)
+def test_is_bot_user_agent_detects_common_bots(ua: str):
+    assert is_bot_user_agent(ua) is True
+
+
+@pytest.mark.parametrize(
+    "ua",
+    [
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120",
+        "Mozilla/5.0 (X11; Linux x86_64) Firefox/121.0",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0) Mobile/15E148 Safari/604.1",
+    ],
+)
+def test_is_bot_user_agent_passes_real_browsers(ua: str):
+    assert is_bot_user_agent(ua) is False
+
+
+def test_is_bot_user_agent_treats_missing_as_bot():
+    assert is_bot_user_agent(None) is True
+    assert is_bot_user_agent("") is True
+
+
+async def _build_app():
+    """Build the app with external services stubbed so it boots."""
+    mock_github = AsyncMock()
+    mock_github.get_config.return_value = {
+        "theme": {"name": "default", "pygments_style": "github-dark"},
+        "site": {"title": "Test"},
+    }
+    stack = [
+        patch("squishmark.main.get_github_service", return_value=mock_github),
+        patch("squishmark.main.get_theme_engine", new_callable=AsyncMock),
+        patch("squishmark.models.db.init_db", new_callable=AsyncMock),
+        patch("squishmark.models.db.close_db", new_callable=AsyncMock),
+        patch("squishmark.main.shutdown_github_service", new_callable=AsyncMock),
+        patch("squishmark.main.reset_theme_engine"),
+    ]
+    return stack
+
+
+async def _assert_track_called(headers: dict, path: str, expected: bool):
+    """Hit the app with given headers; check whether track_page_view was scheduled."""
+    stack = await _build_app()
+    tracker = AsyncMock()
+    with (
+        stack[0],
+        stack[1],
+        stack[2],
+        stack[3],
+        stack[4],
+        stack[5],
+        patch("squishmark.main.track_page_view", new=tracker),
+    ):
+        from squishmark.main import create_app
+
+        app = create_app()
+        transport = ASGITransport(app=app)
+        async with AsyncClient(transport=transport, base_url="http://test") as client:
+            await client.get(path, headers=headers)
+        # Fire-and-forget task may not have completed yet; yield a tick.
+        await asyncio.sleep(0)
+        assert tracker.called is expected, (
+            f"expected track_page_view.called={expected} for path={path!r} "
+            f"ua={headers.get('user-agent')!r}; got called={tracker.called}"
+        )
+
+
+@pytest.mark.asyncio
+async def test_robots_txt_not_tracked():
+    await _assert_track_called(
+        headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0) Chrome/120"},
+        path="/robots.txt",
+        expected=False,
+    )
+
+
+@pytest.mark.asyncio
+async def test_health_endpoint_not_tracked():
+    await _assert_track_called(
+        headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0) Chrome/120"},
+        path="/health",
+        expected=False,
+    )
+
+
+@pytest.mark.asyncio
+async def test_bot_request_to_html_page_not_tracked():
+    """A Googlebot hit to / must not be tracked even if the response is HTML."""
+    # We don't need a real HTML endpoint — /robots.txt returns text/plain which
+    # already short-circuits. Use the /health JSON endpoint with a bot UA to
+    # confirm bot UAs are filtered (they'd be filtered by Content-Type too,
+    # but this test guards the UA gate specifically).
+    await _assert_track_called(
+        headers={"user-agent": "Googlebot/2.1"},
+        path="/health",
+        expected=False,
+    )

From 00c0bb260db1e6bd706a9e4580c1fd2fc1952711 Mon Sep 17 00:00:00 2001
From: Xeek <6032840+x3ek@users.noreply.github.com>
Date: Sat, 16 May 2026 09:19:01 -0500
Subject: [PATCH 2/2] test(analytics): strengthen bot UA gate test with real
 HTML route
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Copilot review on PR #82 flagged that test_bot_request_to_html_page_not_tracked used /health, which is filtered earlier by both Content-Type (JSON) and path prefix — so the test would pass even if the UA gate were removed.

Register a stub /_test/html route on the test app (path chosen to avoid the /{slug} catch-all in pages.py) and assert the browser-UA hit IS tracked while the bot-UA hit is NOT. The bot UA filter now has a test that genuinely exercises it.

Refs #77, #82

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_analytics_filtering.py | 37 ++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/tests/test_analytics_filtering.py b/tests/test_analytics_filtering.py
index 343c08c..970680a 100644
--- a/tests/test_analytics_filtering.py
+++ b/tests/test_analytics_filtering.py
@@ -4,6 +4,7 @@
 from unittest.mock import AsyncMock, patch
 
 import pytest
+from fastapi.responses import HTMLResponse
 from httpx import ASGITransport, AsyncClient
 
 from squishmark.main import is_bot_user_agent
@@ -66,8 +67,14 @@ async def _build_app():
     return stack
 
 
-async def _assert_track_called(headers: dict, path: str, expected: bool):
-    """Hit the app with given headers; check whether track_page_view was scheduled."""
+async def _assert_track_called(headers: dict, path: str, expected: bool, add_html_route: bool = False):
+    """Hit the app with given headers; check whether track_page_view was scheduled.
+
+    When ``add_html_route`` is True, register a stub ``/_test/html`` route that
+    returns a real ``text/html`` 200 response — the only way to exercise the
+    bot UA gate, since every existing path is filtered earlier by Content-Type
+    or the path-prefix list.
+    """
     stack = await _build_app()
     tracker = AsyncMock()
     with (
@@ -82,6 +89,12 @@ async def _assert_track_called(headers: dict, path: str, expected: bool):
         from squishmark.main import create_app
 
         app = create_app()
+        if add_html_route:
+
+            @app.get("/_test/html", response_class=HTMLResponse)
+            async def _test_html():
+                return HTMLResponse("<html><body>test</body></html>")
+
         transport = ASGITransport(app=app)
         async with AsyncClient(transport=transport, base_url="http://test") as client:
             await client.get(path, headers=headers)
@@ -111,15 +124,23 @@ async def test_health_endpoint_not_tracked():
     )
 
 
+@pytest.mark.asyncio
+async def test_real_browser_html_request_is_tracked():
+    """Sanity check: a real browser hitting an HTML route does get tracked."""
+    await _assert_track_called(
+        headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0) Chrome/120"},
+        path="/_test/html",
+        expected=True,
+        add_html_route=True,
+    )
+
+
 @pytest.mark.asyncio
 async def test_bot_request_to_html_page_not_tracked():
-    """A Googlebot hit to / must not be tracked even if the response is HTML."""
-    # We don't need a real HTML endpoint — /robots.txt returns text/plain which
-    # already short-circuits. Use the /health JSON endpoint with a bot UA to
-    # confirm bot UAs are filtered (they'd be filtered by Content-Type too,
-    # but this test guards the UA gate specifically).
+    """The UA gate must block bots even on a real text/html 200 response."""
     await _assert_track_called(
         headers={"user-agent": "Googlebot/2.1"},
-        path="/health",
+        path="/_test/html",
         expected=False,
+        add_html_route=True,
     )