diff --git a/.github/workflows/deploy-site.yml b/.github/workflows/deploy-site.yml index 833dbcd..28fd1e5 100644 --- a/.github/workflows/deploy-site.yml +++ b/.github/workflows/deploy-site.yml @@ -41,13 +41,30 @@ jobs: run: | set -euo pipefail + # HTML pages and the metadata files (sitemap.xml, robots.txt, llms.txt) + # share a 5-minute edge TTL with stale-while-revalidate for 1 hour. A + # Cloudflare Cache Rule on the zone enables caching for these paths + # (extensionless HTML isn't cached by Cloudflare by default); see + # apps/web/README.md. Other static assets (favicons, og-image, logo) get + # no Cache-Control header and keep Cloudflare's default cache behavior. + CACHE_HTML="public, max-age=300, stale-while-revalidate=3600" + # Non-HTML assets: copy the finite Astro output directly. Do not use # `aws s3 sync` at the bucket root because the same bucket also stores # the million-plus scraper-managed /documents/ and /extracted/ objects. find apps/web/dist -type f ! -name "*.html" -print0 | while IFS= read -r -d '' f; do rel="${f#apps/web/dist/}" - aws s3 cp "$f" "$R2_BUCKET/$rel" \ - --endpoint-url "$R2_ENDPOINT" + case "$rel" in + sitemap.xml|robots.txt|llms.txt) + aws s3 cp "$f" "$R2_BUCKET/$rel" \ + --cache-control "$CACHE_HTML" \ + --endpoint-url "$R2_ENDPOINT" + ;; + *) + aws s3 cp "$f" "$R2_BUCKET/$rel" \ + --endpoint-url "$R2_ENDPOINT" + ;; + esac done # HTML files: upload to extensionless keys to match canonical URLs. 
@@ -62,5 +79,6 @@ jobs: fi aws s3 cp "$f" "$R2_BUCKET/$key" \ --content-type "text/html; charset=utf-8" \ + --cache-control "$CACHE_HTML" \ --endpoint-url "$R2_ENDPOINT" done diff --git a/apps/web/README.md b/apps/web/README.md index 8656caf..37aa5b2 100644 --- a/apps/web/README.md +++ b/apps/web/README.md @@ -22,3 +22,28 @@ Verify before changing `robots.txt`: curl -I https://docxcorp.us/documents/000014a959f5225c658740fd7915cd50c5728c9cbe06c7d72d79a9708244ec1f.docx curl -I https://docxcorp.us/extracted/000014a959f5225c658740fd7915cd50c5728c9cbe06c7d72d79a9708244ec1f.txt ``` + +## HTML Edge Caching + +Cloudflare does not cache HTML by default. A zone-level Cache Rule enables caching +for HTML pages, the homepage, and the three metadata files (`sitemap.xml`, `robots.txt`, `llms.txt`). Static assets +(favicons, og-image, logo) keep Cloudflare's default static-asset cache behavior. + +The Cache Rule is set outside this repo: + +- Phase: `http_request_cache_settings` +- Expression: `http.host eq "docxcorp.us" and (http.request.uri.path eq "/" or http.request.uri.path in {"/dataset" "/classification" "/quality" "/download" "/types" "/topics" "/sitemap.xml" "/robots.txt" "/llms.txt"} or starts_with(http.request.uri.path, "/types/") or starts_with(http.request.uri.path, "/topics/"))` +- Action: `cache: true`, `edge_ttl.mode: bypass_by_default` (TTL driven by origin `Cache-Control`) + +The matching upload-side `Cache-Control` header (`public, max-age=300, stale-while-revalidate=3600`) +is set by `deploy-site.yml` on HTML uploads and on `sitemap.xml`/`robots.txt`/`llms.txt`. Keep the +two sides in sync: if you add a new cacheable HTML route, add it to the expression above. + +Verify after deploy: + +```bash +# Two requests to the same URL in a row: the first is usually MISS (or EXPIRED/HIT if the edge already holds a copy); the second should be HIT. +URL="https://docxcorp.us/dataset" +curl -sI "$URL" | grep -i "cf-cache-status\|cache-control" +curl -sI "$URL" | grep -i "cf-cache-status" +```