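# NOTE: the next lines appear to be GitHub web-page residue captured together
# with the workflow; they are kept as comments so the file remains valid YAML.
# Skip to content
#
# Docs Scraper
#
# Docs Scraper #582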

name: Docs Scraper
on:
  # Manual trigger.
  workflow_dispatch:
  schedule:
    # Run the workflow every night at 5:00 AM UTC, after nightly release and
    # docs update.
    - cron: "0 5 * * *"
  # Also run on pushes to `next`, but only when something under docs/ changed.
  push:
    branches:
      - next
    paths:
      - "docs/**"
jobs:
  # Re-scrapes the docs site into Typesense and fails loudly when the
  # resulting index is suspiciously small.
  docs-scraper:
    runs-on: ubuntu-latest
    env:
      # NOTE(review): no step visible in this job reads the AWS_* / NETLIFY_*
      # values or GITHUB_TOKEN. Confirm nothing else depends on them before
      # trimming this list; they are kept unchanged here.
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
      NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
    steps:
      - name: Checkout code
        # Pinned to a full commit SHA rather than a mutable tag.
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          # 0 fetches the complete history instead of a shallow clone.
          fetch-depth: 0
      - name: Reindex with Typesense docsearch-scraper
        env:
          # Fail the run if the scraper indexes fewer than this many records.
          # The docsearch-scraper container exits 0 even when its config is
          # broken and the index ends up nearly empty, so this guard turns a
          # silent regression (which happened with #22861 dropping the index
          # from ~12k to 48 records) into a loud CI failure.
          MIN_HITS: "5000"
          TYPESENSE_API_KEY: ${{ secrets.TYPESENSE_API_KEY }}
          TYPESENSE_HOST: ${{ secrets.TYPESENSE_HOST }}
        run: |
          set -euo pipefail

          # Stop early with a readable error when a secret resolved to an
          # empty string, instead of letting the scraper run without
          # credentials.
          for required in TYPESENSE_API_KEY TYPESENSE_HOST; do
            if [ -z "${!required:-}" ]; then
              echo "::error::$required is empty; cannot reach Typesense."
              exit 1
            fi
          done

          # Load the config in its own assignment: a missing file or malformed
          # JSON then aborts the step here. A failing $(...) placed directly
          # in the docker argument list would be ignored by `set -e` and the
          # scraper would start with an empty CONFIG.
          scraper_config=$(jq -c . docs/typesense.config.json)

          # `-e NAME` (no value) forwards the variable from this step's
          # environment, so the secrets are not expanded onto the command line.
          docker run \
            -e TYPESENSE_API_KEY \
            -e TYPESENSE_HOST \
            -e "TYPESENSE_PORT=443" \
            -e "TYPESENSE_PROTOCOL=https" \
            -e "CONFIG=$scraper_config" \
            typesense/docsearch-scraper:0.11.0 2>&1 | tee scraper.log

          # Take the last "Nb hits" figure the scraper printed; `|| true` keeps
          # a missing match from tripping `set -e` so the explicit check below
          # can report it.
          nb_hits=$(grep -oE 'Nb hits: *[0-9]+' scraper.log | tail -1 | grep -oE '[0-9]+' || true)
          if [ -z "$nb_hits" ]; then
            echo "::error::Could not parse 'Nb hits' from scraper output, assuming index is broken."
            exit 1
          fi
          echo "Indexed $nb_hits records (threshold: $MIN_HITS)"
          if [ "$nb_hits" -lt "$MIN_HITS" ]; then
            echo "::error::Indexed only $nb_hits records (expected at least $MIN_HITS). Search index is likely broken."
            exit 1
          fi

          # Log how many api-nr records are visible in the live index. The
          # docusaurus theme always prepends `default` to its contextual
          # docusaurus_tag filter, and no docusaurus page is stamped with
          # `default` (each carries its plugin-context tag instead), so this
          # facet count is effectively the count of indexed api-nr records.
          # Informational only: the count varies with aztec-nr content size.
          # Because it is informational, a failed query is reported as a
          # warning rather than failing a run whose reindex already succeeded.
          if api_hits=$(curl -fsS \
            "https://$TYPESENSE_HOST/collections/aztec-docs/documents/search" \
            -H "X-TYPESENSE-API-KEY: $TYPESENSE_API_KEY" \
            -G \
            --data-urlencode "q=*" \
            --data-urlencode "query_by=hierarchy.lvl0" \
            --data-urlencode "filter_by=docusaurus_tag:=[default]&&language:=en" \
            --data-urlencode "per_page=1" \
            | jq -r '.found'); then
            echo "api-nr records visible under docusaurus_tag:=[default]: $api_hits"
          else
            echo "::warning::Could not query the live index for the api-nr record count (informational only)."
          fi