Docs Scraper #582
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: "Docs Scraper" | |
| on: | |
| # Manual runs (Actions UI / API). | |
| workflow_dispatch: | |
| # Nightly at 05:00 UTC, timed to follow the nightly release and docs update. | |
| schedule: | |
| - cron: '0 5 * * *' | |
| # Pushes to `next` that touch anything under docs/. | |
| push: | |
| branches: [next] | |
| paths: ['docs/**'] | |
| jobs: | |
| docs-scraper: | |
| runs-on: "ubuntu-latest" | |
| # Job-level environment: every step of this job receives these values. | |
| env: | |
| AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}" | |
| AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}" | |
| GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" | |
| NETLIFY_AUTH_TOKEN: "${{ secrets.NETLIFY_AUTH_TOKEN }}" | |
| NETLIFY_SITE_ID: "${{ secrets.NETLIFY_SITE_ID }}" | |
| steps: | |
| # Check out the repository with full history (fetch-depth: 0 disables the | |
| # default shallow clone). The action is pinned to a commit SHA. | |
| # NOTE(review): this SHA appears to correspond to actions/checkout v4.2.2 — confirm. | |
| - name: "Checkout code" | |
| uses: "actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683" | |
| with: { fetch-depth: 0 } | |
| - name: Reindex with Typesense docsearch-scraper | |
| env: | |
| # Fail the run if the scraper indexes fewer than this many records. | |
| # The docsearch-scraper container exits 0 even when its config is broken | |
| # and the index ends up nearly empty, so this guard turns a silent | |
| # regression (which happened with #22861 dropping the index from | |
| # ~12k to 48 records) into a loud CI failure. | |
| MIN_HITS: "5000" | |
| TYPESENSE_API_KEY: ${{ secrets.TYPESENSE_API_KEY }} | |
| TYPESENSE_HOST: ${{ secrets.TYPESENSE_HOST }} | |
| run: | | |
| set -euo pipefail | |
| # Fail fast with a clear message when a secret is missing or empty, | |
| # instead of discovering it later through a scraper or curl failure. | |
| : "${TYPESENSE_API_KEY:?TYPESENSE_API_KEY is not set}" | |
| : "${TYPESENSE_HOST:?TYPESENSE_HOST is not set}" | |
| # Read the config in its own assignment: a missing or unreadable file | |
| # then aborts the step here (set -e) rather than launching the scraper | |
| # with an empty CONFIG. | |
| scraper_config=$(cat docs/typesense.config.json) | |
| # A bare `-e NAME` makes docker copy the variable from this step's | |
| # environment, which keeps the API key off the docker command line. | |
| # Output is mirrored to scraper.log so the record count can be parsed. | |
| docker run \ | |
| -e TYPESENSE_API_KEY \ | |
| -e TYPESENSE_HOST \ | |
| -e "TYPESENSE_PORT=443" \ | |
| -e "TYPESENSE_PROTOCOL=https" \ | |
| -e "CONFIG=$scraper_config" \ | |
| typesense/docsearch-scraper:0.11.0 2>&1 | tee scraper.log | |
| # Take the last "Nb hits: N" line of the log as the indexed record count. | |
| # `|| true` is intentional: a missing line is handled by the -z check below. | |
| nb_hits=$(grep -oE 'Nb hits: *[0-9]+' scraper.log | tail -1 | grep -oE '[0-9]+' || true) | |
| if [ -z "$nb_hits" ]; then | |
| echo "::error::Could not parse 'Nb hits' from scraper output, assuming index is broken." | |
| exit 1 | |
| fi | |
| echo "Indexed $nb_hits records (threshold: $MIN_HITS)" | |
| if [ "$nb_hits" -lt "$MIN_HITS" ]; then | |
| echo "::error::Indexed only $nb_hits records (expected at least $MIN_HITS). Search index is likely broken." | |
| exit 1 | |
| fi | |
| # Log how many api-nr records are visible in the live index. The | |
| # docusaurus theme always prepends `default` to its contextual | |
| # docusaurus_tag filter, and no docusaurus page is stamped with | |
| # `default` (each carries its plugin-context tag instead), so this | |
| # facet count is effectively the count of indexed api-nr records. | |
| # Informational only: the count varies with aztec-nr content size, and | |
| # a failed query is reported as a warning so it cannot fail a reindex | |
| # that already passed the MIN_HITS guard. `jq -e` exits non-zero when | |
| # `.found` is null or absent, so a malformed response is not logged | |
| # as a count. | |
| if api_hits=$(curl -fsS \ | |
| "https://$TYPESENSE_HOST/collections/aztec-docs/documents/search" \ | |
| -H "X-TYPESENSE-API-KEY: $TYPESENSE_API_KEY" \ | |
| -G \ | |
| --data-urlencode "q=*" \ | |
| --data-urlencode "query_by=hierarchy.lvl0" \ | |
| --data-urlencode "filter_by=docusaurus_tag:=[default]&&language:=en" \ | |
| --data-urlencode "per_page=1" \ | |
| | jq -er '.found'); then | |
| echo "api-nr records visible under docusaurus_tag:=[default]: $api_hits" | |
| else | |
| echo "::warning::Could not read the api-nr record count from the live index (informational only)." | |
| fi | |