Skip to content

Commit 34323b6

Browse files
committed
Fix broken cross-package Haddock links on hosted docs site
Fixes #601. Links to dependency packages (cardano-ledger-*, plutus-*, etc.) were relative paths pointing to directories that don't exist on the hosted site, resulting in 404s.

Adds scripts/fix-haddock-links.sh, invoked from the github-page workflow, which:
- Fetches the CHaP package index to classify packages as CHaP vs Hackage
- Auto-discovers all broken cross-package links in generated HTML
- Creates symlinks for versioned directory names (GHC-bundled and local)
- Rewrites links to known doc sites (prefix rules + exact overrides) or Hackage
- Validates all rewritten URLs with HTTP HEAD requests
- Replaces dead links with annotated plain text (tooltip shows package name)

This eliminates any ad-hoc hardcoded list, automatically handles new dependencies, and guarantees zero broken clickable links.
1 parent 779f539 commit 34323b6

2 files changed

Lines changed: 378 additions & 0 deletions

File tree

.github/workflows/github-page.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ jobs:
3939
mkdir website
4040
cabal haddock-project --output=./website --internal --foreign-libraries
4141
42+
- name: Fix cross-package Haddock links
43+
run: |
44+
./scripts/fix-haddock-links.sh ./website
45+
4246
- name: Build typedoc documentation
4347
run: |
4448
nix build .#wasm-typedoc

scripts/fix-haddock-links.sh

Lines changed: 374 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,374 @@
1+
#!/usr/bin/env bash
2+
# fix-haddock-links.sh
3+
#
4+
# Post-processes Haddock HTML generated by `cabal haddock-project` to fix
5+
# cross-package links. Without this, links to external dependencies are
6+
# broken 404s because the hosted site only contains docs for packages in
7+
# this repo.
8+
#
9+
# The script:
10+
# 1. Fetches the CHaP package index to classify packages
11+
# 2. Auto-discovers which packages have broken links
12+
# 3. Rewrites links to known doc sites or Hackage
13+
# 4. Validates all rewritten links with HTTP HEAD requests
14+
# 5. Replaces dead links (internal modules, missing docs) with annotated
15+
# plain text so there are zero broken clickable links
16+
#
17+
# Usage: ./scripts/fix-haddock-links.sh <website-directory>
18+
19+
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -euo pipefail

# The one required argument is the directory holding the generated site.
WEBSITE_DIR="${1:?Usage: $0 <website-directory>}"

[[ -d "$WEBSITE_DIR" ]] || {
  echo "Error: $WEBSITE_DIR is not a directory" >&2
  exit 1
}
27+
28+
# ============================================================================
29+
# Configuration: doc site mappings for CHaP packages
30+
# ============================================================================
31+
32+
# Base URLs of the documentation sites that host Haddocks for CHaP packages.
# All of these are constants; they are marked readonly so a later typo cannot
# silently redirect every rewritten link.
readonly LEDGER_DOCS="https://cardano-ledger.cardano.intersectmbo.org"
readonly BASE_DOCS="https://base.cardano.intersectmbo.org"
readonly PLUTUS_DOCS="https://plutus.cardano.intersectmbo.org/haddock/latest"
readonly CONSENSUS_DOCS="https://ouroboros-consensus.cardano.intersectmbo.org/haddocks"
readonly NETWORK_DOCS="https://ouroboros-network.cardano.intersectmbo.org"
readonly IO_SIM_DOCS="https://input-output-hk.github.io/io-sim"

# Prefix rules: first match wins.
# Each entry is "<package-name-prefix>|<doc-site-base-URL>".
PREFIX_RULES=(
  "cardano-ledger-|$LEDGER_DOCS"
  "ouroboros-consensus|$CONSENSUS_DOCS"
  "ouroboros-network|$NETWORK_DOCS"
  "plutus-|$PLUTUS_DOCS"
  "io-classes|$IO_SIM_DOCS"
  "io-sim|$IO_SIM_DOCS"
)
readonly PREFIX_RULES

# Exact overrides for CHaP packages whose names don't match a prefix rule.
# Keys are package names, values are doc-site base URLs.
declare -A EXACT_OVERRIDES=(
  # cardano-ledger repo
  [cardano-crypto-wrapper]="$LEDGER_DOCS"
  [cardano-data]="$LEDGER_DOCS"
  [cardano-protocol-tpraos]="$LEDGER_DOCS"
  [small-steps]="$LEDGER_DOCS"
  [byron-spec-chain]="$LEDGER_DOCS"
  [byron-spec-ledger]="$LEDGER_DOCS"
  [non-integral]="$LEDGER_DOCS"
  [vector-map]="$LEDGER_DOCS"
  # cardano-base repo
  [cardano-binary]="$BASE_DOCS"
  [cardano-crypto-class]="$BASE_DOCS"
  [cardano-crypto-praos]="$BASE_DOCS"
  [cardano-slotting]="$BASE_DOCS"
  [cardano-strict-containers]="$BASE_DOCS"
  [cardano-base]="$BASE_DOCS"
  [cardano-prelude]="$BASE_DOCS"
  [heapwords]="$BASE_DOCS"
  [measures]="$BASE_DOCS"
  # ouroboros-network repo
  [cardano-diffusion]="$NETWORK_DOCS"
  [network-mux]="$NETWORK_DOCS"
  [monoidal-synchronisation]="$NETWORK_DOCS"
  [typed-protocols]="$NETWORK_DOCS"
  [cardano-ping]="$NETWORK_DOCS"
  # io-sim repo
  [strict-checked-vars]="$IO_SIM_DOCS"
  [strict-sop-core]="$IO_SIM_DOCS"
  [strict-mvar]="$IO_SIM_DOCS"
)
readonly EXACT_OVERRIDES
81+
82+
#######################################
# Resolve a package name to the site that hosts its documentation.
# Globals:
#   EXACT_OVERRIDES (read) - associative array: package name -> base URL
#   PREFIX_RULES    (read) - array of "prefix|base URL"; first match wins
# Arguments:
#   $1 - package name as it appears in a link target
#   $2 - "yes" if the package is listed in the CHaP index, otherwise "no"
# Outputs:
#   Writes exactly one line to stdout:
#     - a doc-site base URL, when an exact override or a prefix rule matches
#     - NONE, for a CHaP package with no known doc site
#     - HACKAGE, for any other package
# Returns:
#   0
#######################################
resolve_url() {
  local pkg="$1"
  local is_chap="$2" # "yes" or "no"
  # Declared local so the loop below does not leak into the caller's scope.
  local rule prefix url

  # Exact override? The ${...+set} form tests for key presence without
  # re-parsing the subscript the way [[ -v "ARR[$key]" ]] does.
  if [[ -n "${EXACT_OVERRIDES[$pkg]+set}" ]]; then
    printf '%s\n' "${EXACT_OVERRIDES[$pkg]}"
    return 0
  fi

  # Prefix rule? Rules are tried in array order, so the first match wins.
  for rule in "${PREFIX_RULES[@]}"; do
    prefix="${rule%%|*}"
    url="${rule##*|}"
    if [[ "$pkg" == "${prefix}"* ]]; then
      printf '%s\n' "$url"
      return 0
    fi
  done

  # CHaP package with no known doc site
  if [[ "$is_chap" == "yes" ]]; then
    printf '%s\n' "NONE"
    return 0
  fi

  # Not on CHaP -> Hackage
  printf '%s\n' "HACKAGE"
}
112+
113+
# ============================================================================
114+
# Phase 0: Create symlinks for local packages with versioned directory names
115+
# ============================================================================
116+
117+
echo "Phase 0: Creating symlinks for versioned directories..."
symlink_count=0

# Build a set of existing short directory names
declare -A SHORT_DIRS=()
for dir in "$WEBSITE_DIR"/*/; do
  # Without nullglob an unmatched glob is left as the literal pattern; skip it.
  [[ -d "$dir" ]] || continue
  SHORT_DIRS["$(basename "$dir")"]=1
done

# a) Symlink versioned-inplace dirs to short names (local cabal packages)
#    e.g. cardano-api-10.26.0.0-inplace -> cardano-api
for dir in "$WEBSITE_DIR"/*/; do
  [[ -d "$dir" ]] || continue
  dir_name="$(basename "$dir")"
  if [[ "$dir_name" =~ ^(.+)-[0-9]+\.[0-9]+.*-inplace$ ]]; then
    short_name="${BASH_REMATCH[1]}"
    link_path="$WEBSITE_DIR/$short_name"
    # -L is checked too: a dangling symlink fails -e but still makes ln -s
    # fail, which would abort the whole script under `set -e`.
    if [[ ! -e "$link_path" && ! -L "$link_path" ]] \
      && [[ ! "$dir_name" =~ -inplace-.+ ]]; then
      ln -s "$dir_name" "$link_path"
      symlink_count=$((symlink_count + 1))
    fi
  fi
done

# b) Discover versioned-hash link targets that match existing short-name dirs
#    e.g. links to ../base-4.20.2.0-f074/ when ./website/base/ exists
#    Create symlinks so these links resolve locally
#
# grep exits 1 when nothing matches; under `set -e -o pipefail` that would
# abort the script, so "no matches" is explicitly treated as an empty result.
VERSIONED_TARGETS=$(
  {
    grep -rohP 'href="\.\./(\.\./)?\K[a-zA-Z][a-zA-Z0-9_.-]*(?=/)' "$WEBSITE_DIR" 2>/dev/null \
      || true
  } \
    | { grep -P '^.+-[0-9]+\.[0-9]+' || true; } \
    | sort -u
)

while IFS= read -r target; do
  [[ -n "$target" ]] || continue
  link_path="$WEBSITE_DIR/$target"
  # Already exists (as a real entry or as a symlink, even a dangling one)?
  if [[ -e "$link_path" || -L "$link_path" ]]; then
    continue
  fi
  # Strip version+hash: "base-4.20.2.0-f074" -> "base"
  # The capture group is greedy, so everything from the LAST
  # "-DIGITS.DIGIT" occurrence onward is stripped.
  if [[ "$target" =~ ^(.+)-[0-9]+\.[0-9] ]]; then
    short_name="${BASH_REMATCH[1]}"
  else
    short_name="$target"
  fi
  if [[ -n "${SHORT_DIRS[$short_name]+set}" ]]; then
    ln -s "$short_name" "$link_path"
    symlink_count=$((symlink_count + 1))
  fi
done <<< "$VERSIONED_TARGETS"

echo " Created $symlink_count symlinks"
163+
164+
# ============================================================================
165+
# Phase 1: Fetch CHaP package list
166+
# ============================================================================
167+
168+
echo "Phase 1: Fetching CHaP package index..."
CHAP_PKGS_FILE=$(mktemp)
trap 'rm -f "$CHAP_PKGS_FILE"' EXIT

# The top-level path component of every entry in the index tarball is a
# package name.
#   -f  makes an HTTP error a curl failure instead of feeding an error page
#       to tar
#   -S  keeps curl's error message although -s hides the progress meter
#   tar -f - reads the archive from stdin explicitly rather than relying on
#       the build-time default device
if ! curl -fsSL https://chap.intersectmbo.org/01-index.tar.gz \
  | tar -tzf - \
  | grep -oP '^[^/]+' \
  | sort -u > "$CHAP_PKGS_FILE"; then
  echo "Error: could not fetch or parse the CHaP package index" >&2
  exit 1
fi

chap_total=$(wc -l < "$CHAP_PKGS_FILE" | tr -d ' ')
echo " Fetched $chap_total CHaP packages"

# Load into associative array for O(1) lookup
declare -A CHAP_SET=()
while IFS= read -r pkg; do
  [[ -n "$pkg" ]] || continue
  CHAP_SET["$pkg"]=1
done < "$CHAP_PKGS_FILE"
185+
186+
# ============================================================================
187+
# Phase 2: Discover local packages
188+
# ============================================================================
189+
190+
echo "Phase 2: Discovering local packages..."

# Every directory directly under the site root (including the symlinks made
# earlier) counts as locally hosted documentation.
declare -A LOCAL_SET
for entry in "$WEBSITE_DIR"/*/; do
  if [[ -d "$entry" ]]; then
    entry="${entry%/}"               # drop the trailing slash from the glob
    LOCAL_SET["${entry##*/}"]=1      # keep only the last path component
  fi
done
echo " Found ${#LOCAL_SET[@]} local directories"
198+
199+
# ============================================================================
200+
# Phase 3: Discover broken link targets
201+
# ============================================================================
202+
203+
echo "Phase 3: Discovering cross-package link targets..."

# Collect the first path component of every href that starts with ../ or
# ../../. grep exits 1 when nothing matches; under `set -e -o pipefail` that
# would abort the script silently, so an empty result is tolerated here and
# simply leaves the later phases with nothing to do.
DISCOVERED_PKGS=$(
  {
    grep -rohP 'href="\.\./(\.\./)?\K[a-zA-Z][a-zA-Z0-9_.-]*(?=/)' "$WEBSITE_DIR" 2>/dev/null \
      || true
  } | sort -u
)

# grep -c prints 0 but exits 1 for an empty list; `|| true` keeps the count.
discovered_total=$(grep -c . <<< "$DISCOVERED_PKGS" || true)
echo " Found $discovered_total unique link targets"
207+
208+
# ============================================================================
209+
# Phase 4 + 5: Classify, resolve, and apply rewrites
210+
# ============================================================================
211+
212+
echo "Phase 4: Classifying and rewriting links..."

SED_ARGS=()          # sed -e expressions, two per rewritten package
UNMAPPED_CHAP=()     # CHaP packages without a known doc site
REWRITTEN_HACKAGE=() # packages redirected to Hackage
REWRITTEN_DOCSITE=() # packages redirected to a known doc site
skipped_count=0

while IFS= read -r pkg; do
  [[ -n "$pkg" ]] || continue

  # Skip local packages (directory already exists)
  if [[ -v "LOCAL_SET[$pkg]" ]]; then
    skipped_count=$((skipped_count + 1))
    continue
  fi

  # Determine if CHaP package
  is_chap="no"
  if [[ -v "CHAP_SET[$pkg]" ]]; then
    is_chap="yes"
  fi

  url=$(resolve_url "$pkg" "$is_chap")

  case "$url" in
    NONE)
      # CHaP package with no doc site — will be made unclickable
      UNMAPPED_CHAP+=("$pkg")
      continue
      ;;
    HACKAGE)
      target="https://hackage.haskell.org/package/${pkg}/docs/"
      REWRITTEN_HACKAGE+=("$pkg")
      ;;
    *)
      target="${url}/${pkg}/"
      REWRITTEN_DOCSITE+=("$pkg")
      ;;
  esac

  # Discovered names may contain dots (e.g. versioned targets such as
  # base-4.20.2.0-f074); escape them so sed matches a literal dot instead of
  # any character.
  pkg_re="${pkg//./\\.}"

  SED_ARGS+=(-e "s|href=\"\\.\\./${pkg_re}/|href=\"${target}|g")
  SED_ARGS+=(-e "s|href=\"\\.\\./\\.\\./${pkg_re}/|href=\"${target}|g")
done <<< "$DISCOVERED_PKGS"

echo " Local (skipped): $skipped_count"
echo " Rewritten (doc site): ${#REWRITTEN_DOCSITE[@]}"
echo " Rewritten (Hackage): ${#REWRITTEN_HACKAGE[@]}"
echo " Unmapped CHaP: ${#UNMAPPED_CHAP[@]}"

if [[ ${#UNMAPPED_CHAP[@]} -gt 0 ]]; then
  echo " Unmapped CHaP packages (will be made unclickable):"
  for pkg in "${UNMAPPED_CHAP[@]}"; do
    echo " - $pkg"
  done
fi

# Apply URL rewrites
if [[ ${#SED_ARGS[@]} -gt 0 ]]; then
  echo "Phase 5: Applying link rewrites..."
  # -r: do not run sed at all when find yields no HTML files (sed without a
  # file operand would fail and abort the script).
  find "$WEBSITE_DIR" -name '*.html' -print0 \
    | xargs -0 -r -P "$(nproc)" sed -i "${SED_ARGS[@]}"
  echo " Done"
fi
271+
272+
# ============================================================================
273+
# Phase 5b: Make unmapped CHaP links unclickable
274+
# ============================================================================
275+
276+
if (( ${#UNMAPPED_CHAP[@]} > 0 )); then
  echo "Phase 5b: Making unmapped CHaP links unclickable..."
  UNMAP_SED_ARGS=()
  for pkg in "${UNMAPPED_CHAP[@]}"; do
    # <a href="../PKG/...">TEXT</a> becomes
    # <span class="dead-link" title="...">TEXT</span>
    span_open="<span class=\"dead-link\" title=\"No hosted documentation available for ${pkg}\">"
    # One expression per relative prefix: ../ and ../../
    for up in '\.\./' '\.\./\.\./'; do
      UNMAP_SED_ARGS+=(-e "s|<a href=\"${up}${pkg}/[^\"]*\"[^>]*>\\([^<]*\\)</a>|${span_open}\\1</span>|g")
    done
  done
  find "$WEBSITE_DIR" -name '*.html' -print0 | xargs -0 -P "$(nproc)" sed -i "${UNMAP_SED_ARGS[@]}"
  echo " Done"
fi
288+
289+
# ============================================================================
290+
# Phase 6: Post-process validation
291+
# ============================================================================
292+
293+
echo "Phase 6: Validating rewritten links..."

URLS_FILE=$(mktemp)
DEAD_URLS_FILE=$(mktemp)
trap 'rm -f "$CHAP_PKGS_FILE" "$URLS_FILE" "$DEAD_URLS_FILE"' EXIT

# Extract all unique external URLs ending in .html (stripping #fragment).
# grep exits 1 when nothing matches; tolerate that so the url_count guard
# below is actually reachable under `set -e -o pipefail`.
{
  grep -rohP 'href="\Khttps://[^"#]+\.html' "$WEBSITE_DIR" 2>/dev/null || true
} | sort -u > "$URLS_FILE"
url_count=$(wc -l < "$URLS_FILE" | tr -d ' ')
echo " Found $url_count unique external URLs to validate"

# Validate each URL with a HEAD request; a URL is recorded as dead unless the
# response status is 200, 301 or 302. Note that this includes timeouts and
# other transport failures, for which curl reports status 000.
if [[ $url_count -gt 0 ]]; then
  # The URL is handed to the inner shell as the positional parameter $1. It is
  # never spliced into the script text, so characters such as $, ` or quotes
  # in a URL cannot be interpreted as shell code.
  # -d '\n' makes xargs treat each input line as one literal argument.
  # shellcheck disable=SC2016 # single quotes intentional: expansions evaluated by inner sh -c
  xargs -d '\n' -n 1 -P 16 sh -c '
    code=$(curl -sI -o /dev/null -w "%{http_code}" --connect-timeout 5 --max-time 10 "$1")
    case "$code" in
      200 | 301 | 302) ;;
      *) printf "%s\n" "$1" ;;
    esac
  ' _ < "$URLS_FILE" > "$DEAD_URLS_FILE" 2>/dev/null
fi

dead_count=$(wc -l < "$DEAD_URLS_FILE" | tr -d ' ')
valid_count=$((url_count - dead_count))
echo " Valid: $valid_count"
echo " Dead: $dead_count"
318+
319+
# ============================================================================
320+
# Phase 7: Replace dead links with annotated plain text
321+
# ============================================================================
322+
323+
# Escape a literal string for use inside a sed basic-regex pattern that is
# delimited by '|'.
# Arguments: $1 - literal text
# Outputs:   escaped text on stdout (no trailing newline)
_escape_sed_pattern() {
  printf '%s' "$1" | sed -e 's/[][\.*^$|]/\\&/g'
}

# Escape a literal string for use inside the replacement part of a sed
# s|...|...| command.
# Arguments: $1 - literal text
# Outputs:   escaped text on stdout (no trailing newline)
_escape_sed_replacement() {
  printf '%s' "$1" | sed -e 's/[\&|]/\\&/g'
}

# Extract the package name from a documentation URL.
#   Hackage pattern:  https://hackage.haskell.org/package/PACKAGE/docs/Module.html
#   Doc site pattern: https://SITE/PACKAGE/Module.html (second-to-last component)
# Arguments: $1 - URL ending in a file name
# Outputs:   package name on stdout
_dead_url_package() {
  local url="$1"
  local rest
  if [[ "$url" == *"hackage.haskell.org/package/"* ]]; then
    rest="${url#*package/}"
    printf '%s\n' "${rest%%/*}"
  else
    rest="${url%/*}"
    printf '%s\n' "${rest##*/}"
  fi
}

if [[ $dead_count -gt 0 ]]; then
  echo "Phase 7: Replacing dead links with annotated text..."
  echo " Dead URLs:"

  DEAD_SED_ARGS=()
  while IFS= read -r dead_url; do
    [[ -n "$dead_url" ]] || continue
    pkg_name=$(_dead_url_package "$dead_url")
    echo " - $dead_url ($pkg_name)"

    # Escape every regex metacharacter (not only dots) so the URL is matched
    # literally, and escape the package name for the replacement side.
    escaped_url=$(_escape_sed_pattern "$dead_url")
    escaped_pkg=$(_escape_sed_replacement "$pkg_name")

    # Replace <a href="DEAD_URL...">TEXT</a> with dead-link span
    # The href may have a #fragment appended
    DEAD_SED_ARGS+=(-e "s|<a href=\"${escaped_url}[^\"]*\"[^>]*>\\([^<]*\\)</a>|<span class=\"dead-link\" title=\"From ${escaped_pkg} — documentation not available at the expected URL\">\\1</span>|g")
  done < "$DEAD_URLS_FILE"

  if [[ ${#DEAD_SED_ARGS[@]} -gt 0 ]]; then
    find "$WEBSITE_DIR" -name '*.html' -print0 \
      | xargs -0 -r -P "$(nproc)" sed -i "${DEAD_SED_ARGS[@]}"
  fi
fi

# Inject CSS for dead-link styling into all HTML files.
# dead-link spans are produced both for dead URLs and for unmapped CHaP
# packages, so the stylesheet is needed when either set is non-empty.
if [[ $dead_count -gt 0 || ${#UNMAPPED_CHAP[@]} -gt 0 ]]; then
  echo " Injecting dead-link CSS..."
  # The address skips a </head> line that already carries the rule, so running
  # the script twice on the same tree does not duplicate the style block.
  find "$WEBSITE_DIR" -name '*.html' -print0 | xargs -0 -r -P "$(nproc)" sed -i \
    '/\.dead-link {/!s|</head>|<style>.dead-link { border-bottom: 1px dotted #888; color: #888; cursor: help; }</style></head>|'

  echo " Done"
fi
360+
361+
# ============================================================================
362+
# Summary
363+
# ============================================================================
364+
365+
# Final report: one here-document instead of a run of echo calls.
cat <<EOF

=== fix-haddock-links summary ===
 Packages discovered in HTML: $discovered_total
 Local (skipped): $skipped_count
 Rewritten to doc sites: ${#REWRITTEN_DOCSITE[@]}
 Rewritten to Hackage: ${#REWRITTEN_HACKAGE[@]}
 Unmapped CHaP (unclickable): ${#UNMAPPED_CHAP[@]}
 Links validated: $url_count
 Dead links (unclickable): $dead_count
=================================
EOF

0 commit comments

Comments
 (0)