Skip to content

Commit d2b818a

Browse files
authored
Merge pull request #146 from KubaO/staging
Use check_links.py for all link checking.
2 parents e2eafeb + b312565 commit d2b818a

5 files changed

Lines changed: 93 additions & 46 deletions

File tree

.github/workflows/checks.yml

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,25 @@ jobs:
3535
- name: Build with Jekyll
3636
run: bundle exec jekyll build
3737
working-directory: ./docs
38-
- name: Check online links (lychee)
39-
uses: lycheeverse/lychee-action@v2
40-
with:
41-
args: >-
42-
--offline --include-fragments
43-
--fallback-extensions html
44-
--index-files 'index.html,.'
45-
--root-dir ${{ github.workspace }}/docs/_site
46-
./_site
47-
workingDirectory: ./docs
48-
fail: true
49-
- name: Set up Python for offline link check
38+
- name: Set up Python for link checks
5039
uses: actions/setup-python@v5
5140
with:
5241
python-version: '3.14'
5342
cache: 'pip'
5443
- name: Install Python deps
5544
run: pip install -r requirements.txt
45+
- name: Check online links (check_links.py)
46+
# `--fallback-extensions html` mirrors what GitHub Pages does at request time:
47+
# an extensionless URL like `/FAQ` is served as `/FAQ.html`. This workflow's
48+
# Jekyll build runs without --baseurl (no Pages prefix), so no --base-path is
49+
# needed -- contrast with jekyll-gh-pages.yml.
50+
run: >-
51+
python scripts/check_links.py
52+
--offline --include-fragments
53+
--fallback-extensions html
54+
--index-files 'index.html,.'
55+
--root-dir docs/_site
56+
docs/_site
5657
- name: Check offline links (check_links.py)
5758
run: >-
5859
python scripts/check_links.py

.github/workflows/jekyll-gh-pages.yml

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -57,38 +57,29 @@ jobs:
5757
env:
5858
JEKYLL_ENV: production
5959
PAGES_REPO_NWO: "${{ github.repository }}"
60-
- name: Check online links (lychee)
61-
uses: lycheeverse/lychee-action@v2
62-
with:
63-
# --remap matches the fully-resolved file URI (not the raw href), so the pattern
64-
# must include the file:// scheme and --root-dir prefix. The (/|$) tail handles
65-
# both `/twinBASIC-docs/page` and bare `/twinBASIC-docs` — lychee strips trailing
66-
# slashes before remap, so we can't require one in the pattern.
67-
#
68-
# `--fallback-extensions html` mirrors what GitHub Pages does at request time:
69-
# an extensionless URL like `/FAQ` is served as `/FAQ.html`. Without the flag
70-
# lychee would flag every pretty permalink on the site.
71-
#
72-
# Lychee, not the Python checker, handles the online tree here because the
73-
# `--remap` flag isn't implemented by scripts/check_links.py; the offline tree
74-
# below has all baseurl prefixes already stripped by the offlinify plugin and
75-
# so doesn't need it.
76-
args: >-
77-
--offline --include-fragments
78-
--fallback-extensions html
79-
--index-files 'index.html,.'
80-
--remap '^file://${{ github.workspace }}/docs/_site${{ steps.pages.outputs.base_path }}(/|$) file://${{ github.workspace }}/docs/_site/'
81-
--root-dir ${{ github.workspace }}/docs/_site
82-
./_site
83-
workingDirectory: ./docs
84-
fail: true
85-
- name: Set up Python for offline link check
60+
- name: Set up Python for link checks
8661
uses: actions/setup-python@v5
8762
with:
8863
python-version: '3.14'
8964
cache: 'pip'
9065
- name: Install Python deps
9166
run: pip install -r requirements.txt
67+
- name: Check online links (check_links.py)
68+
# `--fallback-extensions html` mirrors what GitHub Pages does at request time:
69+
# an extensionless URL like `/FAQ` is served as `/FAQ.html`. Without the flag
70+
# every pretty permalink on the site would look broken.
71+
#
72+
# `--base-path` strips the Pages baseurl (e.g. `/twinBASIC-docs`) from absolute
73+
# URLs before resolving against `--root-dir`. Equivalent to the `--remap` regex
74+
# that lychee used in earlier iterations of this step.
75+
run: >-
76+
python scripts/check_links.py
77+
--offline --include-fragments
78+
--fallback-extensions html
79+
--index-files 'index.html,.'
80+
--base-path '${{ steps.pages.outputs.base_path }}'
81+
--root-dir docs/_site
82+
docs/_site
9283
- name: Check offline links (check_links.py)
9384
# Strict check on `_site-offline/`: every link must resolve to an actual file
9485
# under `file://`, with no extension fallback. Catches relative links in

scripts/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__pycache__/
-13.4 KB
Binary file not shown.

scripts/check_links.py

Lines changed: 62 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,42 @@ def extract_fragment_ids(html_path):
119119
return ids
120120

121121

122-
def resolve(href, source_dir_str, source_str, root_str):
122+
def _normalize_base_path(s):
123+
"""Coerce a base-path arg into the canonical '/prefix' form (leading
124+
slash, no trailing slash). Empty input maps to empty string."""
125+
if not s:
126+
return ""
127+
s = s.strip().rstrip("/")
128+
if not s:
129+
return ""
130+
if not s.startswith("/"):
131+
s = "/" + s
132+
return s
133+
134+
135+
def _strip_base_path(path_str, base_path):
136+
"""Lop a base-path prefix off an absolute URL path, if it matches.
137+
138+
A Jekyll build with `--baseurl /twinBASIC-docs` produces hrefs like
139+
'/twinBASIC-docs/foo' that resolve, in the deployed site, to '/foo'
140+
under the actual root. This mirrors lychee's `--remap` regex but as
141+
a clean prefix strip:
142+
143+
'/twinBASIC-docs/foo' -> '/foo' (prefix + /...)
144+
'/twinBASIC-docs' -> '/' (bare prefix, treat as root)
145+
'/twinBASIC-docs-other' -> unchanged (only strip on '/' or end-of-string)
146+
'/foo' -> unchanged (no prefix match)
147+
"""
148+
if not base_path:
149+
return path_str
150+
if path_str == base_path:
151+
return "/"
152+
if path_str.startswith(base_path + "/"):
153+
return path_str[len(base_path):]
154+
return path_str
155+
156+
157+
def resolve(href, source_dir_str, source_str, root_str, base_path=""):
123158
"""Lexically resolve href -> (normalized_target_str, is_dir_link, fragment).
124159
Returns None for schemes/netlocs we skip. Uses only string ops — no
125160
filesystem syscalls (Path.resolve is ~110us per call on Windows).
@@ -129,6 +164,10 @@ def resolve(href, source_dir_str, source_str, root_str):
129164
for resolution: 'foo/' must resolve as a directory (try index files),
130165
while 'foo' falls through to fallback extensions ('foo.html') if no
131166
file/dir 'foo' exists.
167+
168+
base_path is an absolute-URL prefix to strip before resolving against
169+
root_str -- e.g. '/twinBASIC-docs' to handle a Jekyll --baseurl build.
170+
Only applied to absolute URLs; relative paths are unaffected.
132171
"""
133172
if "#" in href:
134173
path_part, frag = href.split("#", 1)
@@ -151,6 +190,7 @@ def resolve(href, source_dir_str, source_str, root_str):
151190
is_dir_link = path_str.endswith("/") or path_str.endswith("/.")
152191

153192
if path_str.startswith("/"):
193+
path_str = _strip_base_path(path_str, base_path)
154194
target = os.path.normpath(os.path.join(root_str, path_str.lstrip("/")))
155195
else:
156196
target = os.path.normpath(os.path.join(source_dir_str, path_str))
@@ -242,6 +282,15 @@ def _build_parser():
242282
"as broken."
243283
),
244284
)
285+
ap.add_argument(
286+
"--base-path", default="", metavar="PREFIX",
287+
help=(
288+
"URL-path prefix to strip from absolute URLs before resolving "
289+
"against --root-dir. Matches a Jekyll build's --baseurl, e.g. "
290+
"'/twinBASIC-docs'. Equivalent to a constrained form of "
291+
"lychee's --remap. Empty by default (no stripping)."
292+
),
293+
)
245294
ap.add_argument(
246295
"--threads", type=int, default=os.cpu_count() or 4, metavar="N",
247296
help="Worker threads for HTML parsing. Default: CPU count.",
@@ -292,6 +341,7 @@ def main():
292341
root_str = str(args.root_dir.resolve()) if args.root_dir else ""
293342
fallback_exts = [e for e in args.fallback_extensions.split(",") if e]
294343
index_files = [e for e in args.index_files.split(",") if e]
344+
base_path = _normalize_base_path(args.base_path)
295345

296346
t0 = time.perf_counter()
297347
html_files = _collect_html_files(args.inputs)
@@ -317,7 +367,7 @@ def main():
317367
rk = (src_dir, href)
318368
r = resolution_cache.get(rk, ...)
319369
if r is ...:
320-
r = resolve(href, src_dir, src_str, root_str)
370+
r = resolve(href, src_dir, src_str, root_str, base_path)
321371
resolution_cache[rk] = r
322372
if r is None:
323373
continue
@@ -346,24 +396,28 @@ def main():
346396
fragment_cache[f] = ids
347397
t_fragments = time.perf_counter()
348398

349-
broken = []
350-
for (target_str, is_dir, frag), sources in unique_checks.items():
399+
broken = [] # one entry per occurrence; for human-readable report
400+
broken_keys = set() # unique broken (target, is_dir, frag) keys
401+
for key, sources in unique_checks.items():
402+
target_str, is_dir, frag = key
351403
resolved = target_resolution.get((target_str, is_dir))
352404
if resolved is None:
405+
broken_keys.add(key)
353406
for src_str, href in sources:
354407
broken.append((src_str, href, "target not found"))
355408
continue
356409
if frag and args.include_fragments:
357410
ids = fragment_cache.get(resolved, set())
358411
if frag not in ids:
412+
broken_keys.add(key)
359413
for src_str, href in sources:
360414
broken.append((src_str, href, f"fragment #{frag} not found"))
361415
t_done = time.perf_counter()
362416

363417
total = len(occurrences)
364418
unique = len(unique_checks)
365-
errors = len(broken)
366-
ok = unique - errors
419+
errors_unique = len(broken_keys)
420+
ok_unique = unique - errors_unique
367421

368422
if broken:
369423
# Group by source file, lychee-style.
@@ -378,8 +432,8 @@ def main():
378432

379433
elapsed = t_done - t0
380434
print(
381-
f"Checked {total} links ({unique} unique) in {elapsed:.3f}s "
382-
f"-- {ok} OK, {errors} errors"
435+
f"Checked {total} occurrences ({unique} unique) in {elapsed:.3f}s "
436+
f"-- {ok_unique} OK, {errors_unique} broken"
383437
)
384438

385439
if args.verbose:

0 commit comments

Comments
 (0)