Skip to content

Commit 0edb4b7

Browse files
Enhance SEO checks with live feed validation and schedule support
1 parent f1d1596 commit 0edb4b7

2 files changed

Lines changed: 145 additions & 2 deletions

File tree

.github/workflows/seo-check.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ on:
77
pull_request:
88
branches:
99
- main
10+
schedule:
11+
- cron: "17 3 * * 1"
1012
workflow_dispatch:
1113

1214
permissions:
@@ -33,8 +35,8 @@ jobs:
3335
run: bash ./scripts/seo_lint.sh .
3436

3537
seo-live:
36-
name: Live robots and sitemap check
37-
if: github.event_name == 'workflow_dispatch'
38+
name: Live robots, sitemap, and feed check
39+
if: github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
3840
needs: seo-lint
3941
runs-on: ubuntu-latest
4042
timeout-minutes: 10

scripts/seo_lint.sh

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ cd "$ROOT_DIR"
66

77
failures=0
88
warnings=0
9+
site_base_url=""
910

1011
err() {
1112
printf '[SEO FAIL] %s\n' "$1"
@@ -51,6 +52,19 @@ extract_link_href() {
5152
' "$file"
5253
}
5354

55+
extract_link_href_with_type() {
56+
local file="$1"
57+
local rel="$2"
58+
local type="$3"
59+
SEO_LINK_REL="$rel" SEO_LINK_TYPE="$type" perl -0777 -ne '
60+
my $rel = quotemeta $ENV{SEO_LINK_REL};
61+
my $type = quotemeta $ENV{SEO_LINK_TYPE};
62+
if (/<link\b(?=[^>]*\brel=["\x27]$rel["\x27])(?=[^>]*\btype=["\x27]$type["\x27])(?=[^>]*\bhref=["\x27]([^"\x27]+)["\x27])[^>]*>/is) {
63+
print $1;
64+
}
65+
' "$file"
66+
}
67+
5468
extract_title() {
5569
local file="$1"
5670
perl -0777 -ne '
@@ -78,6 +92,16 @@ extract_robots_sitemap_urls() {
7892
sed -nE 's/^[[:space:]]*[Ss]itemap:[[:space:]]*(https:\/\/[^[:space:]]+)[[:space:]]*$/\1/p' "$file"
7993
}
8094

95+
extract_feed_id() {
96+
local file="$1"
97+
perl -0777 -ne 'if (/<feed\b.*?<id>\s*([^<]+)\s*<\/id>/is) { print $1; }' "$file"
98+
}
99+
100+
extract_feed_self_href() {
101+
local file="$1"
102+
perl -0777 -ne 'if (/<link\b(?=[^>]*\brel=["\x27]self["\x27])(?=[^>]*\bhref=["\x27]([^"\x27]+)["\x27])[^>]*>/is) { print $1; }' "$file"
103+
}
104+
81105
strip_wrapping_quotes() {
82106
local value="$1"
83107

@@ -117,6 +141,58 @@ is_repository_doc_markdown() {
117141
return 1
118142
}
119143

144+
# Print the given URL, appending "/" when it is non-empty and does not
# already end in one. An empty argument is echoed back unchanged.
#
# Arguments:
#   $1 - URL to normalise (may be empty)
# Output:
#   The normalised URL on stdout with no trailing newline.
ensure_trailing_slash() {
  local url="$1"

  if [[ -n "$url" && "$url" != */ ]]; then
    url+="/"
  fi

  printf '%s' "$url"
}
154+
155+
# Print the "directory" part of a URL, always terminated by "/".
#
#   https://example.com/a/b.html -> https://example.com/a/
#   https://example.com/a/       -> https://example.com/a/
#   https://example.com          -> https://example.com/
#
# The last case is why this helper exists: stripping the final "/*" from a
# bare origin would cut into the "//" of the scheme and leave "https://".
_seo_url_directory_base() {
  local url="$1"
  local origin_only_re='^[A-Za-z][A-Za-z0-9+.-]*://[^/]+$'

  if [[ "$url" == */ ]]; then
    printf '%s' "$url"
  elif [[ "$url" =~ $origin_only_re ]]; then
    printf '%s/' "$url"
  else
    printf '%s/' "${url%/*}"
  fi
}

# Determine the site base URL and store it in the global site_base_url.
#
# Sources, in order of precedence:
#   1. SEO_LINT_SITE_BASE_URL from the environment (a trailing slash is added
#      if missing).
#   2. The directory of the first Sitemap URL listed in robots.txt.
#   3. The directory of the canonical link in index.html.
#
# Reports an error through err when no source yields a value, or when the
# resulting URL does not start with https://.
derive_site_base_url() {
  local configured_url="${SEO_LINT_SITE_BASE_URL:-}"
  local canonical_url
  local sitemap_urls=()

  if [[ -n "$configured_url" ]]; then
    site_base_url="$(ensure_trailing_slash "$configured_url")"
  elif [[ -f robots.txt ]]; then
    mapfile -t sitemap_urls < <(extract_robots_sitemap_urls robots.txt)
    if [[ "${#sitemap_urls[@]}" -gt 0 ]]; then
      site_base_url="$(_seo_url_directory_base "${sitemap_urls[0]}")"
    fi
  fi

  # Fall back to the home page's canonical link when nothing above produced
  # a value (including a robots.txt without any Sitemap line).
  if [[ -z "$site_base_url" && -f index.html ]]; then
    canonical_url="$(extract_link_href index.html canonical)"
    if [[ -n "$canonical_url" ]]; then
      site_base_url="$(_seo_url_directory_base "$canonical_url")"
    fi
  fi

  if [[ -z "$site_base_url" ]]; then
    err "Unable to determine site base URL from SEO_LINT_SITE_BASE_URL, robots.txt, or index.html"
    return
  fi

  [[ "$site_base_url" =~ ^https:// ]] || err "Site base URL must start with https:// ($site_base_url)"
}
187+
188+
# Report an error when a URL does not start with the site base URL.
#
# Arguments:
#   $1 - human-readable label used in the error message
#   $2 - URL to check
# Globals:
#   site_base_url - read; when empty the check is skipped.
# Returns:
#   0 when the check is skipped or the URL is inside the base; otherwise the
#   status of err.
validate_url_within_site_base() {
  local label="$1"
  local url="$2"

  # Skipping is not a failure: return 0 explicitly. A bare `return` here
  # would hand the failed test's status (1) to the caller.
  [[ -n "$site_base_url" ]] || return 0
  [[ "$url" == "$site_base_url"* ]] || err "$label must stay within site base URL ($site_base_url): $url"
}
195+
120196
should_check_http() {
121197
local setting="${SEO_LINT_CHECK_HTTP:-auto}"
122198

@@ -203,6 +279,7 @@ validate_robots_txt() {
203279

204280
for sitemap_url in "${sitemap_urls[@]}"; do
205281
[[ "$sitemap_url" =~ ^https:// ]] || err "robots.txt: Sitemap URL must start with https:// ($sitemap_url)"
282+
validate_url_within_site_base "robots.txt sitemap URL" "$sitemap_url"
206283
done
207284

208285
if ! should_check_http; then
@@ -235,6 +312,7 @@ validate_sitemap_xml() {
235312

236313
for url in "${urls[@]}"; do
237314
[[ "$url" =~ ^https:// ]] || err "sitemap.xml: URL must start with https:// ($url)"
315+
validate_url_within_site_base "sitemap.xml URL" "$url"
238316
done
239317

240318
if ! should_check_http; then
@@ -250,6 +328,63 @@ validate_sitemap_xml() {
250328
done
251329
}
252330

331+
# Validate feed.xml in the current directory and, when HTTP checks are
# enabled, the live feed URL.
#
# Checks performed (each failure is reported through err):
#   - feed.xml exists
#   - <id> is present, uses https:// and lies within the site base URL
#   - the rel="self" link is present, uses https://, lies within the site
#     base URL and equals <site base>feed.xml
#   - <id> and the self link are identical
#   - at least one <entry> element exists
#   - index.html (if present) has an Atom alternate link equal to the self
#     link
#   - the expected feed URL responds successfully (only when
#     should_check_http allows it)
#
# Globals:
#   site_base_url - read; when empty, every check that needs the expected
#                   feed URL is skipped instead of being run against the
#                   relative string "feed.xml".
validate_feed_xml() {
  local feed_id
  local feed_self_url
  local root_feed_link
  local expected_feed_url=""

  [[ -f feed.xml ]] || {
    err "Missing feed.xml at repo root"
    return
  }

  feed_id="$(extract_feed_id feed.xml)"
  feed_self_url="$(extract_feed_self_href feed.xml)"
  if [[ -n "$site_base_url" ]]; then
    expected_feed_url="${site_base_url}feed.xml"
  fi

  if [[ -z "$feed_id" ]]; then
    err "feed.xml: Missing <id>"
  elif [[ ! "$feed_id" =~ ^https:// ]]; then
    err "feed.xml: <id> must start with https://"
  else
    validate_url_within_site_base "feed.xml <id>" "$feed_id"
  fi

  if [[ -z "$feed_self_url" ]]; then
    err "feed.xml: Missing self link"
  elif [[ ! "$feed_self_url" =~ ^https:// ]]; then
    err "feed.xml: Self link must start with https://"
  else
    validate_url_within_site_base "feed.xml self link" "$feed_self_url"
    # Only meaningful when the base URL is known.
    if [[ -n "$expected_feed_url" && "$feed_self_url" != "$expected_feed_url" ]]; then
      err "feed.xml: Self link must equal $expected_feed_url"
    fi
  fi

  if [[ -n "$feed_id" && -n "$feed_self_url" && "$feed_id" != "$feed_self_url" ]]; then
    err "feed.xml: <id> must match the self link URL"
  fi

  # Accept <entry> with attributes (e.g. xml:lang) or with the tag name at
  # the end of a line, not just the bare "<entry>" spelling.
  grep -Eq '<entry([[:space:]>]|$)' feed.xml || err "feed.xml: Missing <entry>"

  if [[ -f index.html ]]; then
    root_feed_link="$(extract_link_href_with_type index.html alternate 'application/atom+xml')"
    if [[ -z "$root_feed_link" ]]; then
      err "index.html: Missing Atom feed alternate link"
    elif [[ -n "$feed_self_url" && "$root_feed_link" != "$feed_self_url" ]]; then
      err "index.html: Atom feed alternate link must point to $feed_self_url"
    fi
  fi

  # Without a base URL there is no absolute feed URL to fetch.
  if [[ -z "$expected_feed_url" ]]; then
    return 0
  fi

  if ! should_check_http; then
    info "Skipping live feed URL checks outside CI (set SEO_LINT_CHECK_HTTP=1 to enable)"
    return
  fi

  require_tool curl
  info "Checking feed URL"
  http_200 "$expected_feed_url" || err "feed.xml URL failed after retries: $expected_feed_url"
}
387+
253388
scan_html_files() {
254389
local file
255390
local title_text
@@ -279,6 +414,8 @@ scan_html_files() {
279414
err "$file: Missing canonical link"
280415
elif [[ ! "$canonical" =~ ^https:// ]]; then
281416
err "$file: Canonical must start with https://"
417+
else
418+
validate_url_within_site_base "$file canonical" "$canonical"
282419
fi
283420

284421
robots="$(extract_meta_content "$file" robots)"
@@ -330,6 +467,8 @@ scan_markdown_files() {
330467
canonical_url="$(strip_wrapping_quotes "$canonical_url")"
331468
if [[ -n "$canonical_url" && ! "$canonical_url" =~ ^https:// ]]; then
332469
err "$md: canonical_url must start with https://"
470+
elif [[ -n "$canonical_url" ]]; then
471+
validate_url_within_site_base "$md canonical_url" "$canonical_url"
333472
fi
334473
done < <(find_markdown_files)
335474
}
@@ -339,8 +478,10 @@ require_tool grep
339478
require_tool perl
340479
require_tool sed
341480

481+
derive_site_base_url
342482
validate_robots_txt
343483
validate_sitemap_xml
484+
validate_feed_xml
344485
scan_html_files
345486
scan_markdown_files
346487

0 commit comments

Comments
 (0)