@@ -6,6 +6,7 @@ cd "$ROOT_DIR"
66
# Run-wide counters, both starting at zero.
failures=0
warnings=0
# Directory-style https:// prefix (ending in "/") that site URLs are checked
# against. Starts empty and is filled in by derive_site_base_url; it must be
# truly empty here so the `-z "$site_base_url"` fallbacks can trigger.
site_base_url=""
910
1011err () {
1112 printf ' [SEO FAIL] %s\n' " $1 "
@@ -51,6 +52,19 @@ extract_link_href() {
5152 ' " $file "
5253}
5354
55+ extract_link_href_with_type () {
56+ local file=" $1 "
57+ local rel=" $2 "
58+ local type=" $3 "
59+ SEO_LINK_REL=" $rel " SEO_LINK_TYPE=" $type " perl -0777 -ne '
60+ my $rel = quotemeta $ENV{SEO_LINK_REL};
61+ my $type = quotemeta $ENV{SEO_LINK_TYPE};
62+ if (/<link\b(?=[^>]*\brel=["\x27]$rel["\x27])(?=[^>]*\btype=["\x27]$type["\x27])(?=[^>]*\bhref=["\x27]([^"\x27]+)["\x27])[^>]*>/is) {
63+ print $1;
64+ }
65+ ' " $file "
66+ }
67+
5468extract_title () {
5569 local file=" $1 "
5670 perl -0777 -ne '
@@ -78,6 +92,16 @@ extract_robots_sitemap_urls() {
7892 sed -nE ' s/^[[:space:]]*[Ss]itemap:[[:space:]]*(https:\/\/[^[:space:]]+)[[:space:]]*$/\1/p' " $file "
7993}
8094
95+ extract_feed_id () {
96+ local file=" $1 "
97+ perl -0777 -ne ' if (/<feed\b.*?<id>\s*([^<]+)\s*<\/id>/is) { print $1; }' " $file "
98+ }
99+
100+ extract_feed_self_href () {
101+ local file=" $1 "
102+ perl -0777 -ne ' if (/<link\b(?=[^>]*\brel=["\x27]self["\x27])(?=[^>]*\bhref=["\x27]([^"\x27]+)["\x27])[^>]*>/is) { print $1; }' " $file "
103+ }
104+
81105strip_wrapping_quotes () {
82106 local value=" $1 "
83107
@@ -117,6 +141,58 @@ is_repository_doc_markdown() {
117141 return 1
118142}
119143
# Print the given URL, appending "/" when it is non-empty and does not already
# end with one. An empty argument is printed back unchanged (as nothing).
#   $1 - URL to normalise
ensure_trailing_slash() {
  local url="$1"

  if [[ -n "$url" && "$url" != */ ]]; then
    url="${url}/"
  fi
  printf '%s' "$url"
}
154+
# Print the directory-style base of a URL, always ending in "/":
#   - a URL that already ends in "/" is printed as-is;
#   - a URL with a path has its last path segment removed;
#   - a host-only URL (no "/" after the scheme) just gains a trailing "/",
#     instead of being cut back to the bare "https://" prefix.
#   $1 - URL to reduce
_site_base_from_url() {
  local url="$1"
  local without_scheme="${url#*://}"

  if [[ "$url" == */ ]]; then
    printf '%s' "$url"
  elif [[ "$without_scheme" == */* ]]; then
    printf '%s' "${url%/*}/"
  else
    printf '%s' "${url}/"
  fi
}

# Populate the global site_base_url. Sources, in order of precedence:
#   1. the SEO_LINT_SITE_BASE_URL environment variable (trailing slash added);
#   2. the directory of the first Sitemap URL listed in robots.txt;
#   3. the directory of the canonical link in index.html (used whenever the
#      steps above left site_base_url empty).
# Reports a failure through err when no source yields a value, or when the
# resulting base does not start with https://.
derive_site_base_url() {
  local configured_url="${SEO_LINT_SITE_BASE_URL:-}"
  local canonical_url
  local sitemap_urls=()

  if [[ -n "$configured_url" ]]; then
    site_base_url="$(ensure_trailing_slash "$configured_url")"
  elif [[ -f robots.txt ]]; then
    mapfile -t sitemap_urls < <(extract_robots_sitemap_urls robots.txt)
    if [[ "${#sitemap_urls[@]}" -gt 0 ]]; then
      site_base_url="$(_site_base_from_url "${sitemap_urls[0]}")"
    fi
  fi

  if [[ -z "$site_base_url" && -f index.html ]]; then
    canonical_url="$(extract_link_href index.html canonical)"
    if [[ -n "$canonical_url" ]]; then
      site_base_url="$(_site_base_from_url "$canonical_url")"
    fi
  fi

  if [[ -z "$site_base_url" ]]; then
    err "Unable to determine site base URL from SEO_LINT_SITE_BASE_URL, robots.txt, or index.html"
    return
  fi

  [[ "$site_base_url" =~ ^https:// ]] || err "Site base URL must start with https:// ($site_base_url)"
}
187+
# Report a failure through err unless the URL begins with the global
# site_base_url. Does nothing when site_base_url is empty.
#   $1 - label used at the start of the failure message
#   $2 - URL to check
validate_url_within_site_base() {
  local label="$1"
  local url="$2"

  # Return 0 explicitly: a bare `return` after a failed test would hand status
  # 1 back to the caller, which aborts the run if errexit is enabled.
  if [[ -z "$site_base_url" ]]; then
    return 0
  fi

  # The base is quoted so it is compared literally; only the trailing * globs.
  [[ "$url" == "$site_base_url"* ]] || err "$label must stay within site base URL ($site_base_url): $url"
}
195+
120196should_check_http () {
121197 local setting=" ${SEO_LINT_CHECK_HTTP:- auto} "
122198
@@ -203,6 +279,7 @@ validate_robots_txt() {
203279
204280 for sitemap_url in " ${sitemap_urls[@]} " ; do
205281 [[ " $sitemap_url " =~ ^https:// ]] || err " robots.txt: Sitemap URL must start with https:// ($sitemap_url )"
282+ validate_url_within_site_base " robots.txt sitemap URL" " $sitemap_url "
206283 done
207284
208285 if ! should_check_http; then
@@ -235,6 +312,7 @@ validate_sitemap_xml() {
235312
236313 for url in " ${urls[@]} " ; do
237314 [[ " $url " =~ ^https:// ]] || err " sitemap.xml: URL must start with https:// ($url )"
315+ validate_url_within_site_base " sitemap.xml URL" " $url "
238316 done
239317
240318 if ! should_check_http; then
@@ -250,6 +328,63 @@ validate_sitemap_xml() {
250328 done
251329}
252330
# Validate feed.xml in the current directory and its advertisement in
# index.html:
#   - feed.xml must exist;
#   - its <id> and rel="self" link must be https:// URLs inside the site base,
#     and must equal each other;
#   - the self link must equal "<site_base_url>feed.xml";
#   - at least one <entry> element must be present;
#   - index.html, when present, must carry an Atom alternate link pointing to
#     the same URL as the feed self link;
#   - when HTTP checks are enabled, the expected feed URL must pass http_200.
# Every problem is reported through err. Checks that need the site base are
# skipped when it is empty, rather than comparing against the bare string
# "feed.xml".
validate_feed_xml() {
  local feed_id
  local feed_self_url
  local root_feed_link
  local expected_feed_url=""

  [[ -f feed.xml ]] || {
    err "Missing feed.xml at repo root"
    return
  }

  feed_id="$(extract_feed_id feed.xml)"
  feed_self_url="$(extract_feed_self_href feed.xml)"
  if [[ -n "$site_base_url" ]]; then
    expected_feed_url="${site_base_url}feed.xml"
  fi

  if [[ -z "$feed_id" ]]; then
    err "feed.xml: Missing <id>"
  elif [[ ! "$feed_id" =~ ^https:// ]]; then
    err "feed.xml: <id> must start with https://"
  else
    validate_url_within_site_base "feed.xml <id>" "$feed_id"
  fi

  if [[ -z "$feed_self_url" ]]; then
    err "feed.xml: Missing self link"
  elif [[ ! "$feed_self_url" =~ ^https:// ]]; then
    err "feed.xml: Self link must start with https://"
  else
    validate_url_within_site_base "feed.xml self link" "$feed_self_url"
    if [[ -n "$expected_feed_url" && "$feed_self_url" != "$expected_feed_url" ]]; then
      err "feed.xml: Self link must equal $expected_feed_url"
    fi
  fi

  if [[ -n "$feed_id" && -n "$feed_self_url" && "$feed_id" != "$feed_self_url" ]]; then
    err "feed.xml: <id> must match the self link URL"
  fi

  # Accept "<entry>" as well as "<entry ...attributes>".
  grep -Eq '<entry[[:space:]>]' feed.xml || err "feed.xml: Missing <entry>"

  if [[ -f index.html ]]; then
    root_feed_link="$(extract_link_href_with_type index.html alternate 'application/atom+xml')"
    if [[ -z "$root_feed_link" ]]; then
      err "index.html: Missing Atom feed alternate link"
    elif [[ -n "$feed_self_url" && "$root_feed_link" != "$feed_self_url" ]]; then
      err "index.html: Atom feed alternate link must point to $feed_self_url"
    fi
  fi

  # Without a site base there is no absolute feed URL to fetch.
  if [[ -z "$expected_feed_url" ]]; then
    return
  fi

  if ! should_check_http; then
    info "Skipping live feed URL checks outside CI (set SEO_LINT_CHECK_HTTP=1 to enable)"
    return
  fi

  require_tool curl
  info "Checking feed URL"
  http_200 "$expected_feed_url" || err "feed.xml URL failed after retries: $expected_feed_url"
}
387+
253388scan_html_files () {
254389 local file
255390 local title_text
@@ -279,6 +414,8 @@ scan_html_files() {
279414 err " $file : Missing canonical link"
280415 elif [[ ! " $canonical " =~ ^https:// ]]; then
281416 err " $file : Canonical must start with https://"
417+ else
418+ validate_url_within_site_base " $file canonical" " $canonical "
282419 fi
283420
284421 robots=" $( extract_meta_content " $file " robots) "
@@ -330,6 +467,8 @@ scan_markdown_files() {
330467 canonical_url=" $( strip_wrapping_quotes " $canonical_url " ) "
331468 if [[ -n " $canonical_url " && ! " $canonical_url " =~ ^https:// ]]; then
332469 err " $md : canonical_url must start with https://"
470+ elif [[ -n " $canonical_url " ]]; then
471+ validate_url_within_site_base " $md canonical_url" " $canonical_url "
333472 fi
334473 done < <( find_markdown_files)
335474}
@@ -339,8 +478,10 @@ require_tool grep
require_tool perl
require_tool sed

# derive_site_base_url runs first: the validators below read the global
# site_base_url that it sets.
derive_site_base_url
validate_robots_txt
validate_sitemap_xml
validate_feed_xml
scan_html_files
scan_markdown_files
346487
0 commit comments