Skip to content

Commit 42c30d1

Browse files
authored
Rewrite hexdocs.pm canonical links to per-package subdomains (#119)
ExDoc emits a <link rel="canonical"> tag (when the package sets the :canonical option) that points at the old path-based URL, https://hexdocs.pm/<package>/... Now that docs are served from per-package subdomains, that canonical URL points away from where the page actually lives, which splits the SEO signal. Rewrite the canonical tag at ingestion time in the file rewriter so that it points at https://<package>.hexdocs.pm/..., reusing package_to_subdomain for the underscore-to-hyphen mapping and upgrading http to https. The bare apex, apex files such as sitemap.xml, and canonical links that already use a subdomain are left untouched. Body links and other tags are intentionally not rewritten: a permanent (301) redirect from the old URLs preserves their link equity, so the canonical tag is the only place where the rewrite changes SEO behavior.
1 parent d9ceaf4 commit 42c30d1

2 files changed

Lines changed: 119 additions & 1 deletion

File tree

lib/hexdocs/file_rewriter.ex

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,25 @@ defmodule Hexdocs.FileRewriter do
1616
|> add_elixir_org_link(path)
1717
|> add_analytics(path)
1818
|> remove_noindex(path)
19+
|> rewrite_canonical_links(path)
1920
|> add_nofollow(path)
2021
end
2122

23+
# Matches one whole `<link ...>` tag that carries a `rel` attribute equal to
# `canonical` (single or double quotes). The `i` flag makes the match
# case-insensitive, and `[^>]*` on both sides lets `rel` appear before or
# after the other attributes of the tag.
@canonical_tag_re ~r{<link[^>]*\brel=["']canonical["'][^>]*>}i
24+
# Matches an `http://` or `https://` URL prefix on the `hexdocs.pm` apex and
# captures the first path segment: a lowercase letter followed by lowercase
# letters, digits or underscores. The negative lookahead rejects a segment
# that is directly followed by a letter, digit, `_`, `.` or `-`, so the
# capture cannot stop in the middle of a longer name and dotted file names
# such as `sitemap.xml` do not match at all.
@hexdocs_link_re ~r{https?://hexdocs\.pm/([a-z][a-z0-9_]*)(?![a-zA-Z0-9_.-])}
25+
26+
# Rewrites path-based hexdocs.pm URLs found inside canonical <link> tags so
# they use the package's own subdomain. Only files whose path ends in
# ".html" are processed; any other content is returned unchanged.
defp rewrite_canonical_links(content, path) do
  case String.ends_with?(path, ".html") do
    true -> Regex.replace(@canonical_tag_re, content, &relocate_canonical_url/1)
    false -> content
  end
end

# Receives one matched canonical <link> tag and swaps every
# "http(s)://hexdocs.pm/<package>" prefix in it for the https subdomain form.
# Whatever follows the package segment in the URL is left in place.
defp relocate_canonical_url(link_tag) do
  Regex.replace(@hexdocs_link_re, link_tag, fn _whole, package_name ->
    subdomain = Hexdocs.Utils.package_to_subdomain(package_name)
    "https://#{subdomain}.hexdocs.pm"
  end)
end
37+
2238
defp add_elixir_org_link(content, path) do
2339
if String.ends_with?(path, ".html") and not String.contains?(content, @link_addition) do
2440
String.replace(content, @link_hooks, &(&1 <> " for the " <> @link_addition))

test/hexdocs/file_rewriter_test.exs

Lines changed: 103 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,108 @@ defmodule Hexdocs.FileRewriterTest do
1717
assert FileRewriter.run("index.html", ~s|<meta name="robots" content="noindex">|) == ""
1818
end
1919

20+
# Covers the canonical-link rewrite performed by FileRewriter.run/2: which
# canonical URLs are moved to a per-package subdomain and which inputs must
# come back byte-for-byte unchanged.
describe "rewrite hexdocs.pm canonical links to subdomains" do
  test "rewrites a canonical link" do
    html = ~s|<link rel="canonical" href="https://hexdocs.pm/jason/Jason.html"/>|
    expected = ~s|<link rel="canonical" href="https://jason.hexdocs.pm/Jason.html"/>|

    assert FileRewriter.run("index.html", html) == expected
  end

  test "preserves version, query and fragment in the tail" do
    html =
      ~s|<link rel="canonical" href="https://hexdocs.pm/jason/1.4.0/Jason.html?foo=bar#decode/2"/>|

    expected =
      ~s|<link rel="canonical" href="https://jason.hexdocs.pm/1.4.0/Jason.html?foo=bar#decode/2"/>|

    assert FileRewriter.run("index.html", html) == expected
  end

  test "maps underscores in the package name to hyphens" do
    html = ~s|<link rel="canonical" href="https://hexdocs.pm/phoenix_html/Phoenix.HTML.html"/>|
    expected = ~s|<link rel="canonical" href="https://phoenix-html.hexdocs.pm/Phoenix.HTML.html"/>|

    assert FileRewriter.run("index.html", html) == expected
  end

  test "rewrites the hex package" do
    html = ~s|<link rel="canonical" href="https://hexdocs.pm/hex/usage.html"/>|
    expected = ~s|<link rel="canonical" href="https://hex.hexdocs.pm/usage.html"/>|

    assert FileRewriter.run("index.html", html) == expected
  end

  test "rewrites http links to https subdomains" do
    html = ~s|<link rel="canonical" href="http://hexdocs.pm/jason/Jason.html"/>|
    expected = ~s|<link rel="canonical" href="https://jason.hexdocs.pm/Jason.html"/>|

    assert FileRewriter.run("index.html", html) == expected
  end

  test "handles href before rel in the canonical tag" do
    html = ~s|<link href="https://hexdocs.pm/jason/Jason.html" rel="canonical">|
    expected = ~s|<link href="https://jason.hexdocs.pm/Jason.html" rel="canonical">|

    assert FileRewriter.run("index.html", html) == expected
  end

  test "does not rewrite body links or text" do
    untouched = [
      ~s|<a href="https://hexdocs.pm/jason/Jason.html">Jason</a>|,
      ~s|<pre><code>visit https://hexdocs.pm/jason/readme.html</code></pre>|
    ]

    Enum.each(untouched, fn html ->
      assert FileRewriter.run("index.html", html) == html
    end)
  end

  test "does not rewrite other link tags" do
    html = ~s|<link rel="stylesheet" href="https://hexdocs.pm/jason/app.css">|

    assert FileRewriter.run("index.html", html) == html
  end

  test "leaves the bare apex untouched" do
    untouched = [
      ~s|<link rel="canonical" href="https://hexdocs.pm"/>|,
      ~s|<link rel="canonical" href="https://hexdocs.pm/"/>|
    ]

    Enum.each(untouched, fn html ->
      assert FileRewriter.run("index.html", html) == html
    end)
  end

  test "leaves apex files untouched" do
    untouched = [
      ~s|<link rel="canonical" href="https://hexdocs.pm/sitemap.xml"/>|,
      ~s|<link rel="canonical" href="https://hexdocs.pm/foo.html"/>|
    ]

    Enum.each(untouched, fn html ->
      assert FileRewriter.run("index.html", html) == html
    end)
  end

  test "does not touch canonical links that already use a subdomain" do
    untouched = [
      ~s|<link rel="canonical" href="https://jason.hexdocs.pm/Jason.html"/>|,
      ~s|<link rel="canonical" href="https://preview.hexdocs.pm/foo/Foo.html"/>|
    ]

    Enum.each(untouched, fn html ->
      assert FileRewriter.run("index.html", html) == html
    end)
  end

  test "is idempotent" do
    html = ~s|<link rel="canonical" href="https://hexdocs.pm/jason/Jason.html"/>|

    # Feeding the rewritten output back in must not change it again.
    first_pass = FileRewriter.run("index.html", html)
    second_pass = FileRewriter.run("index.html", first_pass)

    assert second_pass == first_pass
  end

  test "does not modify non-html files" do
    html = ~s|<link rel="canonical" href="https://hexdocs.pm/jason/Jason.html"/>|

    assert FileRewriter.run("index.js", html) == html
  end
end
121+
20122
describe "add_nofollow" do
21123
test "adds rel=nofollow to external links" do
22124
assert FileRewriter.run("index.html", ~s|<a href="https://example.com">example</a>|) ==
@@ -42,7 +144,7 @@ defmodule Hexdocs.FileRewriterTest do
42144
test "does not add nofollow to official ecosystem links" do
43145
for url <- [
44146
"https://hex.pm/packages/foo",
45-
"https://hexdocs.pm/foo",
147+
"https://hexdocs.pm",
46148
"https://elixir-lang.org",
47149
"https://www.erlang.org",
48150
"https://preview.hexdocs.pm/foo"

0 commit comments

Comments
 (0)