Skip to content

Commit d9f26a8

Browse files
vinta and claude authored
Improve SEO/AEO discovery surface for awesome-python.com (#3103)
* update gitignore * feat: tighten homepage metadata * fix: trim generated HTML whitespace * feat(website): add discovery files and markdown alternate * feat(website): add sitemap lastmod * feat(seo): add Content-Signal directive to robots.txt Signals search, ai-input, and ai-train to crawlers via the experimental Content-Signal header in robots.txt. Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
1 parent ccd4fb7 commit d9f26a8

5 files changed

Lines changed: 259 additions & 26 deletions

File tree

.gitignore

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@ __pycache__/
1010
website/output/
1111
website/data/
1212

13-
# claude code
13+
# planning docs
14+
docs/
15+
16+
# agents
17+
.agents/
1418
.claude/skills/
15-
.gstack/
16-
.playwright-cli/
1719
.superpowers/
20+
.playwright-cli/
1821
skills-lock.json
19-
20-
# codex
21-
.agents/

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Awesome Python
22

3-
An opinionated list of Python frameworks, libraries, tools, and resources.
3+
An opinionated guide to the best Python frameworks, libraries, tools, and resources.
44

55
# **Sponsors**
66

website/build.py

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import json
55
import re
66
import shutil
7+
import xml.etree.ElementTree as ET
8+
from collections.abc import Sequence
79
from datetime import UTC, datetime
810
from pathlib import Path
911
from typing import Any
@@ -12,6 +14,9 @@
1214
from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors
1315

1416
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
17+
SITE_URL = "https://awesome-python.com/"
18+
SITEMAP_URL = f"{SITE_URL}sitemap.xml"
19+
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
1520

1621
SOURCE_TYPE_DOMAINS = {
1722
"docs.python.org": "Built-in",
@@ -67,6 +72,59 @@ def sort_key(entry: dict) -> tuple[int, int, int, str]:
6772
return sorted(entries, key=sort_key)
6873

6974

75+
def build_robots_txt() -> str:
    """Return the robots.txt body written to the site root.

    The text holds a single group addressed to every crawler
    (``User-agent: *``) with a ``Content-Signal`` line and ``Allow: /``,
    then a blank line and a ``Sitemap`` line pointing at ``SITEMAP_URL``.
    Every line, including the last one, is newline-terminated.
    """
    directives = [
        "User-agent: *",
        "Content-Signal: search=yes, ai-input=yes, ai-train=yes",
        "Allow: /",
        "",  # blank line between the crawler group and the sitemap line
        f"Sitemap: {SITEMAP_URL}",
    ]
    return "".join(f"{directive}\n" for directive in directives)
83+
84+
85+
def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None:
    """Write a sitemap document for *urls* to *path*.

    Args:
        path: Destination file; it is created or overwritten.
        urls: ``(location, lastmod)`` pairs. Each pair becomes one ``<url>``
            element with a ``<loc>`` and a ``<lastmod>`` child, in the order
            given.

    The output is UTF-8 with an XML declaration and ends with a single
    trailing newline.
    """

    def qualified(tag: str) -> str:
        # ElementTree spells namespaced tags as "{namespace-uri}local-name".
        return f"{{{SITEMAP_NS}}}{tag}"

    # Register the sitemap namespace as the default one so elements are
    # serialised without a generated prefix (<urlset xmlns="...">).
    ET.register_namespace("", SITEMAP_NS)

    urlset = ET.Element(qualified("urlset"))
    for url, lastmod in urls:
        url_el = ET.SubElement(urlset, qualified("url"))
        ET.SubElement(url_el, qualified("loc")).text = url
        ET.SubElement(url_el, qualified("lastmod")).text = lastmod

    # Serialise in memory and write the bytes in one go. Handing a filename to
    # ElementTree.write() opens it in text mode, so the newline after the XML
    # declaration follows the platform's line-ending translation, and the
    # trailing newline then had to be appended by reopening the file. A single
    # binary write yields the same bytes on every platform.
    document = ET.tostring(urlset, encoding="utf-8", xml_declaration=True)
    path.write_bytes(document + b"\n")
98+
99+
100+
def top_level_heading_text(line: str) -> str | None:
101+
stripped = line.strip()
102+
if not stripped.startswith("# "):
103+
return None
104+
return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
105+
106+
107+
def remove_sponsors_section(markdown: str) -> str:
    """Return *markdown* without its first top-level "Sponsors" section.

    The section starts at the first level-one heading whose text equals
    "sponsors" (case-insensitively) and runs up to, but not including, the
    next level-one heading with non-empty text, or to the end of the document
    when there is none. Input without such a heading is returned unchanged.
    Line endings of the remaining lines are preserved.
    """
    lines = markdown.splitlines(keepends=True)
    headings = [top_level_heading_text(line) for line in lines]

    section_start = next(
        (
            index
            for index, heading in enumerate(headings)
            if heading and heading.lower() == "sponsors"
        ),
        None,
    )
    if section_start is None:
        return markdown

    # The section ends where the next level-one heading begins.
    section_end = next(
        (index for index in range(section_start + 1, len(lines)) if headings[index]),
        len(lines),
    )

    kept = lines[:section_start] + lines[section_end:]
    return "".join(kept)
126+
127+
70128
def extract_entries(
71129
categories: list[ParsedSection],
72130
groups: list[ParsedGroup],
@@ -131,6 +189,7 @@ def build(repo_root: Path) -> None:
131189
categories = [cat for g in parsed_groups for cat in g["categories"]]
132190
total_entries = sum(c["entry_count"] for c in categories)
133191
entries = extract_entries(categories, parsed_groups)
192+
build_date = datetime.now(UTC)
134193

135194
stars_data = load_stars(website / "data" / "github_stars.json")
136195

@@ -155,6 +214,8 @@ def build(repo_root: Path) -> None:
155214
env = Environment(
156215
loader=FileSystemLoader(website / "templates"),
157216
autoescape=True,
217+
trim_blocks=True,
218+
lstrip_blocks=True,
158219
)
159220

160221
site_dir = website / "output"
@@ -171,7 +232,7 @@ def build(repo_root: Path) -> None:
171232
total_entries=total_entries,
172233
total_categories=len(categories),
173234
repo_stars=repo_stars,
174-
build_date=datetime.now(UTC).strftime("%B %d, %Y"),
235+
build_date=build_date.strftime("%B %d, %Y"),
175236
sponsors=sponsors,
176237
),
177238
encoding="utf-8",
@@ -182,7 +243,11 @@ def build(repo_root: Path) -> None:
182243
if static_src.exists():
183244
shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
184245

185-
(site_dir / "llms.txt").write_text(readme_text, encoding="utf-8")
246+
markdown_index = remove_sponsors_section(readme_text)
247+
(site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
248+
write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
249+
(site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
250+
(site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
186251

187252
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
188253
print(f"Total entries: {total_entries}")

website/templates/base.html

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,27 @@
11
<!doctype html>
22
<html lang="en">
33
<head>
4+
{% set default_meta_title = "Awesome Python" %}
5+
{% set default_meta_description = "An opinionated guide to the best Python frameworks, libraries, and tools. Explore " ~ (entries | length) ~ " curated projects across " ~ total_categories ~ " categories, from AI and agents to data science and web development." %}
6+
{% set canonical_url = "https://awesome-python.com/" %}
7+
{% set social_image_url = "https://awesome-python.com/static/og-image.png" %}
8+
{% set meta_title %}{% block title %}{{ default_meta_title }}{% endblock %}{% endset %}
9+
{% set meta_description %}{% block description %}{{ default_meta_description }}{% endblock %}{% endset %}
410
<meta charset="utf-8" />
511
<meta name="viewport" content="width=device-width, initial-scale=1" />
6-
<title>{% block title %}Awesome Python{% endblock %}</title>
7-
<meta
8-
name="description"
9-
content="{% block description %}An opinionated list of Python frameworks, libraries, tools, and resources. {{ total_entries }} projects across {{ categories | length }} categories.{% endblock %}"
10-
/>
11-
<link rel="canonical" href="https://awesome-python.com/" />
12+
<title>{{ meta_title | trim }}</title>
13+
<meta name="description" content="{{ meta_description | trim }}" />
14+
<link rel="canonical" href="{{ canonical_url }}" />
15+
<link rel="alternate" type="text/markdown" href="/index.md" />
1216
<meta property="og:type" content="website" />
13-
<meta property="og:title" content="Awesome Python" />
14-
<meta
15-
property="og:description"
16-
content="An opinionated list of Python frameworks, libraries, tools, and resources."
17-
/>
18-
<meta
19-
property="og:image"
20-
content="https://awesome-python.com/static/og-image.png"
21-
/>
22-
<meta property="og:url" content="https://awesome-python.com/" />
23-
<meta name="twitter:card" content="summary" />
17+
<meta property="og:title" content="{{ meta_title | trim }}" />
18+
<meta property="og:description" content="{{ meta_description | trim }}" />
19+
<meta property="og:image" content="{{ social_image_url }}" />
20+
<meta property="og:url" content="{{ canonical_url }}" />
21+
<meta name="twitter:card" content="summary_large_image" />
22+
<meta name="twitter:title" content="{{ meta_title | trim }}" />
23+
<meta name="twitter:description" content="{{ meta_description | trim }}" />
24+
<meta name="twitter:image" content="{{ social_image_url }}" />
2425
<meta name="theme-color" content="#1c1410" />
2526
<link rel="icon" href="/static/favicon.svg" type="image/svg+xml" />
2627
<link rel="preconnect" href="https://fonts.googleapis.com" />

website/tests/test_build.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import json
44
import shutil
55
import textwrap
6+
import xml.etree.ElementTree as ET
7+
from datetime import UTC, date, datetime
8+
from html.parser import HTMLParser
69
from pathlib import Path
710

811
from build import (
@@ -15,6 +18,40 @@
1518
)
1619
from readme_parser import parse_readme, slugify
1720

21+
22+
class HeadMetadataParser(HTMLParser):
    """Collect title, ``<meta>`` and ``<link rel>`` data from fed HTML.

    After ``feed()``:
      * ``title_count`` is the number of ``<title>`` start tags seen and
        ``title`` is the concatenated text found inside them;
      * ``meta_by_name`` / ``meta_by_property`` map a ``<meta>`` tag's
        ``name`` / ``property`` attribute to its ``content`` ("" if absent);
      * ``links_by_rel`` maps each whitespace-separated ``rel`` token of a
        ``<link>`` tag to its ``href`` ("" if absent).
    Later tags overwrite earlier ones that share a key.
    """

    def __init__(self):
        super().__init__()
        self._in_title = False
        self.title = ""
        self.title_count = 0
        self.meta_by_name = {}
        self.meta_by_property = {}
        self.links_by_rel = {}

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)

        if tag == "title":
            self._in_title = True
            self.title_count += 1
            return

        if tag == "meta":
            # A tag carrying both `name` and `property` is recorded under both.
            for key, bucket in (
                ("name", self.meta_by_name),
                ("property", self.meta_by_property),
            ):
                if key in attr_map:
                    bucket[attr_map[key]] = attr_map.get("content", "")
            return

        if tag == "link":
            rel_value = attr_map.get("rel")
            if rel_value:
                href = attr_map.get("href", "")
                self.links_by_rel.update(dict.fromkeys(rel_value.split(), href))

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        # Only text that appears between <title> and </title> is kept.
        if self._in_title:
            self.title += data
53+
54+
1855
# ---------------------------------------------------------------------------
1956
# slugify
2057
# ---------------------------------------------------------------------------
@@ -72,6 +109,11 @@ def _make_repo(self, tmp_path, readme):
72109
encoding="utf-8",
73110
)
74111

112+
def _copy_real_templates(self, tmp_path):
    """Copy the ``templates`` directory next to this tests directory into
    ``tmp_path / "website" / "templates"``."""
    source_templates = Path(__file__).parent / ".." / "templates"
    shutil.copytree(source_templates, tmp_path / "website" / "templates")
116+
75117
def test_build_creates_single_page(self, tmp_path):
76118
readme = textwrap.dedent("""\
77119
# Awesome Python
@@ -114,6 +156,97 @@ def test_build_creates_single_page(self, tmp_path):
114156
# No category sub-pages
115157
assert not (site / "categories").exists()
116158

159+
def test_build_creates_root_discovery_files(self, tmp_path):
    """build() writes robots.txt and a one-URL sitemap.xml to the output root."""
    self._make_repo(
        tmp_path,
        textwrap.dedent("""\
            # Awesome Python

            Intro.

            ---

            ## Widgets

            - [w1](https://example.com) - A widget.

            # Contributing

            Help!
        """),
    )

    # Sample the UTC date on both sides of the build so the lastmod check
    # still holds if the date changes while the build is running.
    earliest = datetime.now(UTC).date()
    build(tmp_path)
    latest = datetime.now(UTC).date()

    output_dir = tmp_path / "website" / "output"

    expected_robots = (
        "User-agent: *\n"
        "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
        "Allow: /\n"
        "\n"
        "Sitemap: https://awesome-python.com/sitemap.xml\n"
    )
    assert (output_dir / "robots.txt").read_text(encoding="utf-8") == expected_robots

    sitemap_ns = "http://www.sitemaps.org/schemas/sitemap/0.9"
    prefixes = {"sitemap": sitemap_ns}
    urlset = ET.parse(output_dir / "sitemap.xml").getroot()
    locs = [node.text for node in urlset.findall("sitemap:url/sitemap:loc", prefixes)]
    lastmods = [node.text for node in urlset.findall("sitemap:url/sitemap:lastmod", prefixes)]

    assert urlset.tag == f"{{{sitemap_ns}}}urlset"
    assert locs == ["https://awesome-python.com/"]
    assert len(lastmods) == 1
    assert earliest <= date.fromisoformat(lastmods[0]) <= latest
    assert all(loc.startswith("https://awesome-python.com/") for loc in locs)
    assert all("?" not in loc for loc in locs)
202+
203+
def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path):
    """index.md and llms.txt match each other and omit the Sponsors section."""
    readme_source = textwrap.dedent("""\
        # Awesome Python

        Intro.

        # **Sponsors**

        - **[Sponsor](https://sponsor.example.com)**: Sponsored tool.

        > Become a sponsor: [Sponsor us](SPONSORSHIP.md).

        # Categories

        **Tools**

        - [Widgets](#widgets)

        ---

        ## Widgets

        - [w1](https://example.com) - A widget.

        # Contributing

        Help!
    """)
    (tmp_path / "README.md").write_text(readme_source, encoding="utf-8")
    self._copy_real_templates(tmp_path)

    build(tmp_path)

    output_dir = tmp_path / "website" / "output"
    page_html = (output_dir / "index.html").read_text(encoding="utf-8")
    markdown_page = (output_dir / "index.md").read_text(encoding="utf-8")
    llms_page = (output_dir / "llms.txt").read_text(encoding="utf-8")

    # The HTML page advertises the markdown alternate.
    assert '<link rel="alternate" type="text/markdown" href="/index.md" />' in page_html
    assert markdown_page == llms_page

    # The sponsors block is gone and the text on either side of it is joined.
    assert markdown_page.startswith("# Awesome Python\n\nIntro.\n\n# Categories")
    assert "# **Sponsors**" not in markdown_page
    assert "Sponsor" not in markdown_page
    assert "SPONSORSHIP.md" not in markdown_page

    # Content after the sponsors block is kept.
    assert "## Widgets" in markdown_page
    assert "- [w1](https://example.com) - A widget." in markdown_page
249+
117250
def test_build_cleans_stale_output(self, tmp_path):
118251
readme = textwrap.dedent("""\
119252
# T
@@ -235,6 +368,40 @@ def test_build_with_stars_sorts_by_stars(self, tmp_path):
235368
# Expand content present
236369
assert "expand-content" in html
237370

371+
def test_index_contains_aligned_homepage_metadata(self, tmp_path):
    """Title, description, canonical, Open Graph and Twitter tags agree."""
    readme = (Path(__file__).parents[2] / "README.md").read_text(encoding="utf-8")
    (tmp_path / "README.md").write_text(readme, encoding="utf-8")
    self._copy_real_templates(tmp_path)

    build(tmp_path)

    # Recompute the counts the description is expected to mention.
    groups = parse_readme(readme)
    categories = [category for group in groups for category in group["categories"]]
    entries = extract_entries(categories, groups)

    html = (tmp_path / "website" / "output" / "index.html").read_text(encoding="utf-8")
    head = HeadMetadataParser()
    head.feed(html)

    expected_title = "Awesome Python"
    expected_description = f"An opinionated guide to the best Python frameworks, libraries, and tools. Explore {len(entries)} curated projects across {len(categories)} categories, from AI and agents to data science and web development."
    expected_url = "https://awesome-python.com/"
    expected_image = "https://awesome-python.com/static/og-image.png"

    named = head.meta_by_name
    og = head.meta_by_property

    assert head.title_count == 1
    assert head.title.strip() == expected_title
    assert named["description"] == expected_description
    assert head.links_by_rel["canonical"] == expected_url
    assert og["og:type"] == "website"
    assert og["og:title"] == expected_title
    assert og["og:description"] == expected_description
    assert og["og:image"] == expected_image
    assert og["og:url"] == expected_url
    assert named["twitter:card"] == "summary_large_image"
    assert named["twitter:title"] == expected_title
    assert named["twitter:description"] == expected_description
    assert named["twitter:image"] == expected_image
    # NOTE(review): literal reproduced exactly as shown in the reviewed text;
    # the whitespace after "\n" may have been collapsed during extraction.
    # Confirm it against the template's real indentation.
    assert "<head>\n <meta charset" in html
404+
238405

239406
# ---------------------------------------------------------------------------
240407
# extract_github_repo

0 commit comments

Comments
 (0)