Improve SEO/AEO discovery surface for awesome-python.com (#3103)

vinta · claude · web-flow · commit d9f26a86357b · 2026-05-02T01:53:19.000+08:00
* update gitignore

* feat: tighten homepage metadata

* fix: trim generated HTML whitespace

* feat(website): add discovery files and markdown alternate

* feat(website): add sitemap lastmod

* feat(seo): add Content-Signal directive to robots.txt

Signals search, ai-input, and ai-train preferences to crawlers
via the experimental Content-Signal directive in robots.txt.

Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/.gitignore b/.gitignore
@@ -10,12 +10,12 @@ __pycache__/
 website/output/
 website/data/
 
-# claude code
+# planning docs
+docs/
+
+# agents
+.agents/
 .claude/skills/
-.gstack/
-.playwright-cli/
 .superpowers/
+.playwright-cli/
 skills-lock.json
-
-# codex
-.agents/
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Awesome Python
 
-An opinionated list of Python frameworks, libraries, tools, and resources.
+An opinionated guide to the best Python frameworks, libraries, tools, and resources.
 
 # **Sponsors**
 
diff --git a/website/build.py b/website/build.py
@@ -4,6 +4,8 @@
 import json
 import re
 import shutil
+import xml.etree.ElementTree as ET
+from collections.abc import Sequence
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
@@ -12,6 +14,9 @@
 from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors
 
 GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
+SITE_URL = "https://awesome-python.com/"
+SITEMAP_URL = f"{SITE_URL}sitemap.xml"
+SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
 
 SOURCE_TYPE_DOMAINS = {
     "docs.python.org": "Built-in",
@@ -67,6 +72,59 @@ def sort_key(entry: dict) -> tuple[int, int, int, str]:
     return sorted(entries, key=sort_key)
 
 
+def build_robots_txt() -> str:
+    return (
+        "User-agent: *\n"
+        "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
+        "Allow: /\n"
+        "\n"
+        f"Sitemap: {SITEMAP_URL}\n"
+    )
+
+
+def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None:
+    ET.register_namespace("", SITEMAP_NS)
+    urlset = ET.Element(f"{{{SITEMAP_NS}}}urlset")
+    for url, lastmod in urls:
+        url_el = ET.SubElement(urlset, f"{{{SITEMAP_NS}}}url")
+        loc_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}loc")
+        loc_el.text = url
+        lastmod_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}lastmod")
+        lastmod_el.text = lastmod
+
+    ET.ElementTree(urlset).write(path, encoding="utf-8", xml_declaration=True)
+    with path.open("ab") as f:
+        f.write(b"\n")
+
+
+def top_level_heading_text(line: str) -> str | None:
+    stripped = line.strip()
+    if not stripped.startswith("# "):
+        return None
+    return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
+
+
+def remove_sponsors_section(markdown: str) -> str:
+    lines = markdown.splitlines(keepends=True)
+    start_idx = None
+    for i, line in enumerate(lines):
+        heading = top_level_heading_text(line)
+        if heading and heading.lower() == "sponsors":
+            start_idx = i
+            break
+
+    if start_idx is None:
+        return markdown
+
+    end_idx = len(lines)
+    for i, line in enumerate(lines[start_idx + 1 :], start=start_idx + 1):
+        if top_level_heading_text(line):
+            end_idx = i
+            break
+
+    return "".join(lines[:start_idx] + lines[end_idx:])
+
+
 def extract_entries(
     categories: list[ParsedSection],
     groups: list[ParsedGroup],
@@ -131,6 +189,7 @@ def build(repo_root: Path) -> None:
     categories = [cat for g in parsed_groups for cat in g["categories"]]
     total_entries = sum(c["entry_count"] for c in categories)
     entries = extract_entries(categories, parsed_groups)
+    build_date = datetime.now(UTC)
 
     stars_data = load_stars(website / "data" / "github_stars.json")
 
@@ -155,6 +214,8 @@ def build(repo_root: Path) -> None:
     env = Environment(
         loader=FileSystemLoader(website / "templates"),
         autoescape=True,
+        trim_blocks=True,
+        lstrip_blocks=True,
     )
 
     site_dir = website / "output"
@@ -171,7 +232,7 @@ def build(repo_root: Path) -> None:
             total_entries=total_entries,
             total_categories=len(categories),
             repo_stars=repo_stars,
-            build_date=datetime.now(UTC).strftime("%B %d, %Y"),
+            build_date=build_date.strftime("%B %d, %Y"),
             sponsors=sponsors,
         ),
         encoding="utf-8",
@@ -182,7 +243,11 @@ def build(repo_root: Path) -> None:
     if static_src.exists():
         shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
 
-    (site_dir / "llms.txt").write_text(readme_text, encoding="utf-8")
+    markdown_index = remove_sponsors_section(readme_text)
+    (site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
+    write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
+    (site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
+    (site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
 
     print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
     print(f"Total entries: {total_entries}")
diff --git a/website/templates/base.html b/website/templates/base.html
@@ -1,26 +1,27 @@
 <!doctype html>
 <html lang="en">
   <head>
+    {% set default_meta_title = "Awesome Python" %}
+    {% set default_meta_description = "An opinionated guide to the best Python frameworks, libraries, and tools. Explore " ~ (entries | length) ~ " curated projects across " ~ total_categories ~ " categories, from AI and agents to data science and web development." %}
+    {% set canonical_url = "https://awesome-python.com/" %}
+    {% set social_image_url = "https://awesome-python.com/static/og-image.png" %}
+    {% set meta_title %}{% block title %}{{ default_meta_title }}{% endblock %}{% endset %}
+    {% set meta_description %}{% block description %}{{ default_meta_description }}{% endblock %}{% endset %}
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1" />
-    <title>{% block title %}Awesome Python{% endblock %}</title>
-    <meta
-      name="description"
-      content="{% block description %}An opinionated list of Python frameworks, libraries, tools, and resources. {{ total_entries }} projects across {{ categories | length }} categories.{% endblock %}"
-    />
-    <link rel="canonical" href="https://awesome-python.com/" />
+    <title>{{ meta_title | trim }}</title>
+    <meta name="description" content="{{ meta_description | trim }}" />
+    <link rel="canonical" href="{{ canonical_url }}" />
+    <link rel="alternate" type="text/markdown" href="/index.md" />
     <meta property="og:type" content="website" />
-    <meta property="og:title" content="Awesome Python" />
-    <meta
-      property="og:description"
-      content="An opinionated list of Python frameworks, libraries, tools, and resources."
-    />
-    <meta
-      property="og:image"
-      content="https://awesome-python.com/static/og-image.png"
-    />
-    <meta property="og:url" content="https://awesome-python.com/" />
-    <meta name="twitter:card" content="summary" />
+    <meta property="og:title" content="{{ meta_title | trim }}" />
+    <meta property="og:description" content="{{ meta_description | trim }}" />
+    <meta property="og:image" content="{{ social_image_url }}" />
+    <meta property="og:url" content="{{ canonical_url }}" />
+    <meta name="twitter:card" content="summary_large_image" />
+    <meta name="twitter:title" content="{{ meta_title | trim }}" />
+    <meta name="twitter:description" content="{{ meta_description | trim }}" />
+    <meta name="twitter:image" content="{{ social_image_url }}" />
     <meta name="theme-color" content="#1c1410" />
     <link rel="icon" href="/static/favicon.svg" type="image/svg+xml" />
     <link rel="preconnect" href="https://fonts.googleapis.com" />
diff --git a/website/tests/test_build.py b/website/tests/test_build.py
@@ -3,6 +3,9 @@
 import json
 import shutil
 import textwrap
+import xml.etree.ElementTree as ET
+from datetime import UTC, date, datetime
+from html.parser import HTMLParser
 from pathlib import Path
 
 from build import (
@@ -15,6 +18,40 @@
 )
 from readme_parser import parse_readme, slugify
 
+
+class HeadMetadataParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.title_count = 0
+        self.title = ""
+        self.meta_by_name = {}
+        self.meta_by_property = {}
+        self.links_by_rel = {}
+        self._in_title = False
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict(attrs)
+        if tag == "title":
+            self.title_count += 1
+            self._in_title = True
+        elif tag == "meta":
+            if "name" in attrs:
+                self.meta_by_name[attrs["name"]] = attrs.get("content", "")
+            if "property" in attrs:
+                self.meta_by_property[attrs["property"]] = attrs.get("content", "")
+        elif tag == "link" and attrs.get("rel"):
+            for rel in attrs["rel"].split():
+                self.links_by_rel[rel] = attrs.get("href", "")
+
+    def handle_endtag(self, tag):
+        if tag == "title":
+            self._in_title = False
+
+    def handle_data(self, data):
+        if self._in_title:
+            self.title += data
+
+
 # ---------------------------------------------------------------------------
 # slugify
 # ---------------------------------------------------------------------------
@@ -72,6 +109,11 @@ def _make_repo(self, tmp_path, readme):
             encoding="utf-8",
         )
 
+    def _copy_real_templates(self, tmp_path):
+        real_tpl = Path(__file__).parent / ".." / "templates"
+        tpl_dir = tmp_path / "website" / "templates"
+        shutil.copytree(real_tpl, tpl_dir)
+
     def test_build_creates_single_page(self, tmp_path):
         readme = textwrap.dedent("""\
             # Awesome Python
@@ -114,6 +156,97 @@ def test_build_creates_single_page(self, tmp_path):
         # No category sub-pages
         assert not (site / "categories").exists()
 
+    def test_build_creates_root_discovery_files(self, tmp_path):
+        readme = textwrap.dedent("""\
+            # Awesome Python
+
+            Intro.
+
+            ---
+
+            ## Widgets
+
+            - [w1](https://example.com) - A widget.
+
+            # Contributing
+
+            Help!
+        """)
+        self._make_repo(tmp_path, readme)
+        start_date = datetime.now(UTC).date()
+        build(tmp_path)
+        end_date = datetime.now(UTC).date()
+
+        site = tmp_path / "website" / "output"
+        robots = (site / "robots.txt").read_text(encoding="utf-8")
+        assert robots == (
+            "User-agent: *\n"
+            "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
+            "Allow: /\n"
+            "\n"
+            "Sitemap: https://awesome-python.com/sitemap.xml\n"
+        )
+
+        sitemap = ET.parse(site / "sitemap.xml")
+        root = sitemap.getroot()
+        ns = {"sitemap": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+        locs = [loc.text for loc in root.findall("sitemap:url/sitemap:loc", ns)]
+        lastmods = [lastmod.text for lastmod in root.findall("sitemap:url/sitemap:lastmod", ns)]
+
+        assert root.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset"
+        assert locs == ["https://awesome-python.com/"]
+        assert len(lastmods) == 1
+        assert start_date <= date.fromisoformat(lastmods[0]) <= end_date
+        assert all(loc.startswith("https://awesome-python.com/") for loc in locs)
+        assert all("?" not in loc for loc in locs)
+
+    def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path):
+        readme = textwrap.dedent("""\
+            # Awesome Python
+
+            Intro.
+
+            # **Sponsors**
+
+            - **[Sponsor](https://sponsor.example.com)**: Sponsored tool.
+
+            > Become a sponsor: [Sponsor us](SPONSORSHIP.md).
+
+            # Categories
+
+            **Tools**
+
+            - [Widgets](#widgets)
+
+            ---
+
+            ## Widgets
+
+            - [w1](https://example.com) - A widget.
+
+            # Contributing
+
+            Help!
+        """)
+        (tmp_path / "README.md").write_text(readme, encoding="utf-8")
+        self._copy_real_templates(tmp_path)
+
+        build(tmp_path)
+
+        site = tmp_path / "website" / "output"
+        index_html = (site / "index.html").read_text(encoding="utf-8")
+        index_md = (site / "index.md").read_text(encoding="utf-8")
+        llms_txt = (site / "llms.txt").read_text(encoding="utf-8")
+
+        assert '<link rel="alternate" type="text/markdown" href="/index.md" />' in index_html
+        assert index_md == llms_txt
+        assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories")
+        assert "# **Sponsors**" not in index_md
+        assert "Sponsor" not in index_md
+        assert "SPONSORSHIP.md" not in index_md
+        assert "## Widgets" in index_md
+        assert "- [w1](https://example.com) - A widget." in index_md
+
     def test_build_cleans_stale_output(self, tmp_path):
         readme = textwrap.dedent("""\
             # T
@@ -235,6 +368,40 @@ def test_build_with_stars_sorts_by_stars(self, tmp_path):
         # Expand content present
         assert "expand-content" in html
 
+    def test_index_contains_aligned_homepage_metadata(self, tmp_path):
+        readme = (Path(__file__).parents[2] / "README.md").read_text(encoding="utf-8")
+        (tmp_path / "README.md").write_text(readme, encoding="utf-8")
+        self._copy_real_templates(tmp_path)
+
+        build(tmp_path)
+
+        parsed_groups = parse_readme(readme)
+        categories = [cat for group in parsed_groups for cat in group["categories"]]
+        entries = extract_entries(categories, parsed_groups)
+        html = (tmp_path / "website" / "output" / "index.html").read_text(encoding="utf-8")
+        parser = HeadMetadataParser()
+        parser.feed(html)
+
+        expected_title = "Awesome Python"
+        expected_description = f"An opinionated guide to the best Python frameworks, libraries, and tools. Explore {len(entries)} curated projects across {len(categories)} categories, from AI and agents to data science and web development."
+        expected_url = "https://awesome-python.com/"
+        expected_image = "https://awesome-python.com/static/og-image.png"
+
+        assert parser.title_count == 1
+        assert parser.title.strip() == expected_title
+        assert parser.meta_by_name["description"] == expected_description
+        assert parser.links_by_rel["canonical"] == expected_url
+        assert parser.meta_by_property["og:type"] == "website"
+        assert parser.meta_by_property["og:title"] == expected_title
+        assert parser.meta_by_property["og:description"] == expected_description
+        assert parser.meta_by_property["og:image"] == expected_image
+        assert parser.meta_by_property["og:url"] == expected_url
+        assert parser.meta_by_name["twitter:card"] == "summary_large_image"
+        assert parser.meta_by_name["twitter:title"] == expected_title
+        assert parser.meta_by_name["twitter:description"] == expected_description
+        assert parser.meta_by_name["twitter:image"] == expected_image
+        assert "<head>\n    <meta charset" in html
+
 
 # ---------------------------------------------------------------------------
 # extract_github_repo