|
3 | 3 | import json |
4 | 4 | import shutil |
5 | 5 | import textwrap |
| 6 | +import xml.etree.ElementTree as ET |
| 7 | +from datetime import UTC, date, datetime |
| 8 | +from html.parser import HTMLParser |
6 | 9 | from pathlib import Path |
7 | 10 |
|
8 | 11 | from build import ( |
|
15 | 18 | ) |
16 | 19 | from readme_parser import parse_readme, slugify |
17 | 20 |
|
| 21 | + |
class HeadMetadataParser(HTMLParser):
    """Collect title, ``<meta>`` and ``<link>`` data while parsing HTML.

    After ``feed()`` has been called the following attributes are populated:

    * ``title_count`` -- number of ``<title>`` start tags encountered.
    * ``title`` -- concatenated text seen while inside a ``<title>`` element.
    * ``meta_by_name`` -- ``name`` attribute -> ``content`` for ``<meta>`` tags.
    * ``meta_by_property`` -- ``property`` attribute -> ``content`` for ``<meta>`` tags.
    * ``links_by_rel`` -- each whitespace-separated ``rel`` token -> ``href``
      for ``<link>`` tags.

    When the same key occurs more than once, the last occurrence wins.
    A missing or valueless ``content`` / ``href`` attribute is stored as ``""``.
    """

    def __init__(self):
        super().__init__()
        self.title_count = 0
        self.title = ""
        self.meta_by_name = {}
        self.meta_by_property = {}
        self.links_by_rel = {}
        # True only between a <title> start tag and its matching end tag.
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        """Record metadata carried by ``<title>``, ``<meta>`` and ``<link>`` tags."""
        attrs = dict(attrs)
        if tag == "title":
            self.title_count += 1
            self._in_title = True
        elif tag == "meta":
            # HTMLParser reports a valueless attribute (``<meta content>``) as
            # None, so ``.get(key, "")`` alone would leak None into the result.
            content = attrs.get("content") or ""
            if "name" in attrs:
                self.meta_by_name[attrs["name"]] = content
            if "property" in attrs:
                self.meta_by_property[attrs["property"]] = content
        elif tag == "link" and attrs.get("rel"):
            href = attrs.get("href") or ""
            # ``rel`` may hold several space-separated tokens; index each one.
            for rel in attrs["rel"].split():
                self.links_by_rel[rel] = href

    def handle_endtag(self, tag):
        """Stop collecting title text once ``</title>`` is reached."""
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        """Append text to ``title`` while inside a ``<title>`` element."""
        if self._in_title:
            self.title += data
| 53 | + |
| 54 | + |
18 | 55 | # --------------------------------------------------------------------------- |
19 | 56 | # slugify |
20 | 57 | # --------------------------------------------------------------------------- |
@@ -72,6 +109,11 @@ def _make_repo(self, tmp_path, readme): |
72 | 109 | encoding="utf-8", |
73 | 110 | ) |
74 | 111 |
|
def _copy_real_templates(self, tmp_path):
    """Copy the real ``templates`` directory into ``tmp_path/website/templates``.

    The source directory is resolved relative to this file: one level above
    the directory that contains it.
    """
    destination = tmp_path / "website" / "templates"
    source = Path(__file__).parent / ".." / "templates"
    shutil.copytree(source, destination)
| 116 | + |
75 | 117 | def test_build_creates_single_page(self, tmp_path): |
76 | 118 | readme = textwrap.dedent("""\ |
77 | 119 | # Awesome Python |
@@ -114,6 +156,97 @@ def test_build_creates_single_page(self, tmp_path): |
114 | 156 | # No category sub-pages |
115 | 157 | assert not (site / "categories").exists() |
116 | 158 |
|
def test_build_creates_root_discovery_files(self, tmp_path):
    """Build a minimal repo and check the generated robots.txt and sitemap.xml."""
    readme = textwrap.dedent("""\
        # Awesome Python

        Intro.

        ---

        ## Widgets

        - [w1](https://example.com) - A widget.

        # Contributing

        Help!
    """)
    self._make_repo(tmp_path, readme)

    # Bracket the build with UTC dates so the <lastmod> check below still
    # holds if the build happens to run across midnight.
    start_date = datetime.now(UTC).date()
    build(tmp_path)
    end_date = datetime.now(UTC).date()

    output_dir = tmp_path / "website" / "output"

    expected_robots = "".join(
        [
            "User-agent: *\n",
            "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n",
            "Allow: /\n",
            "\n",
            "Sitemap: https://awesome-python.com/sitemap.xml\n",
        ]
    )
    robots = (output_dir / "robots.txt").read_text(encoding="utf-8")
    assert robots == expected_robots

    ns = {"sitemap": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    urlset = ET.parse(output_dir / "sitemap.xml").getroot()
    locs = [node.text for node in urlset.findall("sitemap:url/sitemap:loc", ns)]
    lastmods = [node.text for node in urlset.findall("sitemap:url/sitemap:lastmod", ns)]

    assert urlset.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset"
    assert locs == ["https://awesome-python.com/"]
    assert len(lastmods) == 1
    assert start_date <= date.fromisoformat(lastmods[0]) <= end_date
    for loc in locs:
        assert loc.startswith("https://awesome-python.com/")
    for loc in locs:
        assert "?" not in loc
| 202 | + |
def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path):
    """Check index.md / llms.txt are emitted, linked, and free of sponsor content."""
    readme = textwrap.dedent("""\
        # Awesome Python

        Intro.

        # **Sponsors**

        - **[Sponsor](https://sponsor.example.com)**: Sponsored tool.

        > Become a sponsor: [Sponsor us](SPONSORSHIP.md).

        # Categories

        **Tools**

        - [Widgets](#widgets)

        ---

        ## Widgets

        - [w1](https://example.com) - A widget.

        # Contributing

        Help!
    """)
    readme_path = tmp_path / "README.md"
    readme_path.write_text(readme, encoding="utf-8")
    self._copy_real_templates(tmp_path)

    build(tmp_path)

    output_dir = tmp_path / "website" / "output"
    index_html, index_md, llms_txt = (
        (output_dir / filename).read_text(encoding="utf-8")
        for filename in ("index.html", "index.md", "llms.txt")
    )

    # The HTML page must advertise the markdown alternate.
    assert '<link rel="alternate" type="text/markdown" href="/index.md" />' in index_html
    assert index_md == llms_txt
    assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories")

    # No trace of the sponsors section may survive in the markdown output.
    for forbidden in ("# **Sponsors**", "Sponsor", "SPONSORSHIP.md"):
        assert forbidden not in index_md

    # Regular category content is still present.
    for required in ("## Widgets", "- [w1](https://example.com) - A widget."):
        assert required in index_md
| 249 | + |
117 | 250 | def test_build_cleans_stale_output(self, tmp_path): |
118 | 251 | readme = textwrap.dedent("""\ |
119 | 252 | # T |
@@ -235,6 +368,40 @@ def test_build_with_stars_sorts_by_stars(self, tmp_path): |
235 | 368 | # Expand content present |
236 | 369 | assert "expand-content" in html |
237 | 370 |
|
def test_index_contains_aligned_homepage_metadata(self, tmp_path):
    """Build from the real README/templates and check the <head> metadata agrees."""
    source_readme = Path(__file__).parents[2] / "README.md"
    readme = source_readme.read_text(encoding="utf-8")
    (tmp_path / "README.md").write_text(readme, encoding="utf-8")
    self._copy_real_templates(tmp_path)

    build(tmp_path)

    # Recompute the counts that the expected description is built from.
    parsed_groups = parse_readme(readme)
    categories = [cat for group in parsed_groups for cat in group["categories"]]
    entries = extract_entries(categories, parsed_groups)

    index_path = tmp_path / "website" / "output" / "index.html"
    html = index_path.read_text(encoding="utf-8")
    parser = HeadMetadataParser()
    parser.feed(html)

    expected_title = "Awesome Python"
    expected_description = (
        "An opinionated guide to the best Python frameworks, libraries, and tools. "
        f"Explore {len(entries)} curated projects across {len(categories)} categories, "
        "from AI and agents to data science and web development."
    )
    expected_url = "https://awesome-python.com/"
    expected_image = "https://awesome-python.com/static/og-image.png"

    assert parser.title_count == 1
    assert parser.title.strip() == expected_title
    assert parser.meta_by_name["description"] == expected_description
    assert parser.links_by_rel["canonical"] == expected_url

    expected_open_graph = {
        "og:type": "website",
        "og:title": expected_title,
        "og:description": expected_description,
        "og:image": expected_image,
        "og:url": expected_url,
    }
    for prop, value in expected_open_graph.items():
        assert parser.meta_by_property[prop] == value

    expected_twitter = {
        "twitter:card": "summary_large_image",
        "twitter:title": expected_title,
        "twitter:description": expected_description,
        "twitter:image": expected_image,
    }
    for name, value in expected_twitter.items():
        assert parser.meta_by_name[name] == value

    # NOTE(review): the whitespace after "\n" must match the template's
    # indentation exactly -- confirm against the real template.
    assert "<head>\n <meta charset" in html
| 404 | + |
238 | 405 |
|
239 | 406 | # --------------------------------------------------------------------------- |
240 | 407 | # extract_github_repo |
|
0 commit comments