Commit fb56b53

feat: move away from Wikipedia to UNESCO for a crawling exercise
1 parent 57ab485 commit fb56b53

4 files changed

Lines changed: 45 additions & 49 deletions

sources/academy/webscraping/scraping_basics_python/10_crawling.md

Lines changed: 9 additions & 12 deletions
@@ -7,7 +7,7 @@ slug: /scraping-basics-python/crawling
 
 import CodeBlock from '@theme/CodeBlock';
 import Exercises from '../scraping_basics/_exercises.mdx';
-import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.py';
+import UnescoWhsCountsExercise from '!!raw-loader!roa-loader!./exercises/unesco_whs_counts.py';
 import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.py';
 
 **In this lesson, we'll follow links to individual product pages. We'll use HTTPX to download them and BeautifulSoup to process them.**
@@ -183,24 +183,21 @@ In the next lesson, we'll scrape the product detail pages so that each product v
 
 <Exercises />
 
-### Scrape calling codes of African countries
+### Scrape UNESCO World Heritage Sites
 
-Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
+Scrape links to the detail pages of all UNESCO member states. Follow each link and extract the count of World Heritage Sites. Print the URL and the count for each country. Start with this URL:
 
 ```text
-https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
+https://www.unesco.org/en/countries
 ```
 
 Your program should print the following:
 
 ```text
-https://en.wikipedia.org/wiki/Algeria +213
-https://en.wikipedia.org/wiki/Angola +244
-https://en.wikipedia.org/wiki/Benin +229
-https://en.wikipedia.org/wiki/Botswana +267
-https://en.wikipedia.org/wiki/Burkina_Faso +226
-https://en.wikipedia.org/wiki/Burundi None
-https://en.wikipedia.org/wiki/Cameroon +237
+https://www.unesco.org/en/countries/af 2
+https://www.unesco.org/en/countries/al 4
+https://www.unesco.org/en/countries/dz 7
+https://www.unesco.org/en/countries/ad 1
 ...
 ```
 
@@ -212,7 +209,7 @@ Locating cells in tables is sometimes easier if you know how to [navigate up](ht
 
 <details>
 <summary>Solution</summary>
-<CodeBlock language="py">{WikipediaCallingCodesExercise.code}</CodeBlock>
+<CodeBlock language="py">{UnescoWhsCountsExercise.code}</CodeBlock>
 </details>
 
 ### Scrape authors of F1 news articles
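
The hint kept in the surrounding context ("Locating cells in tables is sometimes easier if you know how to navigate up") refers to BeautifulSoup's parent traversal. A minimal sketch of the technique, not part of the commit and using made-up HTML rather than anything from the exercise pages:

```py
from bs4 import BeautifulSoup

# Made-up snippet of an info table, just to illustrate the technique
html = """
<table>
  <tr><th>Calling code</th><td>+213</td></tr>
</table>
"""
soup = BeautifulSoup(html, "html.parser")

header = soup.find("th", string="Calling code")  # locate a cell by its text
row = header.find_parent("tr")                   # navigate up to the enclosing row
print(row.select_one("td").text)                 # +213
```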

sources/academy/webscraping/scraping_basics_python/exercises/test.bats

Lines changed: 5 additions & 5 deletions
@@ -93,12 +93,12 @@ teardown() {
     [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
-@test "prints Wikipedia calling codes" {
-    run uv run --with=httpx --with=beautifulsoup4 python wikipedia_calling_codes.py
+@test "prints counts of UNESCO WHS" {
+    run uv run --with=httpx --with=beautifulsoup4 python unesco_whs_counts.py
 
-    [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]]
-    [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]]
-    [[ $(echo "$output" | wc -l) -gt 5 ]]
+    [[ "$output" == *$'https://www.unesco.org/en/countries/af 2\n'* ]]
+    [[ "$output" == *$'https://www.unesco.org/en/countries/bs 0\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 50 ]]
 }
 
 @test "lists Guardian F1 authors" {

sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import httpx
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+
+def download(url: str) -> BeautifulSoup:
+    response = httpx.get(url)
+    response.raise_for_status()
+    return BeautifulSoup(response.text, "html.parser")
+
+
+def parse_whc_count(soup: BeautifulSoup) -> int:
+    for card in soup.select(".card-body"):
+        card_title = card.select_one(".card-title").text
+        if "World Heritage Sites" in card_title:
+            return int(card.select_one(".card-number").text.strip())
+    return 0
+
+
+listing_url = "https://www.unesco.org/en/countries"
+listing_soup = download(listing_url)
+
+for country in listing_soup.select(".node--type-country"):
+    link = country.select_one("a")
+    if not link or 'href' not in link.attrs:
+        continue
+
+    country_url = urljoin(listing_url, link["href"])
+    country_soup = download(country_url)
+    whs_count = parse_whc_count(country_soup)
+    print(country_url, whs_count)
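
One detail worth calling out in the new solution: `urljoin` resolves the hrefs scraped from the listing against the listing URL, so both relative and absolute links come out as full URLs. A quick illustration, with href values made up for the example:

```py
from urllib.parse import urljoin

listing_url = "https://www.unesco.org/en/countries"

# A root-relative href, as country links on the listing are likely to be:
print(urljoin(listing_url, "/en/countries/af"))
# https://www.unesco.org/en/countries/af

# An already absolute href passes through unchanged:
print(urljoin(listing_url, "https://www.unesco.org/en/countries/al"))
# https://www.unesco.org/en/countries/al
```

As the updated test shows, the script runs without any project setup via `uv run --with=httpx --with=beautifulsoup4 python unesco_whs_counts.py`.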

sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py

Lines changed: 0 additions & 32 deletions
This file was deleted.
