Commit fb56b53

feat: move away from Wikipedia to UNESCO for a crawling exercise
1 parent 57ab485 commit fb56b53

4 files changed

Lines changed: 45 additions & 49 deletions

sources/academy/webscraping/scraping_basics_python/10_crawling.md

Lines changed: 9 additions & 12 deletions
@@ -7,7 +7,7 @@ slug: /scraping-basics-python/crawling
 
 import CodeBlock from '@theme/CodeBlock';
 import Exercises from '../scraping_basics/_exercises.mdx';
-import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.py';
+import UnescoWhsCountsExercise from '!!raw-loader!roa-loader!./exercises/unesco_whs_counts.py';
 import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.py';
 
 **In this lesson, we'll follow links to individual product pages. We'll use HTTPX to download them and BeautifulSoup to process them.**
@@ -183,24 +183,21 @@ In the next lesson, we'll scrape the product detail pages so that each product v
 
 <Exercises />
 
-### Scrape calling codes of African countries
+### Scrape UNESCO World Heritage Sites
 
-Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
+Scrape links to the detail pages of all UNESCO member states. Follow each link and extract the count of World Heritage Sites. Print the URL and the count for each country. Start with this URL:
 
 ```text
-https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
+https://www.unesco.org/en/countries
 ```
 
 Your program should print the following:
 
 ```text
-https://en.wikipedia.org/wiki/Algeria +213
-https://en.wikipedia.org/wiki/Angola +244
-https://en.wikipedia.org/wiki/Benin +229
-https://en.wikipedia.org/wiki/Botswana +267
-https://en.wikipedia.org/wiki/Burkina_Faso +226
-https://en.wikipedia.org/wiki/Burundi None
-https://en.wikipedia.org/wiki/Cameroon +237
+https://www.unesco.org/en/countries/af 2
+https://www.unesco.org/en/countries/al 4
+https://www.unesco.org/en/countries/dz 7
+https://www.unesco.org/en/countries/ad 1
 ...
 ```
 
@@ -212,7 +209,7 @@ Locating cells in tables is sometimes easier if you know how to [navigate up](ht
 
 <details>
 <summary>Solution</summary>
-<CodeBlock language="py">{WikipediaCallingCodesExercise.code}</CodeBlock>
+<CodeBlock language="py">{UnescoWhsCountsExercise.code}</CodeBlock>
 </details>
 
 ### Scrape authors of F1 news articles
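
The hint kept in the surrounding context ("Locating cells in tables is sometimes easier if you know how to navigate up") refers to BeautifulSoup's parent traversal. A minimal sketch of the technique, not part of the commit and using made-up HTML rather than anything from the exercise pages:

```py
from bs4 import BeautifulSoup

# Made-up snippet of an info table, just to illustrate the technique
html = """
<table>
  <tr><th>Calling code</th><td>+213</td></tr>
</table>
"""
soup = BeautifulSoup(html, "html.parser")

header = soup.find("th", string="Calling code")  # locate a cell by its text
row = header.find_parent("tr")                   # navigate up to the enclosing row
print(row.select_one("td").text)                 # +213
```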

sources/academy/webscraping/scraping_basics_python/exercises/test.bats

Lines changed: 5 additions & 5 deletions
@@ -93,12 +93,12 @@ teardown() {
     [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
-@test "prints Wikipedia calling codes" {
-    run uv run --with=httpx --with=beautifulsoup4 python wikipedia_calling_codes.py
+@test "prints counts of UNESCO WHS" {
+    run uv run --with=httpx --with=beautifulsoup4 python unesco_whs_counts.py
 
-    [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]]
-    [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]]
-    [[ $(echo "$output" | wc -l) -gt 5 ]]
+    [[ "$output" == *$'https://www.unesco.org/en/countries/af 2\n'* ]]
+    [[ "$output" == *$'https://www.unesco.org/en/countries/bs 0\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 50 ]]
 }
 
 @test "lists Guardian F1 authors" {

sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import httpx
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+
+def download(url: str) -> BeautifulSoup:
+    response = httpx.get(url)
+    response.raise_for_status()
+    return BeautifulSoup(response.text, "html.parser")
+
+
+def parse_whc_count(soup: BeautifulSoup) -> int:
+    for card in soup.select(".card-body"):
+        card_title = card.select_one(".card-title").text
+        if "World Heritage Sites" in card_title:
+            return int(card.select_one(".card-number").text.strip())
+    return 0
+
+
+listing_url = "https://www.unesco.org/en/countries"
+listing_soup = download(listing_url)
+
+for country in listing_soup.select(".node--type-country"):
+    link = country.select_one("a")
+    if not link or 'href' not in link.attrs:
+        continue
+
+    country_url = urljoin(listing_url, link["href"])
+    country_soup = download(country_url)
+    whs_count = parse_whc_count(country_soup)
+    print(country_url, whs_count)
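
One detail worth calling out in the new solution: `urljoin` resolves the hrefs scraped from the listing against the listing URL, so both relative and absolute links come out as full URLs. A quick illustration, with href values made up for the example:

```py
from urllib.parse import urljoin

listing_url = "https://www.unesco.org/en/countries"

# A root-relative href, as country links on the listing are likely to be:
print(urljoin(listing_url, "/en/countries/af"))
# https://www.unesco.org/en/countries/af

# An already absolute href passes through unchanged:
print(urljoin(listing_url, "https://www.unesco.org/en/countries/al"))
# https://www.unesco.org/en/countries/al
```

As the updated test shows, the script runs without any project setup via `uv run --with=httpx --with=beautifulsoup4 python unesco_whs_counts.py`.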

sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py

Lines changed: 0 additions & 32 deletions
This file was deleted.
