Skip to content

Commit b56a4fc

Browse files
committed
feat: move away from Wikipedia to UNESCO for a crawling exercise, JavaScript
1 parent fb56b53 commit b56a4fc

4 files changed

Lines changed: 57 additions & 19 deletions

File tree

sources/academy/webscraping/scraping_basics_javascript/10_crawling.md

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/crawling
88
import CodeBlock from '@theme/CodeBlock';
99
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
1010
import Exercises from '../scraping_basics/_exercises.mdx';
11-
import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.mjs';
11+
import UnescoWhsCountsExercise from '!!raw-loader!roa-loader!./exercises/unesco_whs_counts.mjs';
1212
import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.mjs';
1313

1414
<LegacyJsCourseAdmonition />
@@ -210,24 +210,21 @@ In the next lesson, we'll scrape the product detail pages so that each product v
210210

211211
<Exercises />
212212

213-
### Scrape calling codes of African countries
213+
### Scrape UNESCO World Heritage Sites
214214

215-
Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
215+
Scrape links to the detail pages of all UNESCO member countries. Follow each link and extract the number of World Heritage Sites from the page. Print the URL and the count for each country. Start with this URL:
216216

217217
```text
218-
https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
218+
https://www.unesco.org/en/countries
219219
```
220220

221221
Your program should print the following:
222222

223223
```text
224-
https://en.wikipedia.org/wiki/Algeria +213
225-
https://en.wikipedia.org/wiki/Angola +244
226-
https://en.wikipedia.org/wiki/Benin +229
227-
https://en.wikipedia.org/wiki/Botswana +267
228-
https://en.wikipedia.org/wiki/Burkina_Faso +226
229-
https://en.wikipedia.org/wiki/Burundi null
230-
https://en.wikipedia.org/wiki/Cameroon +237
224+
https://www.unesco.org/en/countries/af 2
225+
https://www.unesco.org/en/countries/al 4
226+
https://www.unesco.org/en/countries/dz 7
227+
https://www.unesco.org/en/countries/ad 1
231228
...
232229
```
233230

@@ -239,7 +236,7 @@ Locating cells in tables is sometimes easier if you know how to [filter](https:/
239236

240237
<details>
241238
<summary>Solution</summary>
242-
<CodeBlock language="js">{WikipediaCallingCodesExercise.code}</CodeBlock>
239+
<CodeBlock language="js">{UnescoWhsCountsExercise.code}</CodeBlock>
243240
</details>
244241

245242
### Scrape authors of F1 news articles

sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ teardown_file() {
9090
run node unesco_links.mjs
9191

9292
[[ "$output" == *$'https://www.unesco.org/en/countries/af\nhttps://www.unesco.org/en/countries/al\n'* ]]
93-
[[ $(echo "$output" | wc -l) -gt 50 ]]
93+
[[ $(echo "$output" | wc -l) -gt 5 ]]
9494
}
9595

9696
@test "lists Guardian F1 article links" {
@@ -100,11 +100,11 @@ teardown_file() {
100100
[[ $(echo "$output" | wc -l) -gt 5 ]]
101101
}
102102

103-
@test "prints Wikipedia calling codes" {
104-
run node wikipedia_calling_codes.mjs
103+
@test "prints counts of UNESCO WHS" {
104+
run node unesco_whs_counts.mjs
105105

106-
[[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]]
107-
[[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]]
106+
[[ "$output" == *$'https://www.unesco.org/en/countries/af 2\n'* ]]
107+
[[ "$output" == *$'https://www.unesco.org/en/countries/bs 0\n'* ]]
108108
[[ $(echo "$output" | wc -l) -gt 5 ]]
109109
}
110110

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import * as cheerio from 'cheerio';
2+
3+
// Download `url` and parse its HTML body into a Cheerio document.
// Fails fast with an Error on any non-2xx HTTP status.
async function download(url) {
  const res = await fetch(url);
  if (res.ok) {
    return cheerio.load(await res.text());
  }
  throw new Error(`HTTP ${res.status}`);
}
11+
12+
// Locate the "World Heritage Sites" card on a country detail page and
// return its count as an integer. Returns 0 when the card is missing or
// when its number cannot be parsed (e.g. empty text), so callers always
// get a printable number rather than NaN.
function parseWhsCount($) {
  for (const element of $('.card-body').toArray()) {
    const $card = $(element);
    const title = $card.find('.card-title').text();

    if (title.includes('World Heritage Sites')) {
      const number = $card.find('.card-number').text().trim();
      const count = Number.parseInt(number, 10);
      // Guard against NaN: an empty or malformed card number would
      // otherwise leak "NaN" into the program's printed output.
      return Number.isNaN(count) ? 0 : count;
    }
  }
  return 0;
}
24+
25+
// Crawl the UNESCO countries listing, visit each country's detail page,
// and print "<detail URL> <World Heritage Site count>" per country.
const listingUrl = 'https://www.unesco.org/en/countries';
const $listing = await download(listingUrl);

for (const element of $listing('.node--type-country').toArray()) {
  const href = $listing(element).find('a').first().attr('href');
  if (!href) {
    continue;
  }

  // Resolve possibly-relative links against the listing page URL.
  const countryUrl = new URL(href, listingUrl).href;
  const whsCount = parseWhsCount(await download(countryUrl));
  console.log(`${countryUrl} ${whsCount}`);
}

sources/academy/webscraping/scraping_basics_python/exercises/test.bats

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ teardown() {
8383
run uv run --with=httpx --with=beautifulsoup4 python unesco_links.py
8484

8585
[[ "$output" == *$'https://www.unesco.org/en/countries/af\nhttps://www.unesco.org/en/countries/al\n'* ]]
86-
[[ $(echo "$output" | wc -l) -gt 50 ]]
86+
[[ $(echo "$output" | wc -l) -gt 5 ]]
8787
}
8888

8989
@test "lists Guardian F1 article links" {
@@ -98,7 +98,7 @@ teardown() {
9898

9999
[[ "$output" == *$'https://www.unesco.org/en/countries/af 2\n'* ]]
100100
[[ "$output" == *$'https://www.unesco.org/en/countries/bs 0\n'* ]]
101-
[[ $(echo "$output" | wc -l) -gt 50 ]]
101+
[[ $(echo "$output" | wc -l) -gt 5 ]]
102102
}
103103

104104
@test "lists Guardian F1 authors" {

0 commit comments

Comments
 (0)