Skip to content

Commit d25264e

Browse files
tifa365timk-nut
authored
[HB] Replace Bremen HTML scraper with INSPIRE shapefile (#221)
Replaces the Bremen HTML scraper with a more robust INSPIRE shapefile-based implementation that includes geolocation support. --------- Co-authored-by: tim <tfangmeyer@gmail.com> Co-authored-by: Knut Hühne <knut@k-nut.eu>
1 parent ee5a05a commit d25264e

5 files changed

Lines changed: 183 additions & 57 deletions

File tree

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ In details, the IDs are sourced as follows:
2626
|BY| id from the WFS service | `BY-SCHUL_SCHULSTANDORTEGRUNDSCHULEN_2acb7d31-915d-40a9-adcf-27b38251fa48` |❓ unlikely (although we reached out to ask for canonical IDs to be published)|
2727
|BE| Field `bsn` (Berliner Schulnummer) from the WFS Service | `BE-02K10` |✅ likely|
2828
|BB| Field `schul_nr` (Schulnummer) from the WFS Service | `BB-111430` |✅ likely|
29-
|HB| `id` URL query param on the school's detail page (identical to the SNR (Schulnummer) from the overview page) | `HB-937` |✅ likely|
29+
|HB| Field `snr_txt` (Schulnummer) from the INSPIRE shapefile - official 3-digit ID used in Bremen materials | `HB-002` |✅ likely|
3030
|HH| Field `schul_id` From the WFS Service | `HH-7910-0` |✅ likely|
3131
|HE| `school_no` URL query param of the school's details page (identical to the Dienststellennummer) | `HE-4024` |✅ likely|
3232
|MV| Field `dstnr` from the WFS | `MV-75130302` |✅ likely|
@@ -47,7 +47,7 @@ When available, we try to use the geolocations provided by the data publishers.
4747
| BY | ✅ Yes | WFS |
4848
| BE | ✅ Yes | WFS |
4949
| BB | ✅ Yes | WFS |
50-
| HB | ❌ No | - |
50+
| HB | ✅ Yes | INSPIRE shapefile (converted from EPSG:25832)|
5151
| HH | ✅ Yes | WFS |
5252
| HE | ⚠️ Partial (~90%) | Extracted from OSM on detail pages. The schools without coordinates are schools with placeholder coordinates that are filtered out and schools with no map data at all. |
5353
| MV | ✅ Yes | WFS |

jedeschule/spiders/bremen.py

Lines changed: 45 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,62 @@
1-
# -*- coding: utf-8 -*-
2-
import scrapy
1+
import io
32
import re
3+
import zipfile
44
from scrapy import Item
5+
import shapefile
6+
from pyproj import Transformer
57

68
from jedeschule.items import School
79
from jedeschule.spiders.school_spider import SchoolSpider
810

911

1012
class BremenSpider(SchoolSpider):
    """Scrape Bremen and Bremerhaven schools from the INSPIRE shapefile download.

    Downloads one ZIP containing two point shapefiles (one per city),
    yields the raw attribute records, and normalizes them into School items.
    """

    name = "bremen"
    ZIP_URL = "https://gdi2.geo.bremen.de/inspire/download/Schulstandorte/data/Schulstandorte_HB_BHV.zip"

    start_urls = [ZIP_URL]

    # Built once: constructing a pyproj Transformer is expensive, and the CRS
    # pair (EPSG:25832 ETRS89/UTM 32N -> EPSG:4326 WGS84) is fixed for this
    # data set. always_xy=True keeps (easting, northing) -> (lon, lat) order.
    _TO_WGS84 = Transformer.from_crs(25832, 4326, always_xy=True)

    def parse(self, response):
        """Yield one raw attribute dict per school record.

        Reads both shapefiles directly from the ZIP body in memory
        (no extractall, nothing touches disk).
        """
        with zipfile.ZipFile(io.BytesIO(response.body), "r") as zf:
            for stem in ("gdi_schulen_hb", "gdi_schulen_bhv"):
                shp_bytes = io.BytesIO(zf.read(f"{stem}.shp"))
                shx_bytes = io.BytesIO(zf.read(f"{stem}.shx"))
                dbf_bytes = io.BytesIO(zf.read(f"{stem}.dbf"))

                # Detect encoding from .cpg if present; otherwise fall back to
                # UTF-8 (pyshp's default). Passing encoding=None would break
                # pyshp's text decoding for records without a .cpg sidecar.
                encoding = "utf-8"
                try:
                    cpg = zf.read(f"{stem}.cpg").decode("ascii", "ignore").strip()
                    encoding = cpg or encoding
                except KeyError:
                    pass

                sf = shapefile.Reader(
                    shp=shp_bytes, shx=shx_bytes, dbf=dbf_bytes, encoding=encoding
                )

                for record in sf.records():
                    yield record.as_dict()

    @staticmethod
    def normalize(item: Item) -> School:
        """Map a raw shapefile record to a School item.

        Raises:
            ValueError: if the record carries no valid 3-digit school number
                (SNR_TXT), which is the basis of the stable `HB-xxx` id.
        """
        # Field-name case can vary between shapefiles/tools, so look
        # everything up case-insensitively — including the coordinate
        # columns (the previous code read x_etrs/y_etrs from the raw item
        # only, which would miss an uppercase X_ETRS).
        item_lower = {k.lower(): v for k, v in item.items()}
        snr_txt = (item_lower.get("snr_txt") or "").strip()
        # fullmatch on "" fails, so a separate emptiness check is unnecessary.
        if not re.fullmatch(r"\d{3}", snr_txt):
            raise ValueError(f"Invalid or missing SNR_TXT: '{snr_txt}'")

        # Source coordinates are EPSG:25832; convert to WGS84 lon/lat.
        lon, lat = BremenSpider._TO_WGS84.transform(
            item_lower.get("x_etrs"), item_lower.get("y_etrs")
        )

        return School(
            id=f"HB-{snr_txt}",
            name=(item_lower.get("nam") or "").strip(),
            address=(item_lower.get("strasse") or "").strip(),
            zip=(item_lower.get("plz") or "").strip(),
            city=(item_lower.get("ort") or "").strip(),
            school_type=(item_lower.get("schulart_2") or "").strip(),
            provider=(item_lower.get("traegernam") or "").strip(),
            latitude=lat,
            longitude=lon,
        )

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ dependencies = [
1010
"openpyxl>=3.1.5",
1111
"psycopg2==2.9.11",
1212
"pyproj==3.7.2",
13+
"pyshp>=3.0.2.post1",
1314
"requests==2.33.0",
1415
"scrapy==2.14.2",
1516
"sqlalchemy==2.0.49",

test/test_bremen.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import tempfile
2+
import unittest
3+
import zipfile
4+
from pathlib import Path
5+
6+
import shapefile
7+
from pyproj import Transformer
8+
from scrapy.http import Response
9+
10+
from jedeschule.spiders.bremen import BremenSpider
11+
12+
13+
class TestBremenSpider(unittest.TestCase):
    """End-to-end test: build a ZIP with two minimal shapefiles in memory,
    feed it to the spider, and check the normalized School items."""

    def test_parse(self):
        to_utm = Transformer.from_crs(4326, 25832, always_xy=True)

        # One fixture row per shapefile stem, keyed exactly like the
        # members the spider expects inside the ZIP.
        fixtures = {
            "gdi_schulen_hb": [
                {
                    "SNR_TXT": "002",
                    "NAM": "Schule an der Admiralstrasse",
                    "STRASSE": "Winterstr. 20",
                    "PLZ": "28215",
                    "ORT": "Bremen",
                    "ORTSTEILNA": "Findorff",
                    "TRAEGERNAM": "Stadt Bremen",
                    "SCHULART_2": "Grundschule",
                    "coords": to_utm.transform(8.807362345274395, 53.09026336124759),
                }
            ],
            "gdi_schulen_bhv": [
                {
                    "SNR_TXT": "150",
                    "NAM": "Amerikanische Schule",
                    "STRASSE": "Kleiner Blink 8",
                    "PLZ": "27580",
                    "ORT": "Bremerhaven",
                    "ORTSTEILNA": "Lehe",
                    "TRAEGERNAM": "Stadt Bremerhaven",
                    "SCHULART_2": "Grundschule",
                    "coords": to_utm.transform(8.587648, 53.579061),
                }
            ],
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            for stem, rows in fixtures.items():
                self._write_shapefile(workdir / stem, rows)
            # The ZIP bytes are fully in memory afterwards, so the temp
            # directory may be cleaned up before the spider runs.
            zip_body = self._build_zip(workdir)

        response = Response(url=BremenSpider.ZIP_URL, body=zip_body)

        spider = BremenSpider()
        raw_items = list(spider.parse(response))

        self.assertEqual(len(raw_items), 2)

        first_school = spider.normalize(raw_items[0])
        second_school = spider.normalize(raw_items[1])

        self.assertEqual(first_school["id"], "HB-002")
        self.assertEqual(first_school["name"], "Schule an der Admiralstrasse")
        self.assertEqual(first_school["address"], "Winterstr. 20")
        self.assertEqual(first_school["zip"], "28215")
        self.assertEqual(first_school["city"], "Bremen")
        self.assertEqual(first_school["provider"], "Stadt Bremen")
        self.assertEqual(first_school["school_type"], "Grundschule")
        self.assertAlmostEqual(first_school["latitude"], 53.09026336124759)
        self.assertAlmostEqual(first_school["longitude"], 8.807362345274395)

        self.assertEqual(second_school["id"], "HB-150")
        self.assertEqual(second_school["city"], "Bremerhaven")
        self.assertEqual(second_school["provider"], "Stadt Bremerhaven")
        self.assertEqual(second_school["school_type"], "Grundschule")
        self.assertAlmostEqual(second_school["latitude"], 53.579061, places=5)
        self.assertAlmostEqual(second_school["longitude"], 8.587648, places=5)

    def _write_shapefile(self, base_path: Path, rows: list[dict]):
        """Write a minimal point shapefile (.shp/.shx/.dbf/.cpg) at base_path."""
        attribute_names = (
            "SNR_TXT",
            "NAM",
            "STRASSE",
            "PLZ",
            "ORT",
            "ORTSTEILNA",
            "TRAEGERNAM",
            "SCHULART_2",
        )
        with shapefile.Writer(str(base_path), shapeType=shapefile.POINT) as writer:
            for attribute_name in attribute_names:
                writer.field(attribute_name, "C")
            writer.field("x_etrs", "C")
            writer.field("y_etrs", "C")

            for row in rows:
                easting, northing = row["coords"]
                writer.point(easting, northing)
                writer.record(
                    *(row[attribute_name] for attribute_name in attribute_names),
                    easting,
                    northing,
                )

        base_path.with_suffix(".cpg").write_text("UTF-8", encoding="ascii")

    def _build_zip(self, tmp_path: Path) -> bytes:
        """Zip up both shapefiles' sidecar files and return the archive bytes."""
        archive = tmp_path / "bremen.zip"
        with zipfile.ZipFile(archive, "w") as zf:
            for stem in ("gdi_schulen_hb", "gdi_schulen_bhv"):
                for suffix in (".shp", ".shx", ".dbf", ".cpg"):
                    member = tmp_path / f"{stem}{suffix}"
                    zf.write(member, arcname=member.name)
        return archive.read_bytes()
121+
122+
123+
# Allow running this test module directly: `python test_bremen.py`.
if __name__ == "__main__":
    unittest.main()

uv.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)