Skip to content

Commit d25264e

Browse files
tifa365timk-nut
authored
[HB] Replace Bremen HTML scraper with INSPIRE shapefile (#221)
Replaces the Bremen HTML scraper with a more robust INSPIRE shapefile-based implementation that includes geolocation support. --------- Co-authored-by: tim <tfangmeyer@gmail.com> Co-authored-by: Knut Hühne <knut@k-nut.eu>
1 parent ee5a05a commit d25264e

5 files changed

Lines changed: 183 additions & 57 deletions

File tree

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ In details, the IDs are sourced as follows:
2626
|BY| id from the WFS service | `BY-SCHUL_SCHULSTANDORTEGRUNDSCHULEN_2acb7d31-915d-40a9-adcf-27b38251fa48` |❓ unlikely (although we reached out to ask for canonical IDs to be published)|
2727
|BE| Field `bsn` (Berliner Schulnummer) from the WFS Service | `BE-02K10` |✅ likely|
2828
|BB| Field `schul_nr` (Schulnummer) from the WFS Service | `BB-111430` |✅ likely|
29-
|HB| `id` URL query param on the school's detail page (identical to the SNR (Schulnummer) from the overview page) | `HB-937` |✅ likely|
29+
|HB| Field `snr_txt` (Schulnummer) from the INSPIRE shapefile - official 3-digit ID used in Bremen materials | `HB-002` |✅ likely|
3030
|HH| Field `schul_id` From the WFS Service | `HH-7910-0` |✅ likely|
3131
|HE| `school_no` URL query param of the school's details page (identical to the Dienststellennummer) | `HE-4024` |✅ likely|
3232
|MV| Field `dstnr` from the WFS | `MV-75130302` |✅ likely|
@@ -47,7 +47,7 @@ When available, we try to use the geolocations provided by the data publishers.
4747
| BY | ✅ Yes | WFS |
4848
| BE | ✅ Yes | WFS |
4949
| BB | ✅ Yes | WFS |
50-
| HB | ❌ No | - |
50+
| HB | ✅ Yes | INSPIRE shapefile (converted from EPSG:25832)|
5151
| HH | ✅ Yes | WFS |
5252
| HE | ⚠️ Partial (~90%) | Extracted from OSM on detail pages. The schools without coordinates are schools with placeholder coordinates that are filtered out and schools with no map data at all. |
5353
| MV | ✅ Yes | WFS |

jedeschule/spiders/bremen.py

Lines changed: 45 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,62 @@
1-
# -*- coding: utf-8 -*-
2-
import scrapy
1+
import io
32
import re
3+
import zipfile
44
from scrapy import Item
5+
import shapefile
6+
from pyproj import Transformer
57

68
from jedeschule.items import School
79
from jedeschule.spiders.school_spider import SchoolSpider
810

911

1012
class BremenSpider(SchoolSpider):
    """Scrape Bremen and Bremerhaven schools from the INSPIRE shapefile download.

    Downloads one ZIP containing two point shapefiles (one per city),
    yields the raw attribute records, and normalizes them into School items.
    """

    name = "bremen"
    ZIP_URL = "https://gdi2.geo.bremen.de/inspire/download/Schulstandorte/data/Schulstandorte_HB_BHV.zip"

    start_urls = [ZIP_URL]

    # Built once: constructing a pyproj Transformer is expensive, and the CRS
    # pair (EPSG:25832 ETRS89/UTM 32N -> EPSG:4326 WGS84) is fixed for this
    # data set. always_xy=True keeps (easting, northing) -> (lon, lat) order.
    _TO_WGS84 = Transformer.from_crs(25832, 4326, always_xy=True)

    def parse(self, response):
        """Yield one raw attribute dict per school record.

        Reads both shapefiles directly from the ZIP body in memory
        (no extractall, nothing touches disk).
        """
        with zipfile.ZipFile(io.BytesIO(response.body), "r") as zf:
            for stem in ("gdi_schulen_hb", "gdi_schulen_bhv"):
                shp_bytes = io.BytesIO(zf.read(f"{stem}.shp"))
                shx_bytes = io.BytesIO(zf.read(f"{stem}.shx"))
                dbf_bytes = io.BytesIO(zf.read(f"{stem}.dbf"))

                # Detect encoding from .cpg if present; otherwise fall back to
                # UTF-8 (pyshp's default). Passing encoding=None would break
                # pyshp's text decoding for records without a .cpg sidecar.
                encoding = "utf-8"
                try:
                    cpg = zf.read(f"{stem}.cpg").decode("ascii", "ignore").strip()
                    encoding = cpg or encoding
                except KeyError:
                    pass

                sf = shapefile.Reader(
                    shp=shp_bytes, shx=shx_bytes, dbf=dbf_bytes, encoding=encoding
                )

                for record in sf.records():
                    yield record.as_dict()

    @staticmethod
    def normalize(item: Item) -> School:
        """Map a raw shapefile record to a School item.

        Raises:
            ValueError: if the record carries no valid 3-digit school number
                (SNR_TXT), which is the basis of the stable `HB-xxx` id.
        """
        # Field-name case can vary between shapefiles/tools, so look
        # everything up case-insensitively — including the coordinate
        # columns (the previous code read x_etrs/y_etrs from the raw item
        # only, which would miss an uppercase X_ETRS).
        item_lower = {k.lower(): v for k, v in item.items()}
        snr_txt = (item_lower.get("snr_txt") or "").strip()
        # fullmatch on "" fails, so a separate emptiness check is unnecessary.
        if not re.fullmatch(r"\d{3}", snr_txt):
            raise ValueError(f"Invalid or missing SNR_TXT: '{snr_txt}'")

        # Source coordinates are EPSG:25832; convert to WGS84 lon/lat.
        lon, lat = BremenSpider._TO_WGS84.transform(
            item_lower.get("x_etrs"), item_lower.get("y_etrs")
        )

        return School(
            id=f"HB-{snr_txt}",
            name=(item_lower.get("nam") or "").strip(),
            address=(item_lower.get("strasse") or "").strip(),
            zip=(item_lower.get("plz") or "").strip(),
            city=(item_lower.get("ort") or "").strip(),
            school_type=(item_lower.get("schulart_2") or "").strip(),
            provider=(item_lower.get("traegernam") or "").strip(),
            latitude=lat,
            longitude=lon,
        )

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ dependencies = [
1010
"openpyxl>=3.1.5",
1111
"psycopg2==2.9.11",
1212
"pyproj==3.7.2",
13+
"pyshp>=3.0.2.post1",
1314
"requests==2.33.0",
1415
"scrapy==2.14.2",
1516
"sqlalchemy==2.0.49",

test/test_bremen.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import tempfile
2+
import unittest
3+
import zipfile
4+
from pathlib import Path
5+
6+
import shapefile
7+
from pyproj import Transformer
8+
from scrapy.http import Response
9+
10+
from jedeschule.spiders.bremen import BremenSpider
11+
12+
13+
class TestBremenSpider(unittest.TestCase):
    """End-to-end test: build a ZIP with two minimal shapefiles in memory,
    feed it to the spider, and check the normalized School items."""

    def test_parse(self):
        to_utm = Transformer.from_crs(4326, 25832, always_xy=True)

        # One fixture row per shapefile stem, keyed exactly like the
        # members the spider expects inside the ZIP.
        fixtures = {
            "gdi_schulen_hb": [
                {
                    "SNR_TXT": "002",
                    "NAM": "Schule an der Admiralstrasse",
                    "STRASSE": "Winterstr. 20",
                    "PLZ": "28215",
                    "ORT": "Bremen",
                    "ORTSTEILNA": "Findorff",
                    "TRAEGERNAM": "Stadt Bremen",
                    "SCHULART_2": "Grundschule",
                    "coords": to_utm.transform(8.807362345274395, 53.09026336124759),
                }
            ],
            "gdi_schulen_bhv": [
                {
                    "SNR_TXT": "150",
                    "NAM": "Amerikanische Schule",
                    "STRASSE": "Kleiner Blink 8",
                    "PLZ": "27580",
                    "ORT": "Bremerhaven",
                    "ORTSTEILNA": "Lehe",
                    "TRAEGERNAM": "Stadt Bremerhaven",
                    "SCHULART_2": "Grundschule",
                    "coords": to_utm.transform(8.587648, 53.579061),
                }
            ],
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            for stem, rows in fixtures.items():
                self._write_shapefile(workdir / stem, rows)
            # The ZIP bytes are fully in memory afterwards, so the temp
            # directory may be cleaned up before the spider runs.
            zip_body = self._build_zip(workdir)

        response = Response(url=BremenSpider.ZIP_URL, body=zip_body)

        spider = BremenSpider()
        raw_items = list(spider.parse(response))

        self.assertEqual(len(raw_items), 2)

        first_school = spider.normalize(raw_items[0])
        second_school = spider.normalize(raw_items[1])

        self.assertEqual(first_school["id"], "HB-002")
        self.assertEqual(first_school["name"], "Schule an der Admiralstrasse")
        self.assertEqual(first_school["address"], "Winterstr. 20")
        self.assertEqual(first_school["zip"], "28215")
        self.assertEqual(first_school["city"], "Bremen")
        self.assertEqual(first_school["provider"], "Stadt Bremen")
        self.assertEqual(first_school["school_type"], "Grundschule")
        self.assertAlmostEqual(first_school["latitude"], 53.09026336124759)
        self.assertAlmostEqual(first_school["longitude"], 8.807362345274395)

        self.assertEqual(second_school["id"], "HB-150")
        self.assertEqual(second_school["city"], "Bremerhaven")
        self.assertEqual(second_school["provider"], "Stadt Bremerhaven")
        self.assertEqual(second_school["school_type"], "Grundschule")
        self.assertAlmostEqual(second_school["latitude"], 53.579061, places=5)
        self.assertAlmostEqual(second_school["longitude"], 8.587648, places=5)

    def _write_shapefile(self, base_path: Path, rows: list[dict]):
        """Write a minimal point shapefile (.shp/.shx/.dbf/.cpg) at base_path."""
        attribute_names = (
            "SNR_TXT",
            "NAM",
            "STRASSE",
            "PLZ",
            "ORT",
            "ORTSTEILNA",
            "TRAEGERNAM",
            "SCHULART_2",
        )
        with shapefile.Writer(str(base_path), shapeType=shapefile.POINT) as writer:
            for attribute_name in attribute_names:
                writer.field(attribute_name, "C")
            writer.field("x_etrs", "C")
            writer.field("y_etrs", "C")

            for row in rows:
                easting, northing = row["coords"]
                writer.point(easting, northing)
                writer.record(
                    *(row[attribute_name] for attribute_name in attribute_names),
                    easting,
                    northing,
                )

        base_path.with_suffix(".cpg").write_text("UTF-8", encoding="ascii")

    def _build_zip(self, tmp_path: Path) -> bytes:
        """Zip up both shapefiles' sidecar files and return the archive bytes."""
        archive = tmp_path / "bremen.zip"
        with zipfile.ZipFile(archive, "w") as zf:
            for stem in ("gdi_schulen_hb", "gdi_schulen_bhv"):
                for suffix in (".shp", ".shx", ".dbf", ".cpg"):
                    member = tmp_path / f"{stem}{suffix}"
                    zf.write(member, arcname=member.name)
        return archive.read_bytes()
121+
122+
123+
# Allow running this test module directly: `python test_bremen.py`.
if __name__ == "__main__":
    unittest.main()

uv.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)