diff --git a/alembic/versions/e6c0d3f4a5b6_add_state_key_to_schools.py b/alembic/versions/e6c0d3f4a5b6_add_state_key_to_schools.py new file mode 100644 index 0000000..120d951 --- /dev/null +++ b/alembic/versions/e6c0d3f4a5b6_add_state_key_to_schools.py @@ -0,0 +1,35 @@ +"""add state_key to schools + +Revision ID: e6c0d3f4a5b6 +Revises: c4a8f1b2d3e4 +Create Date: 2026-04-17 + +Land code (ISO 3166-2:DE without DE- prefix) set by each spider's ``state_key`` and stored on insert/update. +Optional backfill for existing rows: first segment before ``-`` when it is exactly two characters. +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy import text + + +revision = "e6c0d3f4a5b6" +down_revision = "c4a8f1b2d3e4" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column("schools", sa.Column("state_key", sa.String(), nullable=True)) + op.execute( + text( + "UPDATE schools SET state_key = split_part(id, '-', 1) " + "WHERE state_key IS NULL " + "AND strpos(id, '-') > 0 " + "AND length(split_part(id, '-', 1)) = 2" + ) + ) + + +def downgrade(): + op.drop_column("schools", "state_key") diff --git a/jedeschule/items.py b/jedeschule/items.py index 38f80b9..b455455 100644 --- a/jedeschule/items.py +++ b/jedeschule/items.py @@ -11,6 +11,7 @@ class School(scrapy.Item): name = scrapy.Field() id = scrapy.Field() + state_key = scrapy.Field() address = scrapy.Field() address2 = scrapy.Field() zip = scrapy.Field() diff --git a/jedeschule/pipelines/db_pipeline.py b/jedeschule/pipelines/db_pipeline.py index c1cc0f7..405e5db 100644 --- a/jedeschule/pipelines/db_pipeline.py +++ b/jedeschule/pipelines/db_pipeline.py @@ -28,6 +28,7 @@ def get_session(): class School(Base): __tablename__ = "schools" id = Column(String, primary_key=True) + state_key = Column(String, nullable=True) name = Column(String) address = Column(String) address2 = Column(String) diff --git a/jedeschule/pipelines/school_pipeline.py b/jedeschule/pipelines/school_pipeline.py index 
658158e..30ad295 100644 --- a/jedeschule/pipelines/school_pipeline.py +++ b/jedeschule/pipelines/school_pipeline.py @@ -15,4 +15,12 @@ class SchoolPipelineItem: class SchoolPipeline(object): def process_item(self, item, spider: SchoolSpider) -> SchoolPipelineItem: - school = spider.normalize(item) + # Validate state_key before normalize(): spiders build the id prefix from it. + sk = spider.state_key + if not isinstance(sk, str) or not sk.strip(): + raise ValueError( + f"Spider {spider.name!r} must set a non-empty string state_key " + f"(ISO 3166-2:DE code without DE- prefix)" + ) + school = spider.normalize(item) + school["state_key"] = sk.strip() return SchoolPipelineItem(info=school, item=item) diff --git a/jedeschule/spiders/baden_wuerttemberg.py b/jedeschule/spiders/baden_wuerttemberg.py index 9e6cab2..c318530 100644 --- a/jedeschule/spiders/baden_wuerttemberg.py +++ b/jedeschule/spiders/baden_wuerttemberg.py @@ -2,8 +2,8 @@ import scrapy from scrapy import Item -from jedeschule.spiders.school_spider import SchoolSpider from jedeschule.items import School +from jedeschule.spiders.school_spider import SchoolSpider # Pattern to extract DISCH (8-digit school ID) from Baden-Württemberg email addresses @@ -35,6 +35,7 @@ def extract_disch(email: str | None) -> str | None: class BadenWuerttembergSpider(SchoolSpider): name = "baden-wuerttemberg" + state_key = "BW" start_urls = [ "https://gis.kultus-bw.de/geoserver/us-govserv/ows?" 
@@ -134,12 +135,12 @@ def parse(self, response): yield item - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: # Prefer DISCH (stable government ID) over UUID when available disch = item.get("disch") uuid = item.get("uuid") - school_id = f"BW-{disch}" if disch else f"BW-UUID-{uuid}" + tail = disch if disch else f"UUID-{uuid}" + school_id = self.make_school_id(tail) return School( id=school_id, diff --git a/jedeschule/spiders/bayern.py b/jedeschule/spiders/bayern.py index e6c3404..2fa24d3 100644 --- a/jedeschule/spiders/bayern.py +++ b/jedeschule/spiders/bayern.py @@ -7,6 +7,7 @@ class BayernSpider(SchoolSpider): name = "bayern" + state_key = "BY" start_urls = [ "https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?" "SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&srsname=EPSG:4326&typename=" @@ -47,15 +48,14 @@ def parse(self, response, **kwargs): yield data_elem - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: return School( name=item.get("schulname"), address=item.get("strasse"), city=item.get("ort"), school_type=item.get("schulart"), zip=item.get("postleitzahl"), - id="BY-{}".format(item.get("id")), + id=self.make_school_id("{}".format(item.get("id"))), latitude=item.get("lat"), longitude=item.get("lon"), ) diff --git a/jedeschule/spiders/berlin.py b/jedeschule/spiders/berlin.py index cda1111..f2a0aee 100644 --- a/jedeschule/spiders/berlin.py +++ b/jedeschule/spiders/berlin.py @@ -7,6 +7,7 @@ class BerlinSpider(SchoolSpider): name = "berlin" + state_key = "BE" start_urls = [ "https://gdi.berlin.de/services/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&srsname=EPSG:4326" "&typename=fis:schulen&outputFormat=application/json" @@ -15,11 +16,10 @@ class BerlinSpider(SchoolSpider): def parse(self, response, **kwargs): yield from parse_geojson_features(response) - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: 
Item) -> School: return School( name=item.get("schulname"), - id="BE-{}".format(item.get("bsn")), + id=self.make_school_id("{}".format(item.get("bsn"))), address=" ".join([item.get("strasse"), item.get("hausnr")]), zip=item.get("plz"), city="Berlin", diff --git a/jedeschule/spiders/brandenburg.py b/jedeschule/spiders/brandenburg.py index 90e5b7f..90f7e98 100644 --- a/jedeschule/spiders/brandenburg.py +++ b/jedeschule/spiders/brandenburg.py @@ -7,6 +7,7 @@ class BrandenburgSpider(SchoolSpider): name = "brandenburg" + state_key = "BB" start_urls = [ "https://schullandschaft.brandenburg.de/edugis/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=ms:Schul_Standorte" @@ -16,11 +17,10 @@ class BrandenburgSpider(SchoolSpider): def parse(self, response, **kwargs): yield from parse_geojson_features(response) - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: return School( name=item.get("schulname"), - id="BB-{}".format(item.get("schul_nr")), + id=self.make_school_id("{}".format(item.get("schul_nr"))), address=item.get("strasse_hausnr"), zip=item.get("plz"), city=item.get("ort"), diff --git a/jedeschule/spiders/bremen.py b/jedeschule/spiders/bremen.py index 1cba614..f14a502 100644 --- a/jedeschule/spiders/bremen.py +++ b/jedeschule/spiders/bremen.py @@ -9,6 +9,7 @@ class BremenSpider(SchoolSpider): name = "bremen" + state_key = "HB" start_urls = [ "http://www.bildung.bremen.de/detail.php?template=35_schulsuche_stufe2_d" ] @@ -46,8 +47,7 @@ def fix_number(number): new += letter return new - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: if "Ansprechperson" in item: ansprechpersonen = ( item["Ansprechperson"] @@ -60,7 +60,7 @@ def normalize(item: Item) -> School: director = None return School( name=item.get("name").strip(), - id="HB-{}".format(item.get("id")), + id=self.make_school_id("{}".format(item.get("id"))), address=re.split(r"\d{5}", 
item.get("Anschrift:").strip())[0].strip(), zip=re.findall(r"\d{5}", item.get("Anschrift:").strip())[0], city=re.split(r"\d{5}", item.get("Anschrift:").strip())[1].strip(), diff --git a/jedeschule/spiders/hamburg.py b/jedeschule/spiders/hamburg.py index c312036..379de31 100644 --- a/jedeschule/spiders/hamburg.py +++ b/jedeschule/spiders/hamburg.py @@ -7,6 +7,7 @@ class HamburgSpider(SchoolSpider): name = "hamburg" + state_key = "HH" start_urls = [ "https://api.hamburg.de/datasets/v1/schulen/collections/staatliche_schulen/items" @@ -24,13 +25,12 @@ class HamburgSpider(SchoolSpider): def parse(self, response, **kwargs): yield from parse_geojson_features(response) - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: city_parts = item.get("adresse_ort").split() zip_code, city = city_parts[0], city_parts[1:] return School( name=item.get("schulname"), - id="HH-{}".format(item.get("schul_id")), + id=self.make_school_id("{}".format(item.get("schul_id"))), address=item.get("adresse_strasse_hausnr"), address2="", zip=zip_code, diff --git a/jedeschule/spiders/hessen.py b/jedeschule/spiders/hessen.py index 29f0aff..803d98d 100644 --- a/jedeschule/spiders/hessen.py +++ b/jedeschule/spiders/hessen.py @@ -17,6 +17,7 @@ class HessenSpider(SchoolSpider): 3. 
Extracting contact info and coordinates from detail pages """ name = "hessen" + state_key = "HE" start_urls = ["https://schul-db.bildung.hessen.de/schul_db.html"] @@ -124,8 +125,7 @@ def parse_details(self, response): yield school - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: """Transform raw scraped data into standardized School model""" return School( name=item.get("name"), @@ -136,7 +136,7 @@ def normalize(item: Item) -> School: city=item.get("ort"), zip=item.get("plz"), school_type=item.get("schultyp"), - id="HE-{}".format(item.get("id")), # Prefix with state code + id=self.make_school_id("{}".format(item.get("id"))), latitude=item.get("latitude"), longitude=item.get("longitude"), ) diff --git a/jedeschule/spiders/mecklenburg_vorpommern.py b/jedeschule/spiders/mecklenburg_vorpommern.py index cd2ff58..c52da02 100644 --- a/jedeschule/spiders/mecklenburg_vorpommern.py +++ b/jedeschule/spiders/mecklenburg_vorpommern.py @@ -14,6 +14,7 @@ def as_string(value: str): class MecklenburgVorpommernSpider(SchoolSpider): name = "mecklenburg-vorpommern" + state_key = "MV" start_urls = [ "https://www.geodaten-mv.de/dienste/schulstandorte_wfs?" 
"SERVICE=WFS&REQUEST=GetFeature&VERSION=2.0.0&srsname=EPSG%3A4326&typeNames=" @@ -67,8 +68,7 @@ def _extract_school_data(school): return data_elem - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: def safe_strip(value): if not value or not value.strip(): return None @@ -76,7 +76,7 @@ def safe_strip(value): return School( name=safe_strip(item.get("schulname")), - id="MV-{}".format(as_string(item.get("dstnr", ""))), + id=self.make_school_id(as_string(item.get("dstnr", ""))), address=safe_strip(item.get("strassehnr")), address2="", zip=as_string(item.get("plz", "")).zfill(5), diff --git a/jedeschule/spiders/niedersachsen.py b/jedeschule/spiders/niedersachsen.py index 2cfaef3..96bd7ec 100644 --- a/jedeschule/spiders/niedersachsen.py +++ b/jedeschule/spiders/niedersachsen.py @@ -11,6 +11,7 @@ class NiedersachsenSpider(SchoolSpider): name = "niedersachsen" + state_key = "NI" start_urls = ["https://schulen.nibis.de/search/advanced"] def parse(self, response: Response): @@ -54,8 +55,7 @@ def _get(dict_like, key, default): # at all. 
return dict_like.get(key) or default - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: name = " ".join( [item.get("schulname", ""), item.get("namenszuatz", "")] ).strip() @@ -75,5 +75,5 @@ def normalize(item: Item) -> School: school_type=school_type, provider=provider, legal_status=item.get("sdb_traegerschaft", {}).get("bezeichnung"), - id="NI-{}".format(item.get("schulnr")), + id=self.make_school_id("{}".format(item.get("schulnr"))), ) diff --git a/jedeschule/spiders/nordrhein_westfalen.py b/jedeschule/spiders/nordrhein_westfalen.py index 17ba712..fe74406 100644 --- a/jedeschule/spiders/nordrhein_westfalen.py +++ b/jedeschule/spiders/nordrhein_westfalen.py @@ -15,6 +15,7 @@ class NordrheinWestfalenSpider(SchoolSpider): name = "nordrhein-westfalen" + state_key = "NW" start_urls = [ "https://www.schulministerium.nrw.de/BiPo/OpenData/Schuldaten/schuldaten.csv", @@ -27,8 +28,7 @@ def parse(self, response): for line in reader: yield line - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: name = " ".join( [ item.get("Schulbezeichnung_1", ""), @@ -49,7 +49,7 @@ def normalize(item: Item) -> School: return School( name=name, - id="NW-{}".format(item.get("Schulnummer")), + id=self.make_school_id("{}".format(item.get("Schulnummer"))), address=item.get("Strasse"), zip=item.get("PLZ"), city=item.get("Ort"), diff --git a/jedeschule/spiders/rheinland_pfalz.py b/jedeschule/spiders/rheinland_pfalz.py index 49726d4..2cf60ad 100644 --- a/jedeschule/spiders/rheinland_pfalz.py +++ b/jedeschule/spiders/rheinland_pfalz.py @@ -33,6 +33,7 @@ class RheinlandPfalzSpider(CrawlSpider, SchoolSpider): name = "rheinland-pfalz" + state_key = "RP" # Note, one could also use the geo portal: # https://www.geoportal.rlp.de/spatial-objects/350/collections/schulstandorte/items?f=html&limit=4000 start_urls = ["https://bildung.rlp.de/schulen"] @@ -79,7 +80,7 @@ def normalize(self, item: Item) -> School: 
return School( name=item.get("name"), - id="RP-{}".format(item.get("id")), + id=self.make_school_id("{}".format(item.get("id"))), address=item.get("Anschrift")[1], city=city, zip=zip, diff --git a/jedeschule/spiders/saarland.py b/jedeschule/spiders/saarland.py index 57a68f1..674ea4a 100644 --- a/jedeschule/spiders/saarland.py +++ b/jedeschule/spiders/saarland.py @@ -7,6 +7,7 @@ class SaarlandSpider(SchoolSpider): name = "saarland" + state_key = "SL" start_urls = [ "https://geoportal.saarland.de/spatial-objects/257/collections/Staatliche_Dienste:Schulen_SL/items?f=json&limit=2500" ] @@ -14,8 +15,7 @@ class SaarlandSpider(SchoolSpider): def parse(self, response, **kwargs): yield from parse_geojson_features(response) - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: # The data also contains a field called `Schulkennz` which implies that it might be an id # that could be used, but some schools share ids (especially `0` or `000000`) or # do not have any set at all which makes for collisions @@ -25,7 +25,7 @@ def normalize(item: Item) -> School: address=item.get("Straße", "").strip(), city=item.get("Ort"), fax=item.get("Fax"), - id=f"SL-{school_id}", + id=self.make_school_id("{}".format(school_id)), latitude=item.get("lat"), longitude=item.get("lon"), name=item.get("Bezeichnung"), diff --git a/jedeschule/spiders/sachsen.py b/jedeschule/spiders/sachsen.py index 94f196c..ecaa18e 100644 --- a/jedeschule/spiders/sachsen.py +++ b/jedeschule/spiders/sachsen.py @@ -9,6 +9,7 @@ class SachsenSpider(SchoolSpider): name = "sachsen" + state_key = "SN" # URL was created via https://schuldatenbank.sachsen.de/index.php?id=30 start_urls = [ @@ -19,11 +20,13 @@ def parse(self, response, **kwargs): for school in json.loads(response.text): yield school - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: helper = SachsenHelper() building = item.get("buildings", [None])[0] - school = 
School(name=item.get("name"), id="SN-{}".format(item.get("id"))) + school = School( + name=item.get("name"), + id=self.make_school_id("{}".format(item.get("id"))), + ) if building is None: return school school["address"] = building.get("street") diff --git a/jedeschule/spiders/sachsen_anhalt.py b/jedeschule/spiders/sachsen_anhalt.py index a7a1316..267c548 100644 --- a/jedeschule/spiders/sachsen_anhalt.py +++ b/jedeschule/spiders/sachsen_anhalt.py @@ -8,6 +8,7 @@ class SachsenAnhaltSpider(SchoolSpider): name = "sachsen-anhalt" + state_key = "ST" # ArcGIS FeatureServer API - contains 857 schools with coordinates # Note: This dataset excludes vocational schools (Berufsbildende Schulen) @@ -47,11 +48,10 @@ def parse(self, response): "object_id": attrs.get("OBJECTID"), } - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: """Normalize ArcGIS data to School item""" # Generate ID from OBJECTID - school_id = f"ST-ARC{item.get('object_id', 0):05d}" + school_id = self.make_school_id(f"ARC{item.get('object_id', 0):05d}") return School( name=item.get("name"), diff --git a/jedeschule/spiders/schleswig_holstein.py b/jedeschule/spiders/schleswig_holstein.py index f926885..ee0ecf5 100644 --- a/jedeschule/spiders/schleswig_holstein.py +++ b/jedeschule/spiders/schleswig_holstein.py @@ -8,6 +8,7 @@ class SchleswigHolsteinSpider(SchoolSpider): name = "schleswig-holstein" + state_key = "SH" base_url = "https://opendata.schleswig-holstein.de/collection/schulen/aktuell.csv" start_urls = [base_url] @@ -16,11 +17,10 @@ def parse(self, response): for row in reader: yield row - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: return School( name=item.get("name"), - id="SH-{}".format(item.get("id")), + id=self.make_school_id("{}".format(item.get("id"))), address=" ".join( [item.get("street", ""), item.get("houseNumber", "")] ).strip(), diff --git a/jedeschule/spiders/school_spider.py 
b/jedeschule/spiders/school_spider.py index 40ec965..0069ae7 100644 --- a/jedeschule/spiders/school_spider.py +++ b/jedeschule/spiders/school_spider.py @@ -7,6 +7,15 @@ class SchoolSpider(scrapy.Spider, ABC): - @staticmethod - def normalize(item: Item) -> School: - pass + #: ISO 3166-2:DE code (no ``DE-`` prefix). Set on each Land spider; persisted as ``schools.state_key``. + state_key: str = "" + + def make_school_id(self, tail: str) -> str: + """Return ``{state_key}-{tail}``; raise ValueError when ``state_key`` is empty.""" + if not self.state_key: + raise ValueError(f"{type(self).__name__} must set a non-empty state_key") + return f"{self.state_key}-{tail}" + + def normalize(self, item: Item) -> School: + """Convert a raw scraped item into a ``School``; the base class does not implement it.""" + raise NotImplementedError(f"{type(self).__name__} must implement normalize()") diff --git a/jedeschule/spiders/thueringen.py b/jedeschule/spiders/thueringen.py index 73492d2..1bddaa9 100644 --- a/jedeschule/spiders/thueringen.py +++ b/jedeschule/spiders/thueringen.py @@ -7,6 +7,7 @@ class ThueringenSpider(SchoolSpider): name = "thueringen" + state_key = "TH" start_urls = [ "https://www.geoproxy.geoportal-th.de/geoproxy/services/kommunal/komm_wfs?" "SERVICE=WFS&REQUEST=GetFeature&typeNames=kommunal:komm_schul&" @@ -43,11 +44,10 @@ def parse(self, response, **kwargs): yield data_elem - @staticmethod - def normalize(item: Item) -> School: + def normalize(self, item: Item) -> School: return School( name=item.get("Name"), - id="TH-{}".format(item.get("Schulnummer")), + id=self.make_school_id("{}".format(item.get("Schulnummer"))), address=" ".join( filter(None, [item.get("Strasse"), item.get("Hausnummer")]) ),