Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions alembic/versions/e6c0d3f4a5b6_add_state_key_to_schools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""add state_key to schools

Revision ID: e6c0d3f4a5b6
Revises: c4a8f1b2d3e4
Create Date: 2026-04-17

Adds the federal state ("Land") code, i.e. the ISO 3166-2:DE subdivision code without the ``DE-`` prefix. It is set by each spider's ``state_key`` and stored on insert/update.
Optional backfill for existing rows: the first segment of ``id`` before ``-`` is used when it is exactly two characters long.
"""

from alembic import op
import sqlalchemy as sa
from sqlalchemy import text


revision = "e6c0d3f4a5b6"
down_revision = "c4a8f1b2d3e4"
branch_labels = None
depends_on = None


def upgrade():
    """Add the ``state_key`` column to ``schools`` and backfill existing rows.

    The column is created as a nullable string. Afterwards, every row whose
    ``state_key`` is still NULL receives the part of ``id`` in front of the
    first ``-``, but only when ``id`` contains a ``-`` and that leading part
    is exactly two characters long. Rows that do not match keep NULL.
    """
    state_key_column = sa.Column("state_key", sa.String(), nullable=True)
    op.add_column("schools", state_key_column)

    # NOTE(review): split_part/strpos are PostgreSQL string functions, so this
    # statement assumes the migration runs against PostgreSQL - confirm.
    backfill_sql = (
        "UPDATE schools SET state_key = split_part(id, '-', 1) "
        "WHERE state_key IS NULL "
        "AND strpos(id, '-') > 0 "
        "AND length(split_part(id, '-', 1)) = 2"
    )
    op.execute(text(backfill_sql))


def downgrade():
    """Drop the ``state_key`` column from the ``schools`` table again."""
    table_name, column_name = "schools", "state_key"
    op.drop_column(table_name, column_name)
1 change: 1 addition & 0 deletions jedeschule/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
class School(scrapy.Item):
name = scrapy.Field()
id = scrapy.Field()
state_key = scrapy.Field()
address = scrapy.Field()
address2 = scrapy.Field()
zip = scrapy.Field()
Expand Down
1 change: 1 addition & 0 deletions jedeschule/pipelines/db_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def get_session():
class School(Base):
__tablename__ = "schools"
id = Column(String, primary_key=True)
state_key = Column(String, nullable=True)
name = Column(String)
address = Column(String)
address2 = Column(String)
Expand Down
7 changes: 7 additions & 0 deletions jedeschule/pipelines/school_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,11 @@ class SchoolPipelineItem:
class SchoolPipeline(object):
    """Pipeline stage that normalizes an item and tags it with the spider's state key."""

    def process_item(self, item, spider: SchoolSpider) -> SchoolPipelineItem:
        """Normalize ``item`` via the spider and attach its ``state_key``.

        Args:
            item: The raw item as produced by the spider.
            spider: The spider that produced ``item``; its ``normalize``
                method and ``state_key`` attribute are used.

        Returns:
            A ``SchoolPipelineItem`` whose ``info`` is the normalized school
            (with ``state_key`` set to the stripped key) and whose ``item`` is
            the untouched input.

        Raises:
            ValueError: If ``spider.state_key`` is not a string or is empty
                after stripping whitespace.
        """
        normalized = spider.normalize(item)

        raw_key = spider.state_key
        # Anything that is not a string is treated like an empty key so that
        # a single guard below rejects both cases.
        cleaned_key = raw_key.strip() if isinstance(raw_key, str) else ""
        if not cleaned_key:
            raise ValueError(
                f"Spider {spider.name!r} must set a non-empty string state_key "
                f"(ISO 3166-2:DE code without DE- prefix)"
            )

        normalized["state_key"] = cleaned_key
        return SchoolPipelineItem(info=normalized, item=item)
9 changes: 5 additions & 4 deletions jedeschule/spiders/baden_wuerttemberg.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import scrapy
from scrapy import Item

from jedeschule.spiders.school_spider import SchoolSpider
from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider


# Pattern to extract DISCH (8-digit school ID) from Baden-Württemberg email addresses
Expand Down Expand Up @@ -35,6 +35,7 @@ def extract_disch(email: str | None) -> str | None:

class BadenWuerttembergSpider(SchoolSpider):
name = "baden-wuerttemberg"
state_key = "BW"

start_urls = [
"https://gis.kultus-bw.de/geoserver/us-govserv/ows?"
Expand Down Expand Up @@ -134,12 +135,12 @@ def parse(self, response):

yield item

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
# Prefer DISCH (stable government ID) over UUID when available
disch = item.get("disch")
uuid = item.get("uuid")
school_id = f"BW-{disch}" if disch else f"BW-UUID-{uuid}"
sk = self.state_key
school_id = f"{sk}-{disch}" if disch else f"{sk}-UUID-{uuid}"

return School(
id=school_id,
Expand Down
6 changes: 3 additions & 3 deletions jedeschule/spiders/bayern.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

class BayernSpider(SchoolSpider):
name = "bayern"
state_key = "BY"
start_urls = [
"https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?"
"SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&srsname=EPSG:4326&typename="
Expand Down Expand Up @@ -47,15 +48,14 @@ def parse(self, response, **kwargs):

yield data_elem

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
return School(
name=item.get("schulname"),
address=item.get("strasse"),
city=item.get("ort"),
school_type=item.get("schulart"),
zip=item.get("postleitzahl"),
id="BY-{}".format(item.get("id")),
id=self.make_school_id("{}".format(item.get("id"))),
latitude=item.get("lat"),
longitude=item.get("lon"),
)
6 changes: 3 additions & 3 deletions jedeschule/spiders/berlin.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

class BerlinSpider(SchoolSpider):
name = "berlin"
state_key = "BE"
start_urls = [
"https://gdi.berlin.de/services/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&srsname=EPSG:4326"
"&typename=fis:schulen&outputFormat=application/json"
Expand All @@ -15,11 +16,10 @@ class BerlinSpider(SchoolSpider):
def parse(self, response, **kwargs):
yield from parse_geojson_features(response)

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
return School(
name=item.get("schulname"),
id="BE-{}".format(item.get("bsn")),
id=self.make_school_id("{}".format(item.get("bsn"))),
address=" ".join([item.get("strasse"), item.get("hausnr")]),
zip=item.get("plz"),
city="Berlin",
Expand Down
6 changes: 3 additions & 3 deletions jedeschule/spiders/brandenburg.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

class BrandenburgSpider(SchoolSpider):
name = "brandenburg"
state_key = "BB"

start_urls = [
"https://schullandschaft.brandenburg.de/edugis/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=ms:Schul_Standorte"
Expand All @@ -16,11 +17,10 @@ class BrandenburgSpider(SchoolSpider):
def parse(self, response, **kwargs):
yield from parse_geojson_features(response)

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
return School(
name=item.get("schulname"),
id="BB-{}".format(item.get("schul_nr")),
id=self.make_school_id("{}".format(item.get("schul_nr"))),
address=item.get("strasse_hausnr"),
zip=item.get("plz"),
city=item.get("ort"),
Expand Down
6 changes: 3 additions & 3 deletions jedeschule/spiders/bremen.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

class BremenSpider(SchoolSpider):
name = "bremen"
state_key = "HB"
start_urls = [
"http://www.bildung.bremen.de/detail.php?template=35_schulsuche_stufe2_d"
]
Expand Down Expand Up @@ -46,8 +47,7 @@ def fix_number(number):
new += letter
return new

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
if "Ansprechperson" in item:
ansprechpersonen = (
item["Ansprechperson"]
Expand All @@ -60,7 +60,7 @@ def normalize(item: Item) -> School:
director = None
return School(
name=item.get("name").strip(),
id="HB-{}".format(item.get("id")),
id=self.make_school_id("{}".format(item.get("id"))),
address=re.split(r"\d{5}", item.get("Anschrift:").strip())[0].strip(),
zip=re.findall(r"\d{5}", item.get("Anschrift:").strip())[0],
city=re.split(r"\d{5}", item.get("Anschrift:").strip())[1].strip(),
Expand Down
6 changes: 3 additions & 3 deletions jedeschule/spiders/hamburg.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

class HamburgSpider(SchoolSpider):
name = "hamburg"
state_key = "HH"

start_urls = [
"https://api.hamburg.de/datasets/v1/schulen/collections/staatliche_schulen/items"
Expand All @@ -24,13 +25,12 @@ class HamburgSpider(SchoolSpider):
def parse(self, response, **kwargs):
yield from parse_geojson_features(response)

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
city_parts = item.get("adresse_ort").split()
zip_code, city = city_parts[0], city_parts[1:]
return School(
name=item.get("schulname"),
id="HH-{}".format(item.get("schul_id")),
id=self.make_school_id("{}".format(item.get("schul_id"))),
address=item.get("adresse_strasse_hausnr"),
address2="",
zip=zip_code,
Expand Down
6 changes: 3 additions & 3 deletions jedeschule/spiders/hessen.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class HessenSpider(SchoolSpider):
3. Extracting contact info and coordinates from detail pages
"""
name = "hessen"
state_key = "HE"

start_urls = ["https://schul-db.bildung.hessen.de/schul_db.html"]

Expand Down Expand Up @@ -124,8 +125,7 @@ def parse_details(self, response):

yield school

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
"""Transform raw scraped data into standardized School model"""
return School(
name=item.get("name"),
Expand All @@ -136,7 +136,7 @@ def normalize(item: Item) -> School:
city=item.get("ort"),
zip=item.get("plz"),
school_type=item.get("schultyp"),
id="HE-{}".format(item.get("id")), # Prefix with state code
id=self.make_school_id("{}".format(item.get("id"))),
latitude=item.get("latitude"),
longitude=item.get("longitude"),
)
6 changes: 3 additions & 3 deletions jedeschule/spiders/mecklenburg_vorpommern.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def as_string(value: str):

class MecklenburgVorpommernSpider(SchoolSpider):
name = "mecklenburg-vorpommern"
state_key = "MV"
start_urls = [
"https://www.geodaten-mv.de/dienste/schulstandorte_wfs?"
"SERVICE=WFS&REQUEST=GetFeature&VERSION=2.0.0&srsname=EPSG%3A4326&typeNames="
Expand Down Expand Up @@ -67,16 +68,15 @@ def _extract_school_data(school):

return data_elem

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
def safe_strip(value):
if not value or not value.strip():
return None
return value.strip()

return School(
name=safe_strip(item.get("schulname")),
id="MV-{}".format(as_string(item.get("dstnr", ""))),
id=self.make_school_id(as_string(item.get("dstnr", ""))),
address=safe_strip(item.get("strassehnr")),
address2="",
zip=as_string(item.get("plz", "")).zfill(5),
Expand Down
6 changes: 3 additions & 3 deletions jedeschule/spiders/niedersachsen.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

class NiedersachsenSpider(SchoolSpider):
name = "niedersachsen"
state_key = "NI"
start_urls = ["https://schulen.nibis.de/search/advanced"]

def parse(self, response: Response):
Expand Down Expand Up @@ -54,8 +55,7 @@ def _get(dict_like, key, default):
# at all.
return dict_like.get(key) or default

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
name = " ".join(
[item.get("schulname", ""), item.get("namenszuatz", "")]
).strip()
Expand All @@ -75,5 +75,5 @@ def normalize(item: Item) -> School:
school_type=school_type,
provider=provider,
legal_status=item.get("sdb_traegerschaft", {}).get("bezeichnung"),
id="NI-{}".format(item.get("schulnr")),
id=self.make_school_id("{}".format(item.get("schulnr"))),
)
6 changes: 3 additions & 3 deletions jedeschule/spiders/nordrhein_westfalen.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

class NordrheinWestfalenSpider(SchoolSpider):
name = "nordrhein-westfalen"
state_key = "NW"

start_urls = [
"https://www.schulministerium.nrw.de/BiPo/OpenData/Schuldaten/schuldaten.csv",
Expand All @@ -27,8 +28,7 @@ def parse(self, response):
for line in reader:
yield line

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
name = " ".join(
[
item.get("Schulbezeichnung_1", ""),
Expand All @@ -49,7 +49,7 @@ def normalize(item: Item) -> School:

return School(
name=name,
id="NW-{}".format(item.get("Schulnummer")),
id=self.make_school_id("{}".format(item.get("Schulnummer"))),
address=item.get("Strasse"),
zip=item.get("PLZ"),
city=item.get("Ort"),
Expand Down
3 changes: 2 additions & 1 deletion jedeschule/spiders/rheinland_pfalz.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

class RheinlandPfalzSpider(CrawlSpider, SchoolSpider):
name = "rheinland-pfalz"
state_key = "RP"
# Note, one could also use the geo portal:
# https://www.geoportal.rlp.de/spatial-objects/350/collections/schulstandorte/items?f=html&limit=4000
start_urls = ["https://bildung.rlp.de/schulen"]
Expand Down Expand Up @@ -79,7 +80,7 @@ def normalize(self, item: Item) -> School:

return School(
name=item.get("name"),
id="RP-{}".format(item.get("id")),
id=self.make_school_id("{}".format(item.get("id"))),
address=item.get("Anschrift")[1],
city=city,
zip=zip,
Expand Down
6 changes: 3 additions & 3 deletions jedeschule/spiders/saarland.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@

class SaarlandSpider(SchoolSpider):
name = "saarland"
state_key = "SL"
start_urls = [
"https://geoportal.saarland.de/spatial-objects/257/collections/Staatliche_Dienste:Schulen_SL/items?f=json&limit=2500"
]

def parse(self, response, **kwargs):
yield from parse_geojson_features(response)

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
# The data also contains a field called `Schulkennz` which implies that it might be an id
# that could be used, but some schools share ids (especially `0` or `000000`) or
# do not have any set at all which makes for collisions
Expand All @@ -25,7 +25,7 @@ def normalize(item: Item) -> School:
address=item.get("Straße", "").strip(),
city=item.get("Ort"),
fax=item.get("Fax"),
id=f"SL-{school_id}",
id=self.make_school_id("{}".format(school_id)),
latitude=item.get("lat"),
longitude=item.get("lon"),
name=item.get("Bezeichnung"),
Expand Down
9 changes: 6 additions & 3 deletions jedeschule/spiders/sachsen.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

class SachsenSpider(SchoolSpider):
name = "sachsen"
state_key = "SN"

# URL was created via https://schuldatenbank.sachsen.de/index.php?id=30
start_urls = [
Expand All @@ -19,11 +20,13 @@ def parse(self, response, **kwargs):
for school in json.loads(response.text):
yield school

@staticmethod
def normalize(item: Item) -> School:
def normalize(self, item: Item) -> School:
helper = SachsenHelper()
building = item.get("buildings", [None])[0]
school = School(name=item.get("name"), id="SN-{}".format(item.get("id")))
school = School(
name=item.get("name"),
id=self.make_school_id("{}".format(item.get("id"))),
)
if building is None:
return school
school["address"] = building.get("street")
Expand Down
Loading