feat(elt-common): Configure iceberg warehouse based on pipeline directory name (#376)

WHTaylor · web-flow · commit 5a7505d4d0cb · 2026-06-30T11:44:24.000+01:00
Ref #321. Instead of requiring the `warehouse` value to be configured for pyiceberg, set it based on the pipeline directory name, as per the most recent couple of comments in the issue. I suspect `is_ingest_job` may not be the best way to decide whether the `_landing` suffix is applied to the warehouse name, but it is probably best to revisit this when we add the `transform` functionality and actually have to start thinking about non-ingest jobs.
diff --git a/elt-common/src/elt_common/iceberg/catalog.py b/elt-common/src/elt_common/iceberg/catalog.py
@@ -4,16 +4,43 @@
 a ``connect_catalog()`` helper that returns a connected pyiceberg ``Catalog``.
 """
 
+import logging
+
 from pyiceberg.catalog import Catalog, load_catalog
 from pyiceberg.typedef import Identifier
 from pyiceberg.utils.config import Config as IcebergCatalogConfig
 
+LOGGER = logging.getLogger(__name__)
+
+
+def connect_catalog(warehouse_name: str) -> Catalog:
+    """Connect to the 'default' Iceberg catalog.
+
+    Loads configuration as per `pyiceberg`_, except the value of 'warehouse' which is set directly.
+
+    :param warehouse_name: the name of the warehouse to connect to
+    :return: a catalog that can be used for reading and writing
 
-def connect_catalog() -> Catalog:
+    .. _pyiceberg: https://py.iceberg.apache.org/configuration/
+    """
     """The default load_catalog only allows environment variables set before the first import of pyiceberg.catalog"""
     config = IcebergCatalogConfig()
     name = config.get_default_catalog_name()
-    return load_catalog(name, **config.get_catalog_config(name))  # type: ignore
+    conf = config.get_catalog_config(name)
+
+    if conf is None:
+        raise RuntimeError(f"Couldn't load iceberg configuration for catalog '{name}'")
+
+    if "warehouse" in conf and warehouse_name != conf["warehouse"]:
+        msg = (
+            "elt configures the destination warehouse based on the pipeline directory. "
+            "Preconfigured value '%s' is being replaced by '%s'"
+        )
+        LOGGER.warning(msg, conf["warehouse"], warehouse_name)
+
+    conf["warehouse"] = warehouse_name
+
+    return load_catalog(name, **conf)
 
 
 def table_identifier(namespace: str, table_name: str) -> Identifier:
diff --git a/elt-common/src/elt_common/pipeline.py b/elt-common/src/elt_common/pipeline.py
@@ -15,7 +15,9 @@ def __init__(self, root: Path) -> None:
         if not ingest_dir.is_dir():
             raise ValueError(f"Invalid project. Ingest directory '{ingest_dir}' does not exist.")
 
-        self._root = root
+        resolved = root.resolve()
+        self._root = resolved
+        self._warehouse = resolved.parts[-1]
         self._ingest_dir = ingest_dir
         self._name = root.name
         self._ingest_jobs = []
@@ -31,44 +33,46 @@ def ingest_dir(self) -> Path:
     @property
     def ingest_jobs(self) -> list[ELTJobManifest]:
         if not self._ingest_jobs:
-            self._ingest_jobs = _discover_jobs(self._ingest_dir)
+            self._ingest_jobs = _discover_jobs(self._warehouse, self._ingest_dir)
 
         return self._ingest_jobs
 
 
-def _discover_jobs(ingest_dir: Path):
-    """Find all subdirectories under *root/ingest* and create manifests describing them.
+def _discover_jobs(warehouse_name: str, ingest_dir: Path):
+    """Find all subdirectories under the warehouse 'ingest' directory and create manifests describing them.
 
     The following directory structure is assumed:
 
-    root/
+    <warehouse_name>/
     |-- ingest/
     |   |-- domain_A/
     |   |   |-- source_A/
     |   |   |-- source_B/
     |   |-- domain_B/
     |       |-- source_A/
-    |-- transform/   # Root of dbt project
 
     Each subdirectory under ingest is considered a domain and each subdirectory
     underneath a domain is a data source from that domain.
 
-    :param ingest_dir: Root directory to search recursively.
+    :param warehouse_name: The top level directory name, which is stored in the manifest.
+    :param ingest_dir: Root ingest directory to search for jobs.
     :returns: List of parsed manifests.
     """
 
     return [
-        _create_ingest_manifest(job_dir)
+        _create_ingest_manifest(warehouse_name, job_dir)
         for domain_dir in ingest_dir.iterdir()
         if domain_dir.is_dir()
         for job_dir in domain_dir.iterdir()
         if job_dir.is_dir()
     ]
 
 
-def _create_ingest_manifest(job_dir: Path) -> ELTJobManifest:
+def _create_ingest_manifest(warehouse_name: str, job_dir: Path) -> ELTJobManifest:
     return ELTJobManifest(
+        warehouse_name=warehouse_name,
         name=job_dir.name,
         domain=job_dir.parent.name,
         ingest_job_dir=job_dir.resolve(),
+        is_ingest_job=True,
     )
diff --git a/elt-common/src/elt_common/runner.py b/elt-common/src/elt_common/runner.py
@@ -41,7 +41,7 @@ def run_job(job: ELTJobManifest) -> None:
 def run_ingest(job: ELTJobManifest) -> dict[str, int]:
     """Import the extract function, call it, and write results to Iceberg."""
 
-    iceberg_io = IcebergIO(connect_catalog())
+    iceberg_io = IcebergIO(connect_catalog(job.destination_warehouse))
 
     # Get object that will do the extraction.
     # Environment variables for the object's configuration must have been set
diff --git a/elt-common/src/elt_common/typing.py b/elt-common/src/elt_common/typing.py
@@ -41,14 +41,20 @@ def write_table(
 class ELTJobManifest:
     """Parsed representation of an ELT job"""
 
+    warehouse_name: str
     name: str
     domain: str
+    is_ingest_job: bool
     ingest_job_dir: Path
 
     @property
     def full_name(self) -> str:
         return f"{self.domain}.{self.name}"
 
+    @property
+    def destination_warehouse(self):
+        return f"{self.warehouse_name}_landing" if self.is_ingest_job else self.warehouse_name
+
     @property
     def destination_namespace(self) -> str:
         """The destination namespace for this job: ``{domain}_{name}``."""
diff --git a/elt-common/tests/unit_tests/iceberg/test_catalog.py b/elt-common/tests/unit_tests/iceberg/test_catalog.py
@@ -1,8 +1,9 @@
 """Tests for elt_common.iceberg.catalog"""
 
+from unittest.mock import MagicMock
+
 import pytest
 from pytest_mock import MockerFixture
-from unittest.mock import MagicMock
 
 from elt_common.iceberg.catalog import (
     connect_catalog,
@@ -26,14 +27,20 @@ def mock_load_catalog(mocker: MockerFixture):
     return mocker.patch("elt_common.iceberg.catalog.load_catalog")
 
 
+def test_no_config_found_raises_error(mock_config):
+    mock_config.get_catalog_config.return_value = None
+    with pytest.raises(RuntimeError):
+        connect_catalog("test_warehouse")
+
+
 def test_connect_catalog_loads_default_catalog(mock_config, mock_load_catalog):
     # Execute
-    connect_catalog()
+    connect_catalog("test_warehouse")
 
     # Assert
     mock_config.get_default_catalog_name.assert_called_once()
     mock_config.get_catalog_config.assert_called_once_with("default")
-    mock_load_catalog.assert_called_once_with("default", warehouse="/tmp/warehouse")
+    mock_load_catalog.assert_called_once_with("default", warehouse="test_warehouse")
 
 
 def test_connect_catalog_forwards_all_options_from_pyiceberg_catalog_config(
@@ -44,13 +51,17 @@ def test_connect_catalog_forwards_all_options_from_pyiceberg_catalog_config(
         "uri": "http://localhost:8181",
         "auth": "oauth2",
     }
+    # 'warehouse' is overwritten by the provided value
+    expected_config = {k: v for k, v in catalog_config.items()}
+    expected_config["warehouse"] = "test_warehouse"
+
     mock_config.get_catalog_config.return_value = catalog_config
 
     # Execute
-    connect_catalog()
+    connect_catalog("test_warehouse")
 
     # Assert
-    mock_load_catalog.assert_called_once_with("default", **catalog_config)
+    mock_load_catalog.assert_called_once_with("default", **expected_config)
 
 
 def test_table_id_returns_tuple_identifier():
diff --git a/elt-common/tests/unit_tests/test_extract.py b/elt-common/tests/unit_tests/test_extract.py
@@ -43,8 +43,10 @@ def test_deserialize_watermark_good_values(serialized, expected):
 def make_error_manifest(filename):
     this_dir = Path(__file__).parent
     return ELTJobManifest(
+        warehouse_name="warehouse",
         name=filename,
         domain="whatever",
+        is_ingest_job=True,
         ingest_job_dir=this_dir / "create_extract_obj_fakes" / "errors",
     )
 
@@ -69,8 +71,10 @@ def test_create_extract_obj_errors(filename, expected_error, expected_error_mess
 def make_manifest(filename):
     this_dir = Path(__file__).parent
     return ELTJobManifest(
+        warehouse_name="test_warehouse",
         name=filename,
         domain="whatever",
+        is_ingest_job=True,
         ingest_job_dir=this_dir / "create_extract_obj_fakes",
     )
 
diff --git a/elt-common/tests/unit_tests/test_pipeline.py b/elt-common/tests/unit_tests/test_pipeline.py
@@ -10,24 +10,32 @@
 )
 
 
-def test_namespace_property_combines_domain_and_name():
-    """Tests for IngestJobManifest.destination_namespace"""
+def test_properties():
     manifest = ELTJobManifest(
+        warehouse_name="test_warehouse",
         name="source_a",
         domain="facility_ops",
+        is_ingest_job=True,
         ingest_job_dir=Path("/some/path"),
     )
-    assert manifest.destination_namespace == "facility_ops_source_a"
-
+    assert manifest.destination_namespace == "facility_ops_source_a", (
+        "Destination namespace should be 'domain'_'name'"
+    )
+    assert manifest.full_name == "facility_ops.source_a", "Name should be 'domain'.'name'"
+    assert manifest.destination_warehouse == "test_warehouse_landing", (
+        "Ingest job destination should have _landing appended"
+    )
 
-def test_full_name_property_combines_domain_and_name():
-    """Tests for IngestJobManifest.full_name"""
-    manifest = ELTJobManifest(
+    non_ingest_manifest = ELTJobManifest(
+        warehouse_name="test_warehouse",
         name="source_a",
         domain="facility_ops",
+        is_ingest_job=False,
         ingest_job_dir=Path("/some/path"),
     )
-    assert manifest.full_name == "facility_ops.source_a"
+    assert non_ingest_manifest.destination_warehouse == "test_warehouse", (
+        "Non-ingest manifest shouldn't change warehouse name"
+    )
 
 
 def test_init_stores_root_and_derives_name(tmp_path: Path):
diff --git a/elt-common/tests/unit_tests/test_runner.py b/elt-common/tests/unit_tests/test_runner.py
@@ -38,8 +38,10 @@ def elt_job(request) -> ELTJobManifest:
     job_name = request.param
 
     return ELTJobManifest(
+        warehouse_name="test_warehouse",
         name=job_name,
         domain=TEST_DOMAIN,
+        is_ingest_job=True,
         ingest_job_dir=this_dir / "runner_extractor_fakes",
     )
 
diff --git a/elt-pipelines/README.md b/elt-pipelines/README.md
@@ -46,7 +46,9 @@ elt-pipelines/
 |    |    |    |    |-- <job name>.py
 ```
 
-- Each 'target warehouse' is the name of an Iceberg warehouse. The data ingested by the pipelines inside that directory end up in that warehouse.
-- The directory structure from `ingest` down is what is required for `elt-common` to be able to run 'ingest' pipelines.
+- This directory structure is required for using `elt-common`.
+- Each 'target warehouse' is the name of an Iceberg warehouse. The data ingested by the pipelines inside that directory
+  ends up in that warehouse.
 - Data from ingest pipelines is considered 'raw' data, and is loaded into a warehouse suffixed with `_landing`.
-- Under construction: Each warehouse will also have a `transform` subdirectory containing pipelines for converting the raw data into it's final state in the target warehouse.
+- Under construction: Each warehouse will also have a `transform` subdirectory containing pipelines for converting the
+  raw data into its final state in the target warehouse.

Original file line number	Diff line number	Diff line change
`@@ -38,8 +38,10 @@ def elt_job(request) -> ELTJobManifest:`
`38`	`38`	`job_name = request.param`
`39`	`39`
`40`	`40`	`return ELTJobManifest(`
	`41`	`+ warehouse_name="test_warehouse",`
`41`	`42`	`name=job_name,`
`42`	`43`	`domain=TEST_DOMAIN,`
	`44`	`+ is_ingest_job=True,`
`43`	`45`	`ingest_job_dir=this_dir / "runner_extractor_fakes",`
`44`	`46`	`)`
`45`	`47`