Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 77 additions & 3 deletions ccflow/examples/tpch/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,77 @@
from .base import *
from .data_generators import *
from .query import *
"""TPC-H example for ccflow.

This package is a *teaching* example showing how to compose a workflow from
``CallableModel``s wired together through the ``ModelRegistry``. The
canonical usage is::

from ccflow import ModelRegistry
from ccflow.examples.tpch import load_config

load_config() # populate the root ModelRegistry from conf.yaml
registry = ModelRegistry.root()
result = registry["/query/Q1"]() # run TPC-H query 1
print(result.df.to_native())

To run the same example at a different TPC-H scale factor, override the
single shared backend on load (every table / answer / query references it,
so the change flows through everywhere)::

load_config(overrides=["tpch.backend.scale_factor=1.0"])
"""

from pathlib import Path
from typing import List, Optional

from ccflow import RootModelRegistry, load_config as _load_config_base

from .data_generators import TPCHAnswerProvider, TPCHDuckDBBackend, TPCHTable, TPCHTableProvider
from .query import TPCHQuery

__all__ = (
"TPCHTable",
"TPCHDuckDBBackend",
"TPCHTableProvider",
"TPCHAnswerProvider",
"TPCHQuery",
"load_config",
)


def load_config(
config_dir: str = "",
config_name: str = "",
overrides: Optional[List[str]] = None,
*,
overwrite: bool = True,
basepath: str = "",
) -> RootModelRegistry:
"""Load the TPC-H example registry into the root ``ModelRegistry``.

Pass hydra-style ``overrides`` to reconfigure entries on load — most
usefully ``["tpch.backend.scale_factor=1.0"]`` to run the example at a
different TPC-H scale factor. Every table / answer / query references the
single ``/tpch/backend`` entry, so this one override flows through to all
22+8 providers.

Args:
config_dir: Optional extra hydra config directory to overlay on top
of the bundled ``config/conf.yaml``. Empty string (the default)
means "use only the bundled config".
config_name: Optional config name within ``config_dir`` to load.
overrides: Hydra override strings, e.g.
``["tpch.backend.scale_factor=1.0"]``.
overwrite: When True (the default), entries already present in the
registry are replaced. This is what you want in notebooks where
you re-call ``load_config()`` after tweaking overrides; set to
False to require a fresh registry.
basepath: Base path for resolving a relative ``config_dir``.
"""
return _load_config_base(
root_config_dir=str(Path(__file__).resolve().parent / "config"),
root_config_name="conf",
config_dir=config_dir,
config_name=config_name,
overrides=overrides,
overwrite=overwrite,
basepath=basepath,
)
22 changes: 0 additions & 22 deletions ccflow/examples/tpch/base.py

This file was deleted.

262 changes: 262 additions & 0 deletions ccflow/examples/tpch/config/conf.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
# TPC-H example registry.
#
# This file is loaded by ``ccflow.examples.tpch.load_config()`` into the root
# ``ModelRegistry`` and demonstrates several ccflow features:
#
# 1. A *flat* model graph defined entirely in YAML — Python code defines the
# classes (``TPCHDuckDBBackend``, ``TPCHTableProvider``, ``TPCHAnswerProvider``,
# ``TPCHQuery``); this file decides which instances exist and how they are
# wired together.
# 2. Cross-references between registry entries. Strings beginning with ``/``
# are absolute paths into the registry; ccflow's pydantic validators
# resolve them to the actual configured Python instance at config-load
# time. Resolution is by reference (not copy), so every provider below
# points at the *same* ``/tpch/backend`` instance, and ``dbgen`` runs
# exactly once for the whole registry. Order within this file does not
# matter — references are resolved after the whole file is parsed.
# 3. Explicit dependencies on a generic ``CallableModel``. ``TPCHQuery`` has
# an ``inputs: tuple[CallableModel[NullContext, NarwhalsFrameResult], ...]``
# field; the registry resolves each ``/table/<name>`` reference into the
# corresponding ``TPCHTableProvider`` instance, so each query's table
# dependencies are first-class fields on that query's model instance.
# 4. ``scale_factor`` lives on a single backend entry, so loading the same
# config with a hydra override
# (``load_config(overrides=["tpch.backend.scale_factor=1.0"])``) reconfigures
# every table, answer and query consistently.

# ---------------------------------------------------------------------------
# Shared DuckDB backend. Plain ``ccflow.BaseModel`` — not callable itself,
# but registered so all providers share one connection and one ``dbgen`` call.
# ---------------------------------------------------------------------------
tpch:
backend:
_target_: ccflow.examples.tpch.TPCHDuckDBBackend
scale_factor: 0.1

# ---------------------------------------------------------------------------
# Per-table providers. One instance per TPC-H table; the output schema of
# each instance is fixed by its ``table`` field.
# ---------------------------------------------------------------------------
table:
customer:
_target_: ccflow.examples.tpch.TPCHTableProvider
backend: /tpch/backend
table: customer
lineitem:
_target_: ccflow.examples.tpch.TPCHTableProvider
backend: /tpch/backend
table: lineitem
nation:
_target_: ccflow.examples.tpch.TPCHTableProvider
backend: /tpch/backend
table: nation
orders:
_target_: ccflow.examples.tpch.TPCHTableProvider
backend: /tpch/backend
table: orders
part:
_target_: ccflow.examples.tpch.TPCHTableProvider
backend: /tpch/backend
table: part
partsupp:
_target_: ccflow.examples.tpch.TPCHTableProvider
backend: /tpch/backend
table: partsupp
region:
_target_: ccflow.examples.tpch.TPCHTableProvider
backend: /tpch/backend
table: region
supplier:
_target_: ccflow.examples.tpch.TPCHTableProvider
backend: /tpch/backend
table: supplier

# ---------------------------------------------------------------------------
# Reference answers, one per query, served straight from DuckDB's
# ``tpch_answers()`` table at the configured scale factor.
# ---------------------------------------------------------------------------
answer:
Q1:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 1
Q2:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 2
Q3:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 3
Q4:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 4
Q5:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 5
Q6:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 6
Q7:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 7
Q8:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 8
Q9:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 9
Q10:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 10
Q11:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 11
Q12:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 12
Q13:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 13
Q14:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 14
Q15:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 15
Q16:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 16
Q17:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 17
Q18:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 18
Q19:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 19
Q20:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 20
Q21:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 21
Q22:
_target_: ccflow.examples.tpch.TPCHAnswerProvider
backend: /tpch/backend
query_id: 22

# ---------------------------------------------------------------------------
# The 22 TPC-H queries. Each ``TPCHQuery`` is the same Python class with a
# different ``query_id`` and a different tuple of table-provider inputs.
# Wiring the inputs in YAML makes each query's table dependencies explicit
# and overridable per-query.
# ---------------------------------------------------------------------------
query:
Q1:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 1
inputs: [/table/lineitem]
Q2:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 2
inputs: [/table/region, /table/nation, /table/supplier, /table/part, /table/partsupp]
Q3:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 3
inputs: [/table/customer, /table/lineitem, /table/orders]
Q4:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 4
inputs: [/table/lineitem, /table/orders]
Q5:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 5
inputs: [/table/region, /table/nation, /table/customer, /table/lineitem, /table/orders, /table/supplier]
Q6:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 6
inputs: [/table/lineitem]
Q7:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 7
inputs: [/table/nation, /table/customer, /table/lineitem, /table/orders, /table/supplier]
Q8:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 8
inputs: [/table/part, /table/supplier, /table/lineitem, /table/orders, /table/customer, /table/nation, /table/region]
Q9:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 9
inputs: [/table/part, /table/partsupp, /table/nation, /table/lineitem, /table/orders, /table/supplier]
Q10:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 10
inputs: [/table/customer, /table/nation, /table/lineitem, /table/orders]
Q11:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 11
inputs: [/table/nation, /table/partsupp, /table/supplier]
Q12:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 12
inputs: [/table/lineitem, /table/orders]
Q13:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 13
inputs: [/table/customer, /table/orders]
Q14:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 14
inputs: [/table/lineitem, /table/part]
Q15:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 15
inputs: [/table/lineitem, /table/supplier]
Q16:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 16
inputs: [/table/part, /table/partsupp, /table/supplier]
Q17:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 17
inputs: [/table/lineitem, /table/part]
Q18:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 18
inputs: [/table/customer, /table/lineitem, /table/orders]
Q19:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 19
inputs: [/table/lineitem, /table/part]
Q20:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 20
inputs: [/table/part, /table/partsupp, /table/nation, /table/lineitem, /table/supplier]
Q21:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 21
inputs: [/table/lineitem, /table/nation, /table/orders, /table/supplier]
Q22:
_target_: ccflow.examples.tpch.TPCHQuery
query_id: 22
inputs: [/table/customer, /table/orders]
Loading
Loading