DOI-USGS
diff --git a/‎demos/chunk_plan.png‎
47.9 KB b/‎demos/chunk_plan.png‎
47.9 KB
diff --git a/‎demos/visualize_chunk_plan.py‎
Lines changed: 270 additions & 0 deletions b/‎demos/visualize_chunk_plan.py‎
Lines changed: 270 additions & 0 deletions
@@ -0,0 +1,270 @@
+"""Visualize a 2D chunk plan from ``waterdata.chunking._plan_joint``.
+
+Builds a representative over-budget query (long site list + long
+top-level-``OR`` filter), runs the joint planner against the real
+``_construct_api_requests`` builder, and renders the resulting
+list × filter cartesian product as a heatmap. Each cell is one
+sub-request the wrapper would issue; the colour is its URL byte
+size relative to the limit. Use this to eyeball plans for the kind
+of correctness properties that are easy to miss in unit tests:
+
+- every cell ≤ the limit (no plan is allowed to overflow),
+- the headroom is reasonably balanced (no chunk is wasted),
+- the filter partition matches the OR-axis layout you expect,
+- the cartesian product is rectangular (every list chunk pairs
+  with every filter chunk exactly once).
+
+Run: ``python demos/visualize_chunk_plan.py``. Saves the figure to
+``demos/chunk_plan.png`` and also prints the plan as a text table.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from dataretrieval.waterdata.chunking import _plan_joint, _request_bytes
+from dataretrieval.waterdata.utils import _construct_api_requests
+
+
+def build_demo_args(n_sites: int = 60, url_limit: int = 800) -> tuple[dict, int]:
+    """Build a query that needs both list and filter chunking to fit.
+
+    With the defaults this is 60 USGS site ids plus 16 ``time``
+    OR-clauses, paired with an 800-byte URL limit. Only the request
+    arguments are assembled here; no URL is built or measured in this
+    function.
+
+    ``n_sites`` is the number of synthetic ``USGS-########`` ids placed
+    in ``monitoring_location_id``. ``url_limit`` is passed through
+    unchanged as the second element of the returned tuple. Returns
+    ``(args, url_limit)``.
+    """
+    sites = [f"USGS-{i:08d}" for i in range(n_sites)]
+    # 16 datetime equality clauses (4 months x 4 days) — long enough
+    # that ``k=1`` forces many list chunks, but halving the filter frees
+    # enough budget that a moderate ``k>1`` plus a coarser list split is
+    # the joint planner's optimum. The demo aims to exercise BOTH
+    # chunking dims.
+    clauses = [
+        f"time='2024-{m:02d}-{d:02d}T00:00:00Z'"
+        for m in range(1, 5)
+        for d in (1, 8, 15, 22)
+    ]
+    args = {
+        "service": "daily",
+        "monitoring_location_id": sites,
+        "filter": " OR ".join(clauses),
+    }
+    return args, url_limit
+
+
+def gather_subrequest_bytes(
+    args: dict,
+    list_plan: dict[str, list[list]],
+    filter_chunks: list[str | None],
+) -> tuple[np.ndarray, list[str], list[str], str | None]:
+    """Measure every sub-request in the list × filter product.
+
+    For each (list chunk, filter chunk) pair the original ``args`` are
+    copied, the chunk values are substituted in, the request is built
+    with ``_construct_api_requests`` and its size is taken from
+    ``_request_bytes``. Rows of the returned integer matrix follow the
+    list chunks; columns follow the filter chunks.
+
+    Returns ``(matrix, row_labels, col_labels, list_dim)``. Labels are
+    running ``[start:stop]`` ranges with a member count, for use as
+    axis ticks. ``list_dim`` is the name of the chunked list parameter,
+    or ``None`` when ``list_plan`` is empty.
+    """
+    if list_plan:
+        # Only the first list dimension of the plan is rendered.
+        list_dim = next(iter(list_plan))
+        list_chunks = list_plan[list_dim]
+    else:
+        # Nothing was split on the list axis: show one unchunked row.
+        list_dim, list_chunks = None, [None]
+
+    sizes = np.zeros((len(list_chunks), len(filter_chunks)), dtype=int)
+    for row, members in enumerate(list_chunks):
+        row_overrides = {} if members is None else {list_dim: members}
+        for col, expression in enumerate(filter_chunks):
+            overrides = dict(row_overrides)
+            if expression is not None:
+                overrides["filter"] = expression
+            request = _construct_api_requests(**{**args, **overrides})
+            sizes[row, col] = _request_bytes(request)
+
+    # Row labels: running index range over the list values.
+    row_labels = []
+    start = 0
+    for members in list_chunks:
+        if members is None:
+            row_labels.append("all (no chunking)")
+            continue
+        stop = start + len(members)
+        row_labels.append(f"[{start}:{stop}]\n({len(members)} items)")
+        start = stop
+
+    # Column labels: running index range over the OR-clauses, counted
+    # from the number of " OR " separators in each chunk.
+    col_labels = []
+    start = 0
+    for expression in filter_chunks:
+        if expression is None:
+            col_labels.append("no filter")
+            continue
+        n_clauses = expression.count(" OR ") + 1
+        stop = start + n_clauses
+        col_labels.append(f"[{start}:{stop}]\n({n_clauses} clauses)")
+        start = stop
+
+    return sizes, row_labels, col_labels, list_dim
+
+
+def draw_heatmap(
+    bytes_: np.ndarray,
+    row_labels: list[str],
+    col_labels: list[str],
+    url_limit: int,
+    out_path: Path,
+    list_dim: str | None,
+) -> None:
+    """Render the byte matrix as a heatmap and save it to ``out_path``.
+
+    Cells are coloured by byte count on a green-to-red scale running
+    from 0 to ``url_limit``, so a cell close to the limit is red. Cells
+    strictly over ``url_limit`` get a separate magenta "over" colour;
+    a magenta cell means the planner produced an over-budget
+    sub-request (visible bug). Each cell is annotated with its byte
+    count. ``list_dim`` only affects the y-axis label.
+    """
+    n_rows, n_cols = bytes_.shape
+    fig, ax = plt.subplots(figsize=(max(6, 1.2 * n_cols), max(4, 0.35 * n_rows + 1.5)))
+
+    # vmax = url_limit pins the red end of the colour scale to the
+    # budget. Values above vmax would otherwise be clipped to that same
+    # red and be indistinguishable from a valid cell sitting just under
+    # the limit, so give the over-range its own colour. Work on a copy
+    # so the registered colormap is not modified.
+    cmap = plt.get_cmap("RdYlGn_r").copy()
+    cmap.set_over("magenta")
+    im = ax.imshow(
+        bytes_,
+        cmap=cmap,
+        vmin=0,
+        vmax=url_limit,
+        aspect="auto",
+    )
+    ax.set_xticks(range(n_cols))
+    ax.set_xticklabels(col_labels, rotation=30, ha="right")
+    ax.set_yticks(range(n_rows))
+    ax.set_yticklabels(row_labels)
+    ax.set_xlabel("Filter sub-chunk (OR-clause range)")
+    ax.set_ylabel(
+        f"List sub-chunk ({list_dim} range)"
+        if list_dim is not None
+        else "List sub-chunk"
+    )
+    ax.set_title(
+        f"Joint chunk plan: {n_rows} × {n_cols} = {n_rows * n_cols} "
+        f"sub-requests · url_limit={url_limit} bytes"
+    )
+
+    # Per-cell annotations; switch to white text on the darker cells.
+    for r in range(n_rows):
+        for c in range(n_cols):
+            ax.text(
+                c,
+                r,
+                f"{bytes_[r, c]}",
+                ha="center",
+                va="center",
+                color="black" if bytes_[r, c] < 0.6 * url_limit else "white",
+                fontsize=9,
+            )
+
+    # extend="max" adds the over-range colour to the colourbar.
+    fig.colorbar(im, ax=ax, label="URL bytes", extend="max")
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=120)
+    plt.close(fig)
+
+
+def print_text_table(
+    bytes_: np.ndarray,
+    row_labels: list[str],
+    col_labels: list[str],
+    url_limit: int,
+) -> None:
+    """Print the plan to stdout as a plain-text table.
+
+    Emits a short summary (limit, plan shape, smallest and largest
+    cell, remaining headroom) followed by the byte matrix with one
+    line per list chunk. Newlines inside labels are replaced by spaces
+    so each label stays on one line.
+    """
+    print(f"\nurl_limit = {url_limit} bytes")
+    print(
+        f"plan shape: {bytes_.shape[0]} list × {bytes_.shape[1]} filter "
+        f"= {bytes_.size} sub-requests"
+    )
+    print(
+        f"min cell: {bytes_.min()} bytes · max cell: {bytes_.max()} bytes "
+        f"(headroom: {url_limit - bytes_.max()} bytes)"
+    )
+    print()
+
+    # Single-line versions of the labels, used for widths and output.
+    flat_cols = [label.replace("\n", " ") for label in col_labels]
+    flat_rows = [label.replace("\n", " ") for label in row_labels]
+    col_w = max(8, max(map(len, flat_cols)) + 1)
+    row_w = max(map(len, flat_rows)) + 2
+
+    header = "".join(label.rjust(col_w) for label in flat_cols)
+    print(" " * row_w + header)
+    for r, label in enumerate(flat_rows):
+        cells = "".join(str(int(value)).rjust(col_w) for value in bytes_[r])
+        print(label.ljust(row_w) + cells)
+
+
+def spot_check_partition(
+    args: dict,
+    list_plan: dict[str, list[list]],
+    filter_chunks: list[str | None],
+    list_dim: str | None,
+) -> None:
+    """Sanity-check that the plan is a true partition of the inputs.
+
+    When ``list_dim`` is given, the values across all of its chunks
+    must be the same multiset as ``args[list_dim]``. When ``args`` has
+    a filter and it was split into more than one chunk, the OR-clauses
+    across the chunks must equal the original clauses, in order.
+    Catches partition bugs that the heatmap alone wouldn't surface
+    (e.g. a chunk that drops or duplicates members).
+
+    Prints one confirmation line per check that ran. Raises
+    ``AssertionError`` on a mismatch. The checks use an explicit
+    ``raise`` instead of an ``assert`` statement so they still run
+    under ``python -O``.
+    """
+    if list_dim is not None:
+        original = list(args[list_dim])
+        seen = [v for chunk in list_plan[list_dim] for v in chunk]
+        if sorted(seen) != sorted(original):
+            raise AssertionError(
+                f"list partition lost or duplicated values: "
+                f"{len(seen)} seen vs {len(original)} expected"
+            )
+        print(f"list partition covers all {len(original)} {list_dim}s exactly once")
+
+    original_filter = args.get("filter")
+    if original_filter and len(filter_chunks) > 1:
+        original_clauses = [c.strip() for c in original_filter.split(" OR ")]
+        # ``None`` chunks carry no filter text and contribute no clauses.
+        seen_clauses = [
+            c.strip()
+            for fc in filter_chunks
+            if fc is not None
+            for c in fc.split(" OR ")
+        ]
+        if seen_clauses != original_clauses:
+            raise AssertionError(
+                "filter partition must cover original clauses in order, exactly once"
+            )
+        print(
+            f"filter partition covers all {len(original_clauses)} "
+            f"OR-clauses exactly once, in order"
+        )
+
+
+def main() -> None:
+    """Plan the demo query, render it, and verify the result.
+
+    Runs ``_plan_joint`` on the demo arguments, measures every
+    sub-request, writes ``chunk_plan.png`` next to this script, prints
+    the text table and runs the partition spot-check. Exits via
+    ``SystemExit`` with a message when the planner returns ``None``,
+    and with status 1 when any cell exceeds the URL limit.
+    """
+    args, url_limit = build_demo_args()
+
+    plan = _plan_joint(args, _construct_api_requests, url_limit)
+    if plan is None:
+        raise SystemExit(
+            "Demo args fit under url_limit — pick a tighter limit or a "
+            "longer query so the planner actually fans out."
+        )
+    list_plan, filter_chunks = plan
+
+    sizes, row_labels, col_labels, list_dim = gather_subrequest_bytes(
+        args, list_plan, filter_chunks
+    )
+
+    figure_path = Path(__file__).parent / "chunk_plan.png"
+    draw_heatmap(sizes, row_labels, col_labels, url_limit, figure_path, list_dim)
+    print(f"wrote {figure_path}")
+
+    print_text_table(sizes, row_labels, col_labels, url_limit)
+    spot_check_partition(args, list_plan, filter_chunks, list_dim)
+
+    # Final gate: every measured cell must be within the limit.
+    offenders = np.argwhere(sizes > url_limit)
+    if not len(offenders):
+        print("all cells within url_limit — plan is valid")
+        return
+
+    first_row, first_col = offenders[0]
+    print(
+        f"\nBUG: {len(offenders)} cell(s) over the {url_limit}-byte limit "
+        f"(first: row {first_row}, col {first_col} = "
+        f"{sizes[first_row, first_col]} bytes)"
+    )
+    raise SystemExit(1)
+
+
+# Run the demo only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()