From 9d6bc837acdd1e02c320fd70b19dd8361190f041 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 14 Apr 2026 13:30:28 -0300 Subject: [PATCH 1/6] ci: add graphify structural impact analysis to PR review and structure audit Add a graphify-based AST analysis tool that builds a directed graph of the codebase (~2s, no LLM calls) to detect architectural impact. Integrates into both the PR review workflow (pre-computed before claude runs) and the Wednesday structure audit (with week-over-week diff). PR review: extracts changed files against the full codebase graph, reports risk level (LOW/MEDIUM/HIGH), god nodes affected, import direction violations, and cross-package dependencies. Output saved to /tmp and read by the review agent. Structure audit: produces god node rankings, cross-package edge summary table, import violation detection, and graph diff against previous week's cached graph. Baselines saved for runner memory trend tracking. --- .agents/recipes/pr-review/recipe.md | 13 +- .agents/recipes/structure/recipe.md | 39 ++- .agents/skills/review-code/SKILL.md | 27 ++ .agents/tools/structural_impact.py | 354 +++++++++++++++++++++ .github/workflows/agentic-ci-daily.yml | 8 +- .github/workflows/agentic-ci-pr-review.yml | 25 ++ 6 files changed, 461 insertions(+), 5 deletions(-) create mode 100644 .agents/tools/structural_impact.py diff --git a/.agents/recipes/pr-review/recipe.md b/.agents/recipes/pr-review/recipe.md index abcbf83e7..cad34f052 100644 --- a/.agents/recipes/pr-review/recipe.md +++ b/.agents/recipes/pr-review/recipe.md @@ -17,10 +17,17 @@ Review pull request #{{pr_number}} using the `review-code` skill. ## Instructions -1. Run `/review-code {{pr_number}}` -2. The skill writes the review to `/tmp/review-{{pr_number}}.md`. If it does +1. If `/tmp/structural-impact-{{pr_number}}.md` exists, read it. This contains + a pre-computed structural impact analysis (risk level, god nodes affected, + import direction violations, cross-package dependencies). Use it to inform + your review - high-risk PRs warrant extra scrutiny on blast radius and + backward compatibility. +2. Run `/review-code {{pr_number}}` +3. The skill writes the review to `/tmp/review-{{pr_number}}.md`. If it does not, save the review output there yourself. -3. Before finishing, read `/tmp/review-{{pr_number}}.md` and verify it contains +4. If the structural impact analysis exists, append its content to the review + file (before the Verdict section) under a `### Structural Impact` heading. +5. Before finishing, read `/tmp/review-{{pr_number}}.md` and verify it contains a valid review (Summary, Findings, Verdict sections). If the file is empty or malformed, write a brief "Review could not be completed" note to the file instead. diff --git a/.agents/recipes/structure/recipe.md b/.agents/recipes/structure/recipe.md index eb5b44a90..590fa3b29 100644 --- a/.agents/recipes/structure/recipe.md +++ b/.agents/recipes/structure/recipe.md @@ -111,7 +111,38 @@ done This enables modern type syntax (`list[str]` instead of `List[str]`, `str | None` instead of `Optional[str]`) and defers annotation evaluation. -### 4. Dead exports +### 4. Graphify structural analysis + +Run the graphify-based structural analysis tool. This builds a directed AST +graph of the entire codebase (~2s) and produces god node rankings, +cross-package edge counts, and import direction violations that grep-based +checks miss (e.g., inferred cross-file dependencies from class-level import +resolution). + +```bash +PREV_GRAPH="" +if [ -f "graphify-out/graph.json" ]; then + PREV_GRAPH="--previous-graph graphify-out/graph.json" +fi +python .agents/tools/structural_impact.py --full $PREV_GRAPH \ + --output /tmp/graphify-structure.md +``` + +Read `/tmp/graphify-structure.md` and incorporate relevant findings: + +- **God nodes**: track the top 10 most-connected entities. Compare against + the previous baseline in runner memory. A god node gaining 10+ connections + since last week signals growing coupling. +- **Cross-package edge summary**: the table shows edge counts per direction. + All edges should flow interface -> engine -> config. Any VIOLATION row is a + finding for the import boundary section. +- **Changes since last run**: new/removed entities and relationships since + the previous audit. Flag any unexpected removals of god nodes. + +After the audit, update `baselines` in runner memory with the current god +node list and cross-package edge counts from `graphify-out/baselines.json`. + +### 5. Dead exports Find symbols in `__all__` that nothing outside their module references: @@ -161,12 +192,18 @@ Write the report to `/tmp/audit-{{suite}}.md`: |---------|--------|--------|----------| | ... | ... | ... | No external imports found | +### Structural graph analysis (graphify) + +(Paste the content of /tmp/graphify-structure.md here) + ### Summary - N import boundary violations (M new since last run) - N lazy import violations (M new) - N files missing future annotations (M new) - N potentially dead exports (M new) +- God nodes: top entity has N connections (delta: +/-M from last run) +- Cross-package edges: N total (M violations) ``` If no findings in any category, write `NO_FINDINGS` on the first line instead. diff --git a/.agents/skills/review-code/SKILL.md b/.agents/skills/review-code/SKILL.md index 31b450acb..e6978e873 100644 --- a/.agents/skills/review-code/SKILL.md +++ b/.agents/skills/review-code/SKILL.md @@ -112,6 +112,33 @@ Before diving into details, build a mental model: 5. **Note cross-cutting concerns** (e.g., a rename that touches many files vs. substantive logic changes) 6. **Check existing feedback** (PR mode): inspect both inline comments (Step 1, item 5) and PR-level review bodies (Step 1, item 5b) so you don't duplicate feedback already given +## Step 3.5: Structural Impact (if available) + +Check for a pre-computed structural impact analysis at +`/tmp/structural-impact-.md`. This file is produced by +`graphify` AST extraction and contains: + +- **Risk level** (LOW/MEDIUM/HIGH) based on god nodes touched, import + violations, and cluster spread +- **Core abstractions modified** - the most-connected entities in the + codebase (high blast radius if changed) +- **Import direction violations** - cross-package edges that violate the + layering rule (interface -> engine -> config) +- **High-connectivity changes** - entities with many dependents +- **Cross-package dependencies** - edges crossing package boundaries + +If the file exists, read it and use it to calibrate your review: + +- **HIGH risk**: apply extra scrutiny in Pass 2 (Design & Architecture). + Verify backward compatibility for god nodes. Check that cross-package + changes don't break existing callers. +- **Import violations**: flag them as Warnings in the review if they + represent real dependency direction issues (not just inferred edges). +- **LOW risk**: the structural analysis confirms a localized change. You + can focus more on correctness (Pass 1) and less on architecture. + +If the file does not exist (e.g. local branch review), skip this step. + ## Step 4: Review Each Changed File (Multi-Pass) Perform **at least 2-3 passes** over the changed files. Each pass has a different focus — this catches issues that a single read-through would miss. diff --git a/.agents/tools/structural_impact.py b/.agents/tools/structural_impact.py new file mode 100644 index 000000000..4f9d77845 --- /dev/null +++ b/.agents/tools/structural_impact.py @@ -0,0 +1,354 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Structural impact analysis using graphify AST extraction. + +Usage: + python .agents/tools/structural_impact.py --changed-files file1.py file2.py + python .agents/tools/structural_impact.py --full [--previous-graph graphify-out/graph.json] + python .agents/tools/structural_impact.py --changed-files file1.py --output /tmp/report.md +""" + +from __future__ import annotations + +import argparse +import json +import time +from pathlib import Path + +from graphify.analyze import god_nodes, graph_diff +from graphify.build import build_from_json +from graphify.cluster import cluster, score_all +from graphify.extract import extract + +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent + +# DD layering: interface -> engine -> config (left depends on right). +_LEGAL_DIRECTIONS = {("interface", "engine"), ("interface", "config"), ("engine", "config")} + +_PACKAGE_DIRS = [ + _REPO_ROOT / "packages" / "data-designer-engine" / "src" / "data_designer" / "engine", + _REPO_ROOT / "packages" / "data-designer-config" / "src" / "data_designer" / "config", + _REPO_ROOT / "packages" / "data-designer" / "src" / "data_designer", +] + +_STDLIB_LABELS = {"ABC", "BaseModel", "Enum", "Field"} + + +def _collect_source_files() -> list[Path]: + files: list[Path] = [] + for d in _PACKAGE_DIRS: + if d.exists(): + files.extend(p for p in d.rglob("*.py") if not any(part.startswith(".") for part in p.relative_to(d).parts)) + return sorted(files) + + +def _get_package(filepath: str) -> str: + if "data-designer-engine" in filepath: + return "engine" + if "data-designer-config" in filepath: + return "config" + parts = filepath.split("/") + for i, p in enumerate(parts): + if p == "data-designer" and i + 1 < len(parts) and parts[i + 1] == "src": + return "interface" + return "" + + +def _rel(filepath: str) -> str: + try: + return str(Path(filepath).relative_to(_REPO_ROOT)) + except ValueError: + return filepath + + +def _dedup(items: list[dict], key_a: str = "from_label", key_b: str = "to_label") -> list[dict]: + seen: set[tuple[str, str]] = set() + out: list[dict] = [] + for e in items: + k = (e[key_a][:30], e[key_b][:30]) + if k not in seen: + seen.add(k) + out.append(e) + return out + + +def _build_graph(files: list[Path]) -> dict: + """Extract, build directed graph, cluster, and find god nodes.""" + ext = extract(files) + G = build_from_json(ext, directed=True) + comms = cluster(G) + return {"graph": G, "communities": comms, "cohesion": score_all(G, comms), "god_nodes": god_nodes(G, top_n=15)} + + +def _cross_package_edges(G, node_ids: set[str] | None = None) -> tuple[list[dict], list[dict], dict[str, int]]: + """Collect cross-package edges. If node_ids is given, only from those nodes. + + Returns (cross_pkg_edges, violations, direction_counts). + """ + cross_pkg: list[dict] = [] + violations: list[dict] = [] + direction_counts: dict[str, int] = {} + + edges = ( + ((nid, nbr, G.edges[nid, nbr]) for nid in node_ids if nid in G for nbr in G.successors(nid)) + if node_ids is not None + else ((u, v, d) for u, v, d in G.edges(data=True)) + ) + + for u, v, data in edges: + u_pkg = _get_package(G.nodes[u].get("source_file", "")) + v_pkg = _get_package(G.nodes[v].get("source_file", "")) + if not u_pkg or not v_pkg or u_pkg == v_pkg: + continue + relation = data.get("relation", "?") + if relation == "contains": + continue + key = f"{u_pkg} -> {v_pkg}" + direction_counts[key] = direction_counts.get(key, 0) + 1 + entry = { + "from_label": G.nodes[u].get("label", u)[:50], + "from_pkg": u_pkg, + "to_label": G.nodes[v].get("label", v)[:50], + "to_pkg": v_pkg, + "relation": relation, + } + cross_pkg.append(entry) + if (u_pkg, v_pkg) not in _LEGAL_DIRECTIONS: + violations.append(entry) + + return cross_pkg, violations, direction_counts + + +# -- Markdown formatters -- + + +def _fmt_violations(violations: list[dict], limit: int = 10) -> list[str]: + if not violations: + return [] + lines = [ + f"#### Import Direction Violations ({len(violations)})", + "_Legal direction: interface -> engine -> config_", + "", + ] + for v in violations[:limit]: + lines.append(f"- `{v['from_label']}` ({v['from_pkg']}) --{v['relation']}--> `{v['to_label']}` ({v['to_pkg']})") + if len(violations) > limit: + lines.append(f"- +{len(violations) - limit} more") + lines.append("") + return lines + + +def _fmt_gods(gods: list[dict], affected_only: bool = False) -> list[str]: + if not gods: + return [] + if affected_only: + lines = ["#### Core Abstractions Modified"] + for g in gods: + lines.append(f"- `{g['label']}` - #{g['rank']} most connected ({g['edges']} deps)") + else: + lines = [ + "#### God Nodes (most connected entities)", + "", + "| Rank | Entity | Connections |", + "|------|--------|-------------|", + ] + for g in gods: + lines.append(f"| {g['rank']} | `{g['label']}` | {g['edges']} |") + lines.append("") + return lines + + +def _fmt_high_impact(items: list[dict], limit: int = 8) -> list[str]: + if not items: + return [] + lines = ["#### High-Connectivity Changes"] + for h in items[:limit]: + lines.append(f"- `{h['label']}` ({h['degree']} deps) in `{h['source']}`") + if len(items) > limit: + lines.append(f"- +{len(items) - limit} more") + lines.append("") + return lines + + +def _fmt_cross_pkg(items: list[dict], limit: int = 6) -> list[str]: + if not items: + return [] + lines = ["#### Cross-Package Dependencies"] + for e in items[:limit]: + lines.append(f"- `{e['from_label']}` ({e['from_pkg']}) --{e['relation']}--> `{e['to_label']}` ({e['to_pkg']})") + if len(items) > limit: + lines.append(f"- +{len(items) - limit} more") + lines.append("") + return lines + + +# -- Modes -- + + +def _changed_files_mode(changed_files: list[Path]) -> str: + """PR review: analyze changed files against full codebase.""" + t0 = time.monotonic() + analysis = _build_graph(_collect_source_files()) + G, communities, gods = analysis["graph"], analysis["communities"], analysis["god_nodes"] + + changed_node_ids = {n["id"] for n in extract(changed_files)["nodes"]} + + node_to_community = {n: cid for cid, nodes in communities.items() for n in nodes} + affected_gods = [{"rank": gods.index(g) + 1, **g} for g in gods if g["id"] in changed_node_ids] + affected_communities = {node_to_community[nid] for nid in changed_node_ids if nid in node_to_community} + + high_impact = sorted( + [ + { + "label": G.nodes[nid].get("label", nid), + "source": _rel(G.nodes[nid].get("source_file", "")), + "degree": G.degree(nid), + } + for nid in changed_node_ids + if nid in G + and G.degree(nid) >= 5 + and G.nodes[nid].get("source_file") + and G.nodes[nid].get("label", "") not in _STDLIB_LABELS + ], + key=lambda x: x["degree"], + reverse=True, + ) + + cross_pkg, violations, _ = _cross_package_edges(G, changed_node_ids) + unique_cross, unique_violations = _dedup(cross_pkg), _dedup(violations) + + # Risk level. + if affected_gods or unique_violations: + reasons = [] + if affected_gods: + reasons.append(f"{len(affected_gods)} core abstraction(s) modified") + if unique_violations: + reasons.append(f"{len(unique_violations)} import direction violation(s)") + risk, risk_reason = "HIGH", "; ".join(reasons) + elif len(affected_communities) > 3 or any(h["degree"] > 20 for h in high_impact): + risk, risk_reason = "MEDIUM", f"{len(affected_communities)} clusters affected" + else: + risk, risk_reason = "LOW", "localized change" + + elapsed = time.monotonic() - t0 + lines = [ + f"### Structural Impact _(graphify, {elapsed:.1f}s)_", + "", + f"**Risk: {risk}** ({risk_reason})", + f"- {len(changed_files)} Python files, {len(changed_node_ids)} AST entities, " + f"{len(affected_communities)}/{len(communities)} clusters", + "", + ] + lines += _fmt_violations(unique_violations, limit=5) + lines += _fmt_gods(affected_gods, affected_only=True) + lines += _fmt_high_impact(high_impact) + lines += _fmt_cross_pkg(unique_cross) + return "\n".join(lines) + + +def _full_mode(previous_graph_path: str | None = None) -> str: + """Structure audit: full codebase analysis with optional diff.""" + t0 = time.monotonic() + analysis = _build_graph(_collect_source_files()) + G, communities, gods = analysis["graph"], analysis["communities"], analysis["god_nodes"] + elapsed = time.monotonic() - t0 + + ranked_gods = [{"rank": i, **g} for i, g in enumerate(gods[:10], 1)] + cross_pkg, violations, direction_counts = _cross_package_edges(G) + unique_violations = _dedup(violations) + + lines = [ + f"### Structural Analysis _(graphify, {elapsed:.1f}s)_", + "", + f"Codebase: {G.number_of_nodes()} entities, {G.number_of_edges()} relationships, {len(communities)} clusters", + "", + ] + lines += _fmt_gods(ranked_gods) + + # Cross-package direction summary table. + lines += ["#### Cross-Package Edge Summary", "", "| Direction | Count | Status |", "|-----------|-------|--------|"] + for d, count in sorted(direction_counts.items(), key=lambda x: -x[1]): + pkgs = d.split(" -> ") + lines.append(f"| {d} | {count} | {'OK' if (pkgs[0], pkgs[1]) in _LEGAL_DIRECTIONS else 'VIOLATION'} |") + lines.append("") + + lines += _fmt_violations(unique_violations) + + # Graph diff against previous run. + if previous_graph_path and Path(previous_graph_path).exists(): + try: + from networkx.readwrite import json_graph + + prev_data = json.loads(Path(previous_graph_path).read_text(encoding="utf-8")) + try: + G_prev = json_graph.node_link_graph(prev_data, edges="links") + except TypeError: + G_prev = json_graph.node_link_graph(prev_data) + diff = graph_diff(G_prev, G) + lines += ["#### Changes Since Last Run", "", f"**{diff['summary']}**", ""] + for key, label in [("new_nodes", "New entities"), ("removed_nodes", "Removed entities")]: + if diff[key]: + names = [n["label"] for n in diff[key][:8]] + lines.append(f"{label}: {', '.join(f'`{l}`' for l in names)}") + if len(diff[key]) > 8: + lines.append(f" +{len(diff[key]) - 8} more") + lines.append("") + except Exception as exc: + lines += [f"_Could not diff against previous graph: {exc}_", ""] + + # Save graph + baselines for next run. + from graphify.export import to_json + + out_dir = _REPO_ROOT / "graphify-out" + out_dir.mkdir(exist_ok=True) + to_json(G, communities, str(out_dir / "graph.json")) + (out_dir / "baselines.json").write_text( + json.dumps( + { + "total_nodes": G.number_of_nodes(), + "total_edges": G.number_of_edges(), + "total_communities": len(communities), + "god_nodes": [{"label": g["label"], "edges": g["edges"]} for g in gods[:10]], + "cross_package_edges": direction_counts, + "violation_count": len(violations), + }, + indent=2, + ), + encoding="utf-8", + ) + return "\n".join(lines) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Structural impact analysis via graphify") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("--changed-files", nargs="+", help="PR review mode: list of changed Python files") + group.add_argument("--full", action="store_true", help="Full analysis mode for the structure audit") + parser.add_argument("--previous-graph", help="Path to previous graph.json for diff (full mode only)") + parser.add_argument("--output", help="Write output to file instead of stdout") + args = parser.parse_args() + + if args.changed_files: + changed = [ + p + for raw in args.changed_files + for p in [Path(raw) if Path(raw).is_absolute() else _REPO_ROOT / raw] + if p.exists() and p.suffix == ".py" + ] + report = ( + _changed_files_mode(changed) + if changed + else "### Structural Impact\n\nNo Python files changed - skipping.\n" + ) + else: + report = _full_mode(args.previous_graph) + + if args.output: + Path(args.output).write_text(report, encoding="utf-8") + else: + print(report) + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/agentic-ci-daily.yml b/.github/workflows/agentic-ci-daily.yml index 263f6e0ca..cae2e4e22 100644 --- a/.github/workflows/agentic-ci-daily.yml +++ b/.github/workflows/agentic-ci-daily.yml @@ -92,7 +92,9 @@ jobs: id: cache uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 with: - path: .agentic-ci-state + path: | + .agentic-ci-state + graphify-out key: agentic-ci-state-${{ matrix.suite }}-${{ github.run_id }} restore-keys: | agentic-ci-state-${{ matrix.suite }}- @@ -119,6 +121,10 @@ jobs: print(f' config: {cv} engine: {ev}') " 2>/dev/null || echo " (version check skipped)" + - name: Install graphify + if: matrix.suite == 'structure' + run: .venv/bin/python -m pip install graphifyy 2>&1 | tail -3 + - name: Pre-flight checks env: ANTHROPIC_BASE_URL: ${{ secrets.AGENTIC_CI_API_BASE_URL }} diff --git a/.github/workflows/agentic-ci-pr-review.yml b/.github/workflows/agentic-ci-pr-review.yml index 7b995d58e..8441cc00e 100644 --- a/.github/workflows/agentic-ci-pr-review.yml +++ b/.github/workflows/agentic-ci-pr-review.yml @@ -152,6 +152,31 @@ jobs: sparse-checkout: .agents/recipes path: base-recipes + - name: Install dev environment + run: | + make install-dev + echo "${{ github.workspace }}/.venv/bin" >> "$GITHUB_PATH" + + - name: Install graphify + run: .venv/bin/python -m pip install graphifyy 2>&1 | tail -3 + + - name: Structural impact analysis + env: + PR_NUMBER: ${{ steps.pr.outputs.number }} + GH_TOKEN: ${{ github.token }} + run: | + CHANGED_PY=$(gh pr diff "$PR_NUMBER" --name-only | grep '\.py$' || true) + if [ -n "$CHANGED_PY" ]; then + .venv/bin/python .agents/tools/structural_impact.py \ + --changed-files $CHANGED_PY \ + --output "/tmp/structural-impact-${PR_NUMBER}.md" + echo "Structural impact analysis complete:" + cat "/tmp/structural-impact-${PR_NUMBER}.md" + else + echo "No Python files changed - skipping structural analysis." + fi + continue-on-error: true + - name: Pre-flight checks env: ANTHROPIC_BASE_URL: ${{ secrets.AGENTIC_CI_API_BASE_URL }} From cfaf90406d8efb07aa33b7333f86162832be3792 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 21 Apr 2026 20:39:52 +0000 Subject: [PATCH 2/6] fix: harden graphify integration - security, correctness, and CI weight - Fix KeyError: god_nodes() returns 'degree' not 'edges' (3 call sites) - Fix deduped vs raw violation count inconsistency in baselines.json - Security: run structural_impact.py from base-branch checkout so fork PRs cannot inject code that executes with GH_TOKEN in scope - Add --repo-root flag so the tool resolves package paths correctly when invoked from a different checkout directory - Replace make install-dev + .venv with lightweight /tmp/graphify-venv (only graphifyy needed, saves ~2min CI per PR review) - Add graphify-out/ to .gitignore (9MB graph cache is CI-only) --- .agents/tools/structural_impact.py | 29 ++++++++------ .github/workflows/agentic-ci-pr-review.yml | 45 ++++++++++------------ .gitignore | 3 ++ 3 files changed, 42 insertions(+), 35 deletions(-) diff --git a/.agents/tools/structural_impact.py b/.agents/tools/structural_impact.py index 4f9d77845..d5477f0bf 100644 --- a/.agents/tools/structural_impact.py +++ b/.agents/tools/structural_impact.py @@ -6,7 +6,7 @@ Usage: python .agents/tools/structural_impact.py --changed-files file1.py file2.py python .agents/tools/structural_impact.py --full [--previous-graph graphify-out/graph.json] - python .agents/tools/structural_impact.py --changed-files file1.py --output /tmp/report.md + python .agents/tools/structural_impact.py --changed-files file1.py --repo-root /path/to/repo --output /tmp/report.md """ from __future__ import annotations @@ -21,15 +21,16 @@ from graphify.cluster import cluster, score_all from graphify.extract import extract -_REPO_ROOT = Path(__file__).resolve().parent.parent.parent +_REPO_ROOT_DEFAULT = Path(__file__).resolve().parent.parent.parent +_REPO_ROOT = _REPO_ROOT_DEFAULT # DD layering: interface -> engine -> config (left depends on right). _LEGAL_DIRECTIONS = {("interface", "engine"), ("interface", "config"), ("engine", "config")} -_PACKAGE_DIRS = [ - _REPO_ROOT / "packages" / "data-designer-engine" / "src" / "data_designer" / "engine", - _REPO_ROOT / "packages" / "data-designer-config" / "src" / "data_designer" / "config", - _REPO_ROOT / "packages" / "data-designer" / "src" / "data_designer", +_PACKAGE_SUBDIRS = [ + Path("packages") / "data-designer-engine" / "src" / "data_designer" / "engine", + Path("packages") / "data-designer-config" / "src" / "data_designer" / "config", + Path("packages") / "data-designer" / "src" / "data_designer", ] _STDLIB_LABELS = {"ABC", "BaseModel", "Enum", "Field"} @@ -37,7 +38,7 @@ def _collect_source_files() -> list[Path]: files: list[Path] = [] - for d in _PACKAGE_DIRS: + for d in [_REPO_ROOT / sub for sub in _PACKAGE_SUBDIRS]: if d.exists(): files.extend(p for p in d.rglob("*.py") if not any(part.startswith(".") for part in p.relative_to(d).parts)) return sorted(files) @@ -145,7 +146,7 @@ def _fmt_gods(gods: list[dict], affected_only: bool = False) -> list[str]: if affected_only: lines = ["#### Core Abstractions Modified"] for g in gods: - lines.append(f"- `{g['label']}` - #{g['rank']} most connected ({g['edges']} deps)") + lines.append(f"- `{g['label']}` - #{g['rank']} most connected ({g['degree']} deps)") else: lines = [ "#### God Nodes (most connected entities)", @@ -154,7 +155,7 @@ def _fmt_gods(gods: list[dict], affected_only: bool = False) -> list[str]: "|------|--------|-------------|", ] for g in gods: - lines.append(f"| {g['rank']} | `{g['label']}` | {g['edges']} |") + lines.append(f"| {g['rank']} | `{g['label']}` | {g['degree']} |") lines.append("") return lines @@ -309,9 +310,9 @@ def _full_mode(previous_graph_path: str | None = None) -> str: "total_nodes": G.number_of_nodes(), "total_edges": G.number_of_edges(), "total_communities": len(communities), - "god_nodes": [{"label": g["label"], "edges": g["edges"]} for g in gods[:10]], + "god_nodes": [{"label": g["label"], "degree": g["degree"]} for g in gods[:10]], "cross_package_edges": direction_counts, - "violation_count": len(violations), + "violation_count": len(unique_violations), }, indent=2, ), @@ -321,14 +322,20 @@ def _full_mode(previous_graph_path: str | None = None) -> str: def main() -> None: + global _REPO_ROOT + parser = argparse.ArgumentParser(description="Structural impact analysis via graphify") group = parser.add_mutually_exclusive_group(required=True) group.add_argument("--changed-files", nargs="+", help="PR review mode: list of changed Python files") group.add_argument("--full", action="store_true", help="Full analysis mode for the structure audit") parser.add_argument("--previous-graph", help="Path to previous graph.json for diff (full mode only)") parser.add_argument("--output", help="Write output to file instead of stdout") + parser.add_argument("--repo-root", help="Override repo root (when script runs from a different checkout)") args = parser.parse_args() + if args.repo_root: + _REPO_ROOT = Path(args.repo_root).resolve() + if args.changed_files: changed = [ p diff --git a/.github/workflows/agentic-ci-pr-review.yml b/.github/workflows/agentic-ci-pr-review.yml index 8441cc00e..d7d835bc8 100644 --- a/.github/workflows/agentic-ci-pr-review.yml +++ b/.github/workflows/agentic-ci-pr-review.yml @@ -142,23 +142,17 @@ jobs: ref: ${{ steps.head.outputs.sha }} fetch-depth: 0 - # SECURITY: Recipe files define the agent's prompt. Always read them - # from the base branch so a fork PR cannot inject malicious instructions - # while API secrets are in scope. - - name: Checkout base branch recipes + # SECURITY: Recipe and tool files define the agent's prompt and run + # with secrets in scope. Always read them from the base branch so a + # fork PR cannot inject malicious instructions or code. + - name: Checkout base branch agent files uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.base.sha || 'main' }} - sparse-checkout: .agents/recipes - path: base-recipes - - - name: Install dev environment - run: | - make install-dev - echo "${{ github.workspace }}/.venv/bin" >> "$GITHUB_PATH" - - - name: Install graphify - run: .venv/bin/python -m pip install graphifyy 2>&1 | tail -3 + sparse-checkout: | + .agents/recipes + .agents/tools + path: base-agents - name: Structural impact analysis env: @@ -166,15 +160,18 @@ jobs: GH_TOKEN: ${{ github.token }} run: | CHANGED_PY=$(gh pr diff "$PR_NUMBER" --name-only | grep '\.py$' || true) - if [ -n "$CHANGED_PY" ]; then - .venv/bin/python .agents/tools/structural_impact.py \ - --changed-files $CHANGED_PY \ - --output "/tmp/structural-impact-${PR_NUMBER}.md" - echo "Structural impact analysis complete:" - cat "/tmp/structural-impact-${PR_NUMBER}.md" - else + if [ -z "$CHANGED_PY" ]; then echo "No Python files changed - skipping structural analysis." + exit 0 fi + python -m venv /tmp/graphify-venv + /tmp/graphify-venv/bin/python -m pip install graphifyy --quiet 2>&1 | tail -3 + /tmp/graphify-venv/bin/python base-agents/.agents/tools/structural_impact.py \ + --repo-root "${{ github.workspace }}" \ + --changed-files $CHANGED_PY \ + --output "/tmp/structural-impact-${PR_NUMBER}.md" + echo "Structural impact analysis complete:" + cat "/tmp/structural-impact-${PR_NUMBER}.md" continue-on-error: true - name: Pre-flight checks @@ -217,10 +214,10 @@ jobs: set -o pipefail # Build the prompt from _runner.md + recipe, substituting template vars. - # Read from base-recipes/ (checked out from the base branch) so fork + # Read from base-agents/ (checked out from the base branch) so fork # PRs cannot tamper with the agent prompt. - RUNNER_CTX=$(cat base-recipes/.agents/recipes/_runner.md) - RECIPE_BODY=$(cat base-recipes/.agents/recipes/pr-review/recipe.md \ + RUNNER_CTX=$(cat base-agents/.agents/recipes/_runner.md) + RECIPE_BODY=$(cat base-agents/.agents/recipes/pr-review/recipe.md \ | sed '1,/^---$/{ /^---$/,/^---$/d }') PROMPT=$(printf '%s\n\n%s\n' "${RUNNER_CTX}" "${RECIPE_BODY}" \ diff --git a/.gitignore b/.gitignore index 520c5aff9..d972ad842 100644 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,6 @@ packages/data-designer/README.md # Claude worktrees .claude/worktrees/ + +# Graphify structural analysis cache (CI-only, cached via actions/cache) +graphify-out/ From bfc5db69cf44bc8368045d9477122dbc2877e728 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 21 Apr 2026 20:57:46 +0000 Subject: [PATCH 3/6] fix: pin graphifyy version and fix dedup truncation Pin graphifyy==0.4.23 in both CI workflows to prevent breaking changes from unpinned installs. Fix _dedup() label truncation at 30 chars that could merge distinct entities sharing a common prefix. --- .agents/tools/structural_impact.py | 2 +- .github/workflows/agentic-ci-daily.yml | 2 +- .github/workflows/agentic-ci-pr-review.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.agents/tools/structural_impact.py b/.agents/tools/structural_impact.py index d5477f0bf..12561b05b 100644 --- a/.agents/tools/structural_impact.py +++ b/.agents/tools/structural_impact.py @@ -67,7 +67,7 @@ def _dedup(items: list[dict], key_a: str = "from_label", key_b: str = "to_label" seen: set[tuple[str, str]] = set() out: list[dict] = [] for e in items: - k = (e[key_a][:30], e[key_b][:30]) + k = (e[key_a], e[key_b]) if k not in seen: seen.add(k) out.append(e) diff --git a/.github/workflows/agentic-ci-daily.yml b/.github/workflows/agentic-ci-daily.yml index cae2e4e22..151dbee40 100644 --- a/.github/workflows/agentic-ci-daily.yml +++ b/.github/workflows/agentic-ci-daily.yml @@ -123,7 +123,7 @@ jobs: - name: Install graphify if: matrix.suite == 'structure' - run: .venv/bin/python -m pip install graphifyy 2>&1 | tail -3 + run: .venv/bin/python -m pip install graphifyy==0.4.23 2>&1 | tail -3 - name: Pre-flight checks env: diff --git a/.github/workflows/agentic-ci-pr-review.yml b/.github/workflows/agentic-ci-pr-review.yml index d7d835bc8..5f334ca44 100644 --- a/.github/workflows/agentic-ci-pr-review.yml +++ b/.github/workflows/agentic-ci-pr-review.yml @@ -165,7 +165,7 @@ jobs: exit 0 fi python -m venv /tmp/graphify-venv - /tmp/graphify-venv/bin/python -m pip install graphifyy --quiet 2>&1 | tail -3 + /tmp/graphify-venv/bin/python -m pip install graphifyy==0.4.23 --quiet 2>&1 | tail -3 /tmp/graphify-venv/bin/python base-agents/.agents/tools/structural_impact.py \ --repo-root "${{ github.workspace }}" \ --changed-files $CHANGED_PY \ From 913cb28093624acf939575e800b22cf3b9175b7d Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 21 Apr 2026 21:03:29 +0000 Subject: [PATCH 4/6] fix(ci): use array expansion for changed-files arg to handle special filenames Replace unquoted $CHANGED_PY word-split with mapfile + array expansion to prevent glob expansion and correctly handle filenames containing spaces or special characters. --- .github/workflows/agentic-ci-pr-review.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/agentic-ci-pr-review.yml b/.github/workflows/agentic-ci-pr-review.yml index 5f334ca44..0d1ee1c44 100644 --- a/.github/workflows/agentic-ci-pr-review.yml +++ b/.github/workflows/agentic-ci-pr-review.yml @@ -159,8 +159,8 @@ jobs: PR_NUMBER: ${{ steps.pr.outputs.number }} GH_TOKEN: ${{ github.token }} run: | - CHANGED_PY=$(gh pr diff "$PR_NUMBER" --name-only | grep '\.py$' || true) - if [ -z "$CHANGED_PY" ]; then + mapfile -t CHANGED_PY < <(gh pr diff "$PR_NUMBER" --name-only | grep '\.py$' || true) + if [ ${#CHANGED_PY[@]} -eq 0 ]; then echo "No Python files changed - skipping structural analysis." exit 0 fi @@ -168,7 +168,7 @@ jobs: /tmp/graphify-venv/bin/python -m pip install graphifyy==0.4.23 --quiet 2>&1 | tail -3 /tmp/graphify-venv/bin/python base-agents/.agents/tools/structural_impact.py \ --repo-root "${{ github.workspace }}" \ - --changed-files $CHANGED_PY \ + --changed-files "${CHANGED_PY[@]}" \ --output "/tmp/structural-impact-${PR_NUMBER}.md" echo "Structural impact analysis complete:" cat "/tmp/structural-impact-${PR_NUMBER}.md" From 55f0bddab465077695120f4eb90ebc80030cd6ca Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 21 Apr 2026 21:15:24 +0000 Subject: [PATCH 5/6] fix: derive changed nodes from graph and improve MEDIUM risk reason Derive changed_node_ids from the already-built graph by matching source_file paths instead of running a separate extraction pass. Removes implicit dependency on graphify ID stability across independent extractions. Fix MEDIUM risk reason to reflect the actual trigger (cluster spread vs high-connectivity entity) instead of always reporting cluster count. --- .agents/tools/structural_impact.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.agents/tools/structural_impact.py b/.agents/tools/structural_impact.py index 12561b05b..00ddd31c9 100644 --- a/.agents/tools/structural_impact.py +++ b/.agents/tools/structural_impact.py @@ -193,7 +193,8 @@ def _changed_files_mode(changed_files: list[Path]) -> str: analysis = _build_graph(_collect_source_files()) G, communities, gods = analysis["graph"], analysis["communities"], analysis["god_nodes"] - changed_node_ids = {n["id"] for n in extract(changed_files)["nodes"]} + changed_paths = {str(p.resolve()) for p in changed_files} + changed_node_ids = {nid for nid in G.nodes() if G.nodes[nid].get("source_file") in changed_paths} node_to_community = {n: cid for cid, nodes in communities.items() for n in nodes} affected_gods = [{"rank": gods.index(g) + 1, **g} for g in gods if g["id"] in changed_node_ids] @@ -228,7 +229,13 @@ def _changed_files_mode(changed_files: list[Path]) -> str: reasons.append(f"{len(unique_violations)} import direction violation(s)") risk, risk_reason = "HIGH", "; ".join(reasons) elif len(affected_communities) > 3 or any(h["degree"] > 20 for h in high_impact): - risk, risk_reason = "MEDIUM", f"{len(affected_communities)} clusters affected" + medium_reasons = [] + if len(affected_communities) > 3: + medium_reasons.append(f"{len(affected_communities)} clusters affected") + high_deg = [h for h in high_impact if h["degree"] > 20] + if high_deg: + medium_reasons.append(f"high-connectivity entity ({high_deg[0]['label']}, {high_deg[0]['degree']} deps)") + risk, risk_reason = "MEDIUM", "; ".join(medium_reasons) else: risk, risk_reason = "LOW", "localized change" From a63e74dac255d930862bb27f717523f02b0d214c Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 21 Apr 2026 21:32:29 +0000 Subject: [PATCH 6/6] fix: address Codex review findings - security, edge coverage, dedup, stale artifacts Split the workflow step to isolate GH_TOKEN from graphifyy execution, preventing a compromised package release from exfiltrating write-scoped tokens. Scan both edge directions in _cross_package_edges so inbound dependents and violations where the changed node is the target are visible. Detect deleted files and report them as a risk signal. Include relation type in dedup key so distinct edge types between the same labels are not collapsed. Clean stale /tmp artifacts before running analysis to prevent reruns from reading old reports. --- .agents/tools/structural_impact.py | 35 ++++++++++++++-------- .github/workflows/agentic-ci-pr-review.yml | 20 ++++++++----- 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/.agents/tools/structural_impact.py b/.agents/tools/structural_impact.py index 00ddd31c9..a4fcd3847 100644 --- a/.agents/tools/structural_impact.py +++ b/.agents/tools/structural_impact.py @@ -63,11 +63,11 @@ def _rel(filepath: str) -> str: return filepath -def _dedup(items: list[dict], key_a: str = "from_label", key_b: str = "to_label") -> list[dict]: - seen: set[tuple[str, str]] = set() +def _dedup(items: list[dict], keys: tuple[str, ...] = ("from_label", "to_label", "relation")) -> list[dict]: + seen: set[tuple[str, ...]] = set() out: list[dict] = [] for e in items: - k = (e[key_a], e[key_b]) + k = tuple(e[field] for field in keys) if k not in seen: seen.add(k) out.append(e) @@ -83,7 +83,7 @@ def _build_graph(files: list[Path]) -> dict: def _cross_package_edges(G, node_ids: set[str] | None = None) -> tuple[list[dict], list[dict], dict[str, int]]: - """Collect cross-package edges. If node_ids is given, only from those nodes. + """Collect cross-package edges. If node_ids is given, edges touching those nodes (either direction). Returns (cross_pkg_edges, violations, direction_counts). """ @@ -92,7 +92,7 @@ def _cross_package_edges(G, node_ids: set[str] | None = None) -> tuple[list[dict direction_counts: dict[str, int] = {} edges = ( - ((nid, nbr, G.edges[nid, nbr]) for nid in node_ids if nid in G for nbr in G.successors(nid)) + ((u, v, d) for u, v, d in G.edges(data=True) if u in node_ids or v in node_ids) if node_ids is not None else ((u, v, d) for u, v, d in G.edges(data=True)) ) @@ -187,7 +187,7 @@ def _fmt_cross_pkg(items: list[dict], limit: int = 6) -> list[str]: # -- Modes -- -def _changed_files_mode(changed_files: list[Path]) -> str: +def _changed_files_mode(changed_files: list[Path], deleted_files: list[Path] | None = None) -> str: """PR review: analyze changed files against full codebase.""" t0 = time.monotonic() analysis = _build_graph(_collect_source_files()) @@ -220,6 +220,8 @@ def _changed_files_mode(changed_files: list[Path]) -> str: cross_pkg, violations, _ = _cross_package_edges(G, changed_node_ids) unique_cross, unique_violations = _dedup(cross_pkg), _dedup(violations) + n_deleted = len(deleted_files) if deleted_files else 0 + # Risk level. if affected_gods or unique_violations: reasons = [] @@ -236,18 +238,24 @@ def _changed_files_mode(changed_files: list[Path]) -> str: if high_deg: medium_reasons.append(f"high-connectivity entity ({high_deg[0]['label']}, {high_deg[0]['degree']} deps)") risk, risk_reason = "MEDIUM", "; ".join(medium_reasons) + elif n_deleted > 0: + risk, risk_reason = "MEDIUM", f"{n_deleted} file(s) deleted" else: risk, risk_reason = "LOW", "localized change" elapsed = time.monotonic() - t0 + file_count = len(changed_files) + n_deleted lines = [ f"### Structural Impact _(graphify, {elapsed:.1f}s)_", "", f"**Risk: {risk}** ({risk_reason})", - f"- {len(changed_files)} Python files, {len(changed_node_ids)} AST entities, " + f"- {file_count} Python files, {len(changed_node_ids)} AST entities, " f"{len(affected_communities)}/{len(communities)} clusters", "", ] + if n_deleted: + lines.append(f"- {n_deleted} Python file(s) deleted (impact not fully analyzable)") + lines.append("") lines += _fmt_violations(unique_violations, limit=5) lines += _fmt_gods(affected_gods, affected_only=True) lines += _fmt_high_impact(high_impact) @@ -344,15 +352,16 @@ def main() -> None: _REPO_ROOT = Path(args.repo_root).resolve() if args.changed_files: - changed = [ - p + all_paths = [ + Path(raw) if Path(raw).is_absolute() else _REPO_ROOT / raw for raw in args.changed_files - for p in [Path(raw) if Path(raw).is_absolute() else _REPO_ROOT / raw] - if p.exists() and p.suffix == ".py" + if raw.endswith(".py") ] + changed = [p for p in all_paths if p.exists()] + deleted = [p for p in all_paths if not p.exists()] report = ( - _changed_files_mode(changed) - if changed + _changed_files_mode(changed, deleted_files=deleted or None) + if changed or deleted else "### Structural Impact\n\nNo Python files changed - skipping.\n" ) else: diff --git a/.github/workflows/agentic-ci-pr-review.yml b/.github/workflows/agentic-ci-pr-review.yml index 0d1ee1c44..d2b1a8915 100644 --- a/.github/workflows/agentic-ci-pr-review.yml +++ b/.github/workflows/agentic-ci-pr-review.yml @@ -154,24 +154,28 @@ jobs: .agents/tools path: base-agents - - name: Structural impact analysis + - name: List changed Python files + id: changed-py env: PR_NUMBER: ${{ steps.pr.outputs.number }} GH_TOKEN: ${{ github.token }} run: | - mapfile -t CHANGED_PY < <(gh pr diff "$PR_NUMBER" --name-only | grep '\.py$' || true) - if [ ${#CHANGED_PY[@]} -eq 0 ]; then - echo "No Python files changed - skipping structural analysis." - exit 0 - fi + gh pr diff "$PR_NUMBER" --name-only | grep '\.py$' > /tmp/changed-py.txt || true + echo "count=$(wc -l < /tmp/changed-py.txt | tr -d ' ')" >> "$GITHUB_OUTPUT" + + - name: Structural impact analysis + if: steps.changed-py.outputs.count != '0' + run: | + rm -f "/tmp/structural-impact-${{ steps.pr.outputs.number }}.md" + mapfile -t CHANGED_PY < /tmp/changed-py.txt python -m venv /tmp/graphify-venv /tmp/graphify-venv/bin/python -m pip install graphifyy==0.4.23 --quiet 2>&1 | tail -3 /tmp/graphify-venv/bin/python base-agents/.agents/tools/structural_impact.py \ --repo-root "${{ github.workspace }}" \ --changed-files "${CHANGED_PY[@]}" \ - --output "/tmp/structural-impact-${PR_NUMBER}.md" + --output "/tmp/structural-impact-${{ steps.pr.outputs.number }}.md" echo "Structural impact analysis complete:" - cat "/tmp/structural-impact-${PR_NUMBER}.md" + cat "/tmp/structural-impact-${{ steps.pr.outputs.number }}.md" continue-on-error: true - name: Pre-flight checks