-
Notifications
You must be signed in to change notification settings - Fork 0
feat: add CelestialBody ingestion layer to A2A handoff server #73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -15,6 +15,8 @@ | |||||||||
| from flask_cors import CORS | ||||||||||
| import threading | ||||||||||
|
|
||||||||||
| from celestial_ingestion import IngestionError, ingest_repository | ||||||||||
|
|
||||||||||
| app = Flask(__name__) | ||||||||||
| CORS(app) | ||||||||||
|
|
||||||||||
|
|
@@ -244,6 +246,27 @@ def list_sessions(): | |||||||||
| "count": len(manager.active_sessions) | ||||||||||
| }) | ||||||||||
|
|
||||||||||
|
|
||||||||||
| @app.route('/api/a2a/ingest', methods=['POST']) | ||||||||||
| def ingest_celestial_body(): | ||||||||||
| """Convert a raw repository path into a CelestialBody payload.""" | ||||||||||
| data = request.json or {} | ||||||||||
| repo_path = data.get("repo_path") | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Comment on lines
+250
to
+254
|
||||||||||
| metadata = data.get("metadata", {}) | ||||||||||
|
Comment on lines
+253
to
+255
|
||||||||||
|
|
||||||||||
| if not repo_path: | ||||||||||
| return jsonify({"error": "repo_path is required"}), 400 | ||||||||||
|
|
||||||||||
|
||||||||||
| if not isinstance(metadata, dict): | |
| return jsonify({"error": "metadata must be an object"}), 400 |
Copilot
AI
Apr 10, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This endpoint accepts an arbitrary repo_path and passes it to ingest_repository, which walks the filesystem and reads files (README + code files). Without an allowlist/sandbox, a client can point this at any readable directory on the server, causing unintended data exposure. Recommend restricting ingestion to a configured base directory (and verifying the resolved path stays within it), or requiring a pre-registered repo identifier instead of a raw filesystem path.
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,160 @@ | ||||||||||||
| #!/usr/bin/env python3 | ||||||||||||
| """A2A ingestion utilities for converting repositories into CelestialBody objects.""" | ||||||||||||
|
|
||||||||||||
| from __future__ import annotations | ||||||||||||
|
|
||||||||||||
| import hashlib | ||||||||||||
| import json | ||||||||||||
| from dataclasses import asdict, dataclass | ||||||||||||
| from pathlib import Path | ||||||||||||
| from typing import Any | ||||||||||||
|
|
||||||||||||
| CODE_EXTENSIONS = { | ||||||||||||
| ".py": "python", | ||||||||||||
| ".rs": "rust", | ||||||||||||
| ".js": "javascript", | ||||||||||||
| ".ts": "typescript", | ||||||||||||
| ".jsx": "react", | ||||||||||||
| ".tsx": "react", | ||||||||||||
| ".sol": "solidity", | ||||||||||||
| ".go": "go", | ||||||||||||
| ".java": "java", | ||||||||||||
| ".c": "c", | ||||||||||||
| ".cpp": "cpp", | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| @dataclass | ||||||||||||
| class CelestialBody: | ||||||||||||
| body_id: str | ||||||||||||
| name: str | ||||||||||||
| source_path: str | ||||||||||||
|
||||||||||||
| mass: float | ||||||||||||
| atmosphere: dict[str, Any] | ||||||||||||
| gravity: dict[str, Any] | ||||||||||||
| orbital_state: dict[str, Any] | ||||||||||||
| seismic_test: dict[str, Any] | ||||||||||||
|
|
||||||||||||
| def to_dict(self) -> dict[str, Any]: | ||||||||||||
| return asdict(self) | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| class IngestionError(ValueError): | ||||||||||||
| """Raised when ingestion input is invalid.""" | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def _infer_language_counts(repo_path: Path) -> dict[str, int]: | ||||||||||||
| counts: dict[str, int] = {} | ||||||||||||
| for file in repo_path.rglob("*"): | ||||||||||||
| if not file.is_file() or ".git" in file.parts: | ||||||||||||
| continue | ||||||||||||
| lang = CODE_EXTENSIONS.get(file.suffix.lower()) | ||||||||||||
| if lang: | ||||||||||||
| counts[lang] = counts.get(lang, 0) + 1 | ||||||||||||
| return counts | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def _line_count(repo_path: Path) -> int: | ||||||||||||
| total = 0 | ||||||||||||
| for file in repo_path.rglob("*"): | ||||||||||||
| if not file.is_file() or ".git" in file.parts: | ||||||||||||
| continue | ||||||||||||
| try: | ||||||||||||
| if file.suffix.lower() in CODE_EXTENSIONS: | ||||||||||||
| total += len(file.read_text(encoding="utf-8", errors="ignore").splitlines()) | ||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using
Suggested change
|
||||||||||||
| except OSError: | ||||||||||||
| continue | ||||||||||||
| return total | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def _read_summary(repo_path: Path) -> str: | ||||||||||||
| readme = repo_path / "README.md" | ||||||||||||
| if readme.exists(): | ||||||||||||
| return readme.read_text(encoding="utf-8", errors="ignore")[:2000] | ||||||||||||
| return "" | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def _compute_atmosphere(summary: str, metadata: dict[str, Any]) -> dict[str, Any]: | ||||||||||||
| text = f"{summary} {json.dumps(metadata, sort_keys=True)}".lower() | ||||||||||||
| traits = { | ||||||||||||
| "collaborative": any(k in text for k in ["agent", "handoff", "orchestr", "a2a"]), | ||||||||||||
| "stability_focus": any(k in text for k in ["test", "verify", "integrity", "security"]), | ||||||||||||
| "novelty_drive": any(k in text for k in ["experimental", "research", "prototype", "novel"]), | ||||||||||||
| } | ||||||||||||
| intent = "ecosystem" if traits["collaborative"] else "specialist" | ||||||||||||
| return { | ||||||||||||
| "intent": intent, | ||||||||||||
| "traits": traits, | ||||||||||||
| "summary_excerpt": summary[:240], | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def _compute_mass(file_count: int, loc: int) -> float: | ||||||||||||
| raw = (file_count * 0.4) + (loc * 0.001) | ||||||||||||
| return round(min(max(raw, 0.1), 100.0), 3) | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def _compute_gravity(mass: float, atmosphere: dict[str, Any], language_counts: dict[str, int]) -> dict[str, Any]: | ||||||||||||
| language_diversity = len(language_counts) | ||||||||||||
| influence = round(min(1.0, (mass / 100.0) + (language_diversity * 0.03)), 6) | ||||||||||||
| pull = "high" if influence >= 0.75 else "medium" if influence >= 0.35 else "low" | ||||||||||||
| return { | ||||||||||||
| "influence_score": influence, | ||||||||||||
| "pull_tier": pull, | ||||||||||||
| "language_diversity": language_diversity, | ||||||||||||
| "anchors": sorted(language_counts, key=language_counts.get, reverse=True)[:3], | ||||||||||||
| "sanctuary_alignment": atmosphere["traits"]["stability_focus"], | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def _run_seismic_test(payload: dict[str, Any]) -> dict[str, Any]: | ||||||||||||
| canonical = json.dumps(payload, sort_keys=True, separators=(",", ":")) | ||||||||||||
| digest_a = hashlib.sha256(canonical.encode()).hexdigest() | ||||||||||||
| digest_b = hashlib.sha256(canonical.encode()).hexdigest() | ||||||||||||
|
||||||||||||
| digest_b = hashlib.sha256(canonical.encode()).hexdigest() | |
| rematerialized = json.loads(canonical) | |
| canonical_rematerialized = json.dumps(rematerialized, sort_keys=True, separators=(",", ":")) | |
| digest_b = hashlib.sha256(canonical_rematerialized.encode()).hexdigest() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Calculating the same SHA-256 hash twice (digest_a and digest_b) from the same input and then comparing them is redundant. This does not provide a meaningful "invariance" check or stress test. If the intent is to provide a deterministic citation for the payload, a single calculation is sufficient.
| digest_a = hashlib.sha256(canonical.encode()).hexdigest() | |
| digest_b = hashlib.sha256(canonical.encode()).hexdigest() | |
| invariant = digest_a == digest_b | |
| digest_a = hashlib.sha256(canonical.encode()).hexdigest() | |
| invariant = True |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The repository is traversed twice: once in _infer_language_counts and again in _line_count. This doubles the I/O overhead, which can be significant for large repositories. Consider refactoring these into a single pass over the filesystem that collects both language statistics and line counts simultaneously.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Validate metadata type before field access
ingest_repository assumes metadata is a mapping and calls .get(...), but the new /api/a2a/ingest endpoint accepts arbitrary JSON values for metadata. If a client sends a non-object (for example a string or array), this raises AttributeError and escapes as a 500 because the route only catches IngestionError; invalid client input should be rejected as 400 instead of crashing the request.
Useful? React with 👍 / 👎.
Copilot
AI
Apr 10, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The ingestion payload includes source_path as an absolute resolved server filesystem path. Since this payload is returned by the API, it can unintentionally disclose server directory structure. Consider omitting source_path from the returned payload, or returning only a sanitized identifier (e.g., repo basename) or a path relative to a configured allowed base directory.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Compute seismic citation from the returned payload
The citation is generated before seismic_test is populated (payload["seismic_test"] is still {} at hash time), and then the payload is mutated with the final seismic block. As a result, consumers cannot validate the returned object by hashing the response body: the advertised deterministic citation will always disagree with the final payload content, breaking downstream verification workflows.
Useful? React with 👍 / 👎.
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,32 @@ | ||||||||||||||||||||||||||||||||||||
| import sys | ||||||||||||||||||||||||||||||||||||
| from pathlib import Path | ||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| sys.path.insert(0, str(Path(__file__).resolve().parents[1])) | ||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| from celestial_ingestion import IngestionError, ingest_repository | ||||||||||||||||||||||||||||||||||||
|
Comment on lines
+1
to
+6
|
||||||||||||||||||||||||||||||||||||
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).resolve().parents[1])) | |
| from celestial_ingestion import IngestionError, ingest_repository | |
| import importlib.util | |
| from pathlib import Path | |
| _MODULE_PATH = Path(__file__).resolve().parents[1] / "celestial_ingestion.py" | |
| _SPEC = importlib.util.spec_from_file_location("celestial_ingestion", _MODULE_PATH) | |
| _MODULE = importlib.util.module_from_spec(_SPEC) | |
| assert _SPEC is not None and _SPEC.loader is not None | |
| _SPEC.loader.exec_module(_MODULE) | |
| IngestionError = _MODULE.IngestionError | |
| ingest_repository = _MODULE.ingest_repository |
Copilot
AI
Apr 10, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test is likely to be flaky because it hardcodes a path under /tmp that could exist on some systems. Also, the try/except + assert True/False pattern is harder to read and can hide unexpected exceptions. Prefer using a tmp-based guaranteed-missing path (e.g., from tmp_path) and pytest.raises(IngestionError).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using
request.jsoncan raise a BadRequest exception for invalid JSON with anapplication/jsoncontent-type, which would bypass this handler and return a default error. Preferrequest.get_json(silent=True) or {}so you can consistently return a structured 400 response for invalid bodies.