From 651aa37bbde9294c8f3c912b8922aae771392543 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Sat, 30 May 2026 23:42:23 +0200 Subject: [PATCH 1/2] feat(data): shared download manifest for artefacts and binaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a single, language-agnostic manifest (data/manifest.schema.json) that lists every downloadable data artefact and external-binary bundle with a SHA256, consumed by both raven-python and (via the same JSON) MATLAB RAVEN. The manifest is a superset of the two runtime registries: * manifest["data"] -> raven_python.data._DATA_REGISTRY * manifest["binaries"] -> raven_python.binaries._REGISTRY Added: * data/manifest.schema.json (JSON Schema) + data/manifest.example.json (worked example) + data/manifest.json (empty, the live source of truth until assets are published). * raven_python.manifest — load_manifest / to_*_registry / load_into_registries. * Lazy autoload: data.ensure_* and binaries.ensure_binary populate themselves from $RAVEN_PYTHON_MANIFEST on first use when their registry is still empty (guarded; no effect when a registry is passed explicitly or the env var is unset). * scripts/make_registry_snippet.py: a `manifest` subcommand that computes url+sha256+bytes and writes/updates manifest.json. * tests/test_manifest.py (round-trip, converters, lazy autoload via file:// URLs, repo manifests valid). * docs/maintenance/data_manifest.md — format, Python + MATLAB consumers, GitHub-Releases vs Zenodo hosting (incl. a release→Zenodo GitHub Action), and per-asset recommendations. 
--- data/manifest.example.json | 34 ++++++++ data/manifest.json | 6 ++ data/manifest.schema.json | 82 +++++++++++++++++ docs/maintenance/data_manifest.md | 140 ++++++++++++++++++++++++++++++ docs/maintenance/index.md | 4 + docs/reference/api/resolvers.md | 10 +++ scripts/make_registry_snippet.py | 104 +++++++++++++++++++++- src/raven_python/binaries.py | 14 +++ src/raven_python/data.py | 14 +++ src/raven_python/manifest.py | 123 ++++++++++++++++++++++++++ tests/test_manifest.py | 125 ++++++++++++++++++++++++++ 11 files changed, 652 insertions(+), 4 deletions(-) create mode 100644 data/manifest.example.json create mode 100644 data/manifest.json create mode 100644 data/manifest.schema.json create mode 100644 docs/maintenance/data_manifest.md create mode 100644 src/raven_python/manifest.py create mode 100644 tests/test_manifest.py diff --git a/data/manifest.example.json b/data/manifest.example.json new file mode 100644 index 0000000..80f3e5f --- /dev/null +++ b/data/manifest.example.json @@ -0,0 +1,34 @@ +{ + "manifest_version": 1, + "generated": "2026-05-30", + "data": { + "kegg": { + "version": "kegg116", + "description": "KEGG reference model, KO/reaction tables, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.", + "license": "Derived from KEGG (subscription-licensed bulk dump) — confirm redistribution rights before publishing publicly.", + "doi": "10.5281/zenodo.0000000", + "source": "https://github.com/SysBioChalmers/raven-data/releases/tag/kegg-kegg116", + "files": { + "reference_model.yml.gz": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116/reference_model.yml.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "ko_reaction.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116/ko_reaction.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "ko_names.tsv.gz": { "url": 
"https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116/ko_names.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "organism_gene_ko.tsv.xz": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116/organism_gene_ko.tsv.xz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "rxn_flags.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116/rxn_flags.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "prokaryotes.hmm": { "url": "https://zenodo.org/records/0000000/files/prokaryotes.hmm", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 } + } + } + }, + "binaries": { + "diamond": { + "version": "2.1.9", + "provides": ["diamond"], + "description": "DIAMOND protein aligner (homology-based reconstruction).", + "license": "GPL-3.0-only — ship the upstream COPYING alongside each ZIP.", + "platforms": { + "linux-x86_64": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/diamond-2.1.9/diamond-2.1.9-linux-x86_64.zip", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "macos-arm64": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/diamond-2.1.9/diamond-2.1.9-macos-arm64.zip", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "windows-x86_64": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/diamond-2.1.9/diamond-2.1.9-windows-x86_64.zip", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 } + } + } + } +} diff --git a/data/manifest.json b/data/manifest.json new file mode 100644 index 0000000..baf5ee9 --- /dev/null +++ b/data/manifest.json @@ -0,0 +1,6 @@ +{ + "manifest_version": 1, + "generated": "2026-05-30", 
+ "data": {}, + "binaries": {} +} diff --git a/data/manifest.schema.json b/data/manifest.schema.json new file mode 100644 index 0000000..15fccb8 --- /dev/null +++ b/data/manifest.schema.json @@ -0,0 +1,82 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/SysBioChalmers/raven-data/manifest.schema.json", + "title": "raven-data manifest", + "description": "Language-agnostic registry of downloadable raven-python / RAVEN data artefacts and external binary bundles. Consumed by the Python resolvers (raven_python.data / raven_python.binaries) and by MATLAB RAVEN. Every file carries a SHA256 so consumers verify integrity after download.", + "type": "object", + "required": ["manifest_version"], + "additionalProperties": false, + "properties": { + "manifest_version": { + "type": "integer", + "const": 1, + "description": "Format version of this manifest document." + }, + "generated": { + "type": "string", + "description": "ISO-8601 date the manifest was generated (informational)." + }, + "data": { + "type": "object", + "description": "Data-artefact datasets, keyed by dataset id (e.g. 'kegg'). Maps onto raven_python.data._DATA_REGISTRY.", + "additionalProperties": { "$ref": "#/$defs/dataset" } + }, + "binaries": { + "type": "object", + "description": "External command-line tool bundles, keyed by bundle id (e.g. 'blast', 'diamond', 'hmmer'). Maps onto raven_python.binaries._REGISTRY.", + "additionalProperties": { "$ref": "#/$defs/bundle" } + } + }, + "$defs": { + "file": { + "type": "object", + "required": ["url", "sha256"], + "additionalProperties": false, + "properties": { + "url": { "type": "string", "format": "uri", "description": "Direct download URL (GitHub release asset, Zenodo file, etc.)." }, + "sha256": { "type": "string", "pattern": "^[0-9a-f]{64}$", "description": "Lowercase hex SHA256 of the file." 
}, + "bytes": { "type": "integer", "minimum": 0, "description": "File size in bytes (informational; for progress bars / sanity checks)." } + } + }, + "dataset": { + "type": "object", + "required": ["version", "files"], + "additionalProperties": false, + "properties": { + "version": { "type": "string", "description": "Dataset version tag, e.g. 'kegg116'. Used in the cache path." }, + "description": { "type": "string" }, + "license": { "type": "string", "description": "SPDX id or free text. NOTE: KEGG-derived artefacts are subject to KEGG's terms — confirm redistribution rights before publishing." }, + "doi": { "type": "string", "description": "Zenodo (or other) DOI for this dataset version, if archived." }, + "source": { "type": "string", "format": "uri", "description": "Human-facing page for the release/record (GitHub release or Zenodo landing page)." }, + "files": { + "type": "object", + "minProperties": 1, + "description": "Artefact files keyed by filename.", + "additionalProperties": { "$ref": "#/$defs/file" } + } + } + }, + "bundle": { + "type": "object", + "required": ["version", "provides", "platforms"], + "additionalProperties": false, + "properties": { + "version": { "type": "string", "description": "Upstream tool version, e.g. '2.16.0'." }, + "provides": { + "type": "array", + "items": { "type": "string" }, + "minItems": 1, + "description": "Executable names this bundle provides, e.g. ['blastp', 'makeblastdb']." + }, + "description": { "type": "string" }, + "license": { "type": "string", "description": "Upstream tool license (e.g. DIAMOND is GPL-3.0-only — ship its license text alongside the ZIP)." }, + "platforms": { + "type": "object", + "minProperties": 1, + "description": "One entry per platform, keyed '<os>-<arch>' (e.g. 'linux-x86_64', 'macos-arm64', 'windows-x86_64'). 
Matches raven_python.binaries._platform_key().", + "additionalProperties": { "$ref": "#/$defs/file" } + } + } + } + } +} diff --git a/docs/maintenance/data_manifest.md b/docs/maintenance/data_manifest.md new file mode 100644 index 0000000..554f08c --- /dev/null +++ b/docs/maintenance/data_manifest.md @@ -0,0 +1,140 @@ +# Data & binary manifest + +Large artefacts (KEGG tables / HMMs, template models) and external-binary bundles +(BLAST / DIAMOND / HMMER) are **not** committed to the code repository. They are published +as downloadable assets and described by a single, language-agnostic **manifest** that both +raven-python and MATLAB RAVEN read. Every file carries a **SHA256**, so consumers verify +integrity after download. + +- Format: [`data/manifest.schema.json`](https://github.com/SysBioChalmers/raven-python/blob/develop/data/manifest.schema.json) (JSON Schema) +- Worked example: [`data/manifest.example.json`](https://github.com/SysBioChalmers/raven-python/blob/develop/data/manifest.example.json) +- Live manifest: [`data/manifest.json`](https://github.com/SysBioChalmers/raven-python/blob/develop/data/manifest.json) (empty until assets are published) + +The manifest is a superset of the two runtime registries: + +| Manifest section | Runtime registry | +| --- | --- | +| `data` | {data}`raven_python.data._DATA_REGISTRY` | +| `binaries` | `raven_python.binaries._REGISTRY` | + +```json +{ + "manifest_version": 1, + "data": { "<dataset>": { "version": "...", "doi": "...", "files": { "<filename>": {"url": "...", "sha256": "...", "bytes": 0} } } }, + "binaries": { "<bundle>": { "version": "...", "provides": ["..."], "platforms": { "<os>-<arch>": {"url": "...", "sha256": "...", "bytes": 0} } } } +} +``` + +## Consuming it — Python + +Point raven-python at a manifest and the resolvers populate themselves on first use, +verifying each download's checksum: + +```bash +export RAVEN_PYTHON_MANIFEST=https://github.com/SysBioChalmers/raven-data/releases/download/manifest-v1/manifest.json +``` + +```python +from 
raven_python import manifest +manifest.load_into_registries() # or load_into_registries("/path/or/url") +# now data.ensure_kegg_data() / binaries.ensure_binary("diamond") resolve from the manifest +``` + +If `RAVEN_PYTHON_MANIFEST` is set, `data.ensure_*` and `binaries.ensure_binary` load it +lazily — no explicit call needed. + +## Consuming it — MATLAB + +The same JSON is trivial to read from MATLAB (`webread` + `jsondecode`), download +(`websave`), and verify (Java's `MessageDigest`, always available in MATLAB): + +```matlab +function file = ensureDataFile(manifestUrl, dataset, name, cacheDir) + m = jsondecode(webread(manifestUrl, weboptions('ContentType','text'))); + entry = m.data.(dataset).files.(matlab.lang.makeValidName(name)); + file = fullfile(cacheDir, name); + if ~isfile(file) + websave(file, entry.url); + end + assert(strcmp(sha256(file), entry.sha256), 'SHA256 mismatch for %s', name); +end + +function hex = sha256(file) + fid = fopen(file, 'r'); raw = fread(fid, Inf, '*uint8'); fclose(fid); + md = java.security.MessageDigest.getInstance('SHA-256'); + md.update(raw); + hex = lower(reshape(dec2hex(typecast(md.digest(), 'uint8'))', 1, [])); +end +``` + +## Publishing — generating manifest entries + +After uploading a release's files, add/update an entry with the maintainer script +([`scripts/make_registry_snippet.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/make_registry_snippet.py)), +which computes each SHA256 and byte size: + +```bash +python scripts/make_registry_snippet.py manifest --manifest data/manifest.json \ + --target data --dataset kegg --version kegg116 --dir artefacts \ + --base-url https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116 \ + --doi 10.5281/zenodo.0000000 --source https://zenodo.org/records/0000000 + +python scripts/make_registry_snippet.py manifest --manifest data/manifest.json \ + --target binary --bundle diamond --version 2.1.9 --provides diamond --dir zips \ + --base-url 
https://github.com/SysBioChalmers/raven-data/releases/download/diamond-2.1.9 \ + --license GPL-3.0-only +``` + +## Where to host: GitHub Releases vs Zenodo + +Both are just URLs in the manifest, so consumers don't care — choose per asset: + +- **GitHub Releases** — simplest, free, language-agnostic, up to ~2 GB per file. Good default, + and you're already on GitHub for the code. +- **Zenodo** — adds a citable **DOI**, long-term archival, and handles files larger than 2 GB + (up to 50 GB/record). Right for the KEGG HMM bundle and anything you want citable. + +### Auto-publishing to Zenodo from GitHub + +:::{important} +The **native GitHub↔Zenodo integration** (flip a switch, publish a Release → DOI) archives +the **repository source zipball** at the tag — it does **not** capture files attached to the +Release. So it only works for assets *committed into the repo*, which defeats the purpose for +multi-GB binaries. Use it for a *software* DOI, not for the data assets. +::: + +For the data assets, keep everything GitHub-driven with a small **GitHub Action** that, on +release, uploads the assets to Zenodo via its REST API (e.g. [`zenodraft`](https://github.com/zenodraft/zenodraft)). +You cut a normal GitHub Release with the files attached; the Action mirrors them to Zenodo and +mints a new version DOI. 
Drop this in the data repo as `.github/workflows/zenodo.yml`: + +```yaml +name: Mirror release assets to Zenodo +on: + release: + types: [published] +jobs: + zenodo: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: { node-version: "20" } + - name: Download this release's assets + run: gh release download "${{ github.event.release.tag_name }}" --dir assets + env: { GH_TOKEN: "${{ github.token }}" } + - name: Deposit a new version on Zenodo + run: npx zenodraft@latest version create --publish ${{ vars.ZENODO_CONCEPT_DOI }} assets/* + env: { ZENODO_ACCESS_TOKEN: "${{ secrets.ZENODO_TOKEN }}" } +``` + +Then record the resulting DOI in the manifest via the `--doi` flag above. Net result: you only +ever interact with GitHub Releases; Zenodo archiving + DOIs happen automatically. + +## Per-asset recommendations + +| Asset | Home | Notes | +| --- | --- | --- | +| **Software binaries** (BLAST / DIAMOND / HMMER) | **bioconda** preferred; or release ZIPs via the resolver | DIAMOND is **GPL-3.0** — ship its license text in the ZIP; keep it as a separate asset, never bundled into the MIT wheel. | +| **KEGG HMMs / tables** | **Zenodo** (DOI, >2 GB, archival) | ⚠️ Derived from the subscription-licensed KEGG dump — **confirm redistribution rights with KEGG before publishing publicly**. If not permitted, keep access-gated and have users build from their own dump (the resolver supports a local dir). | +| **Template models** (Human-GEM, yeast-GEM) | **Don't re-host** | Fetch from their canonical repos by pinned release tag — respects their licenses and avoids stale copies. | diff --git a/docs/maintenance/index.md b/docs/maintenance/index.md index a382f42..2227d03 100644 --- a/docs/maintenance/index.md +++ b/docs/maintenance/index.md @@ -8,6 +8,9 @@ rebuild and release them. artefact releases. - **[Maintaining binaries](maintaining_binaries.md)** — building and publishing the external-binary (BLAST / DIAMOND / HMMER) ZIP releases. 
+- **[Data & binary manifest](data_manifest.md)** — the shared manifest that lists every + published artefact / binary (consumed by raven-python and MATLAB RAVEN), where to host + assets (GitHub Releases vs Zenodo), and the GitHub→Zenodo auto-publish setup. ```{toctree} :hidden: @@ -15,4 +18,5 @@ rebuild and release them. kegg_data_format maintaining_kegg_data maintaining_binaries +data_manifest ``` diff --git a/docs/reference/api/resolvers.md b/docs/reference/api/resolvers.md index 3266fcd..45b7dde 100644 --- a/docs/reference/api/resolvers.md +++ b/docs/reference/api/resolvers.md @@ -20,3 +20,13 @@ Data-bundle resolver (KEGG artefacts and template-model data). .. automodule:: raven_python.data :members: ``` + +## `raven_python.manifest` + +Loads a shared [data/binary manifest](../../maintenance/data_manifest.md) into the two +registries above (and is consulted lazily via `$RAVEN_PYTHON_MANIFEST`). + +```{eval-rst} +.. automodule:: raven_python.manifest + :members: +``` diff --git a/scripts/make_registry_snippet.py b/scripts/make_registry_snippet.py index 3efa49e..4700b3c 100644 --- a/scripts/make_registry_snippet.py +++ b/scripts/make_registry_snippet.py @@ -19,6 +19,19 @@ --bundle blast --version 2.16.0 --provides blastp makeblastdb --dir zips \\ --base-url https://github.com/ORG/raven_python/releases/download/blast-2.16.0 +Add/update an entry in the shared ``manifest.json`` (the single source of truth read by +both raven-python and MATLAB RAVEN — see data/manifest.schema.json):: + + python scripts/make_registry_snippet.py manifest --manifest data/manifest.json \\ + --target data --dataset kegg --version kegg116 --dir artefacts \\ + --base-url https://github.com/ORG/raven-data/releases/download/kegg-kegg116 \\ + --doi 10.5281/zenodo.0000000 + + python scripts/make_registry_snippet.py manifest --manifest data/manifest.json \\ + --target binary --bundle diamond --version 2.1.9 --provides diamond --dir zips \\ + --base-url 
https://github.com/ORG/raven-data/releases/download/diamond-2.1.9 \\ + --license GPL-3.0-only + The SHA256 helper is shared with the runtime resolvers (``raven_python.binaries``), so published checksums always match what ``ensure_data`` / ``ensure_binary`` verify. """ @@ -68,6 +81,55 @@ def render(key: str, entry: dict) -> str: return json.dumps({key: entry}, indent=4) +# --- manifest.json (shared source of truth) -------------------------------- + + +def _file_meta(path: Path, base: str) -> dict: + """Manifest file record: url + sha256 + byte size.""" + return {"url": f"{base}/{path.name}", "sha256": _sha256(path), "bytes": path.stat().st_size} + + +def manifest_data_entry(version: str, base_url: str, directory: Path, **meta: str) -> dict: + """Build a manifest ``data`` dataset entry (registry fields + optional metadata).""" + base = base_url.rstrip("/") + files = {p.name: _file_meta(p, base) for p in _files_in(directory)} + if not files: + raise SystemExit(f"No files found in {directory}") + entry = {"version": version} + entry.update({k: v for k, v in meta.items() if v}) # description/license/doi/source + entry["files"] = files + return entry + + +def manifest_binary_entry( + bundle: str, version: str, provides: list[str], base_url: str, directory: Path, **meta: str +) -> dict: + """Build a manifest ``binaries`` bundle entry from per-platform ZIPs.""" + base = base_url.rstrip("/") + prefix = f"{bundle}-{version}-" + platforms = { + zp.name[len(prefix) : -len(".zip")]: _file_meta(zp, base) + for zp in sorted(directory.glob(f"{prefix}*.zip")) + } + if not platforms: + raise SystemExit(f"No {prefix}*.zip files found in {directory}") + entry = {"version": version, "provides": provides} + entry.update({k: v for k, v in meta.items() if v}) # description/license + entry["platforms"] = platforms + return entry + + +def update_manifest(manifest_path: Path, section: str, key: str, entry: dict) -> None: + """Insert ``entry`` under ``manifest[section][key]`` and write the 
manifest back.""" + if manifest_path.exists(): + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + else: + manifest = {"manifest_version": 1} + manifest.setdefault("manifest_version", 1) + manifest.setdefault(section, {})[key] = entry + manifest_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8") + + def main(argv: list[str] | None = None) -> None: parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) sub = parser.add_subparsers(dest="kind", required=True) @@ -85,17 +147,51 @@ def main(argv: list[str] | None = None) -> None: b.add_argument("--dir", required=True, type=Path, help="directory of uploaded ZIPs") b.add_argument("--base-url", required=True, help="release download URL prefix") + m = sub.add_parser("manifest", help="add/update an entry in the shared manifest.json") + m.add_argument("--manifest", required=True, type=Path, help="manifest.json to create/update") + m.add_argument("--target", required=True, choices=["data", "binary"]) + m.add_argument("--version", required=True) + m.add_argument("--dir", required=True, type=Path, help="directory of uploaded files") + m.add_argument("--base-url", required=True, help="release download URL prefix") + m.add_argument("--dataset", help="data: dataset key, e.g. 'kegg'") + m.add_argument("--bundle", help="binary: bundle key, e.g. 
'diamond'") + m.add_argument("--provides", nargs="+", help="binary: executables the bundle provides") + m.add_argument("--description") + m.add_argument("--license") + m.add_argument("--doi", help="data: Zenodo (or other) DOI for this version") + m.add_argument("--source", help="data: human-facing release/record page") + args = parser.parse_args(argv) if args.kind == "data": key, entry = args.dataset, data_entry(args.dataset, args.version, args.base_url, args.dir) target = "raven_python/data.py _DATA_REGISTRY" - else: + print(f"# Merge into {target}:", file=sys.stderr) + print(render(key, entry)) + elif args.kind == "binary": key = args.bundle entry = binary_entry(args.bundle, args.version, args.provides, args.base_url, args.dir) target = "raven_python/binaries.py _REGISTRY" - - print(f"# Merge into {target}:", file=sys.stderr) - print(render(key, entry)) + print(f"# Merge into {target}:", file=sys.stderr) + print(render(key, entry)) + else: # manifest + if args.target == "data": + if not args.dataset: + parser.error("--dataset is required for --target data") + entry = manifest_data_entry( + args.version, args.base_url, args.dir, + description=args.description, license=args.license, doi=args.doi, source=args.source, + ) + update_manifest(args.manifest, "data", args.dataset, entry) + print(f"Updated {args.manifest}: data/{args.dataset} ({len(entry['files'])} files)", file=sys.stderr) + else: + if not (args.bundle and args.provides): + parser.error("--bundle and --provides are required for --target binary") + entry = manifest_binary_entry( + args.bundle, args.version, args.provides, args.base_url, args.dir, + description=args.description, license=args.license, + ) + update_manifest(args.manifest, "binaries", args.bundle, entry) + print(f"Updated {args.manifest}: binaries/{args.bundle} ({len(entry['platforms'])} platforms)", file=sys.stderr) if __name__ == "__main__": diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py index 78ee0ac..ed7ec7b 
100644 --- a/src/raven_python/binaries.py +++ b/src/raven_python/binaries.py @@ -65,6 +65,19 @@ def _bundle_for(executable: str, registry: dict): return None, None +def _maybe_autoload(registry: dict) -> None: + """Populate the default registry from ``$RAVEN_PYTHON_MANIFEST`` on first use, if set. + + Only fires when the caller is using the default (still-empty) ``_REGISTRY`` and the + environment variable points at a manifest; a caller that passes its own ``registry`` + is left untouched. The import is local to avoid a cycle with :mod:`raven_python.manifest`. + """ + if registry is _REGISTRY and not registry and os.environ.get("RAVEN_PYTHON_MANIFEST"): + from raven_python import manifest as _manifest + + _manifest.load_into_registries() + + def _sha256(path: Path) -> str: h = hashlib.sha256() with open(path, "rb") as fh: @@ -81,6 +94,7 @@ def ensure_binary(executable: str, *, registry: dict | None = None) -> Path: path. Raises ``FileNotFoundError`` if no bundle for this platform is hosted. """ registry = _REGISTRY if registry is None else registry + _maybe_autoload(registry) bundle_name, bundle = _bundle_for(executable, registry) if bundle is None: raise FileNotFoundError( diff --git a/src/raven_python/data.py b/src/raven_python/data.py index b1264be..ecb46a3 100644 --- a/src/raven_python/data.py +++ b/src/raven_python/data.py @@ -45,7 +45,21 @@ def _data_cache_dir() -> Path: return Path(base) / "raven_python" / "data" +def _maybe_autoload(registry: dict) -> None: + """Populate the default registry from ``$RAVEN_PYTHON_MANIFEST`` on first use, if set. + + Fires only when the caller relies on the default (still-empty) ``_DATA_REGISTRY`` and + the environment variable points at a manifest. Local import avoids an import cycle with + :mod:`raven_python.manifest`. 
+ """ + if registry is _DATA_REGISTRY and not registry and os.environ.get("RAVEN_PYTHON_MANIFEST"): + from raven_python import manifest as _manifest + + _manifest.load_into_registries() + + def _bundle(dataset: str, registry: dict) -> dict: + _maybe_autoload(registry) bundle = registry.get(dataset) if bundle is None: raise FileNotFoundError( diff --git a/src/raven_python/manifest.py b/src/raven_python/manifest.py new file mode 100644 index 0000000..949c123 --- /dev/null +++ b/src/raven_python/manifest.py @@ -0,0 +1,123 @@ +"""Load a raven-data manifest into the runtime resolver registries. + +A *manifest* is the single, language-agnostic source of truth for every downloadable +artefact (KEGG tables / HMMs, …) and external-binary bundle (BLAST / DIAMOND / HMMER). +It lives in the data repository (and/or a Zenodo record); raven-python and MATLAB RAVEN +both read the same JSON and verify each file's SHA256 after download. See +``data/manifest.schema.json`` for the format and ``data/manifest.example.json`` for a +worked example. + +The manifest is a superset of the two runtime registries: + +* ``manifest["data"]`` → :data:`raven_python.data._DATA_REGISTRY` +* ``manifest["binaries"]`` → :data:`raven_python.binaries._REGISTRY` + +Usage:: + + from raven_python import manifest + manifest.load_into_registries("https://github.com/SysBioChalmers/raven-data/releases/download/manifest-v1/manifest.json") + +or set ``RAVEN_PYTHON_MANIFEST`` (a path or URL) and the resolvers load it lazily on first +use:: + + export RAVEN_PYTHON_MANIFEST=/path/to/manifest.json +""" +from __future__ import annotations + +import json +import os +from pathlib import Path +from urllib.request import urlopen + +#: Environment variable holding a manifest path or URL; consulted lazily by the +#: resolvers when their registry is still empty. +ENV_MANIFEST = "RAVEN_PYTHON_MANIFEST" + +#: Manifest format version this module understands. 
+SUPPORTED_VERSION = 1 + + +def _read(source: str | os.PathLike) -> str: + """Read manifest text from a local path or an http(s)/ftp URL.""" + s = str(source) + if s.startswith(("http://", "https://", "ftp://")): + with urlopen(s) as resp: # noqa: S310 (trusted, user-supplied manifest source) + return resp.read().decode("utf-8") + return Path(s).read_text(encoding="utf-8") + + +def load_manifest(source: str | os.PathLike | None = None) -> dict: + """Read and validate a manifest from ``source`` (path/URL) or ``$RAVEN_PYTHON_MANIFEST``.""" + source = source or os.environ.get(ENV_MANIFEST) + if not source: + raise ValueError( + f"No manifest source: pass a path/URL or set ${ENV_MANIFEST}." + ) + manifest = json.loads(_read(source)) + version = manifest.get("manifest_version") + if version != SUPPORTED_VERSION: + raise ValueError( + f"Unsupported manifest_version {version!r} (this raven-python understands " + f"{SUPPORTED_VERSION})." + ) + return manifest + + +def to_data_registry(manifest: dict) -> dict: + """Project ``manifest['data']`` onto the ``raven_python.data._DATA_REGISTRY`` shape.""" + return { + dataset: { + "version": spec["version"], + "files": { + name: {"url": f["url"], "sha256": f["sha256"]} + for name, f in spec["files"].items() + }, + } + for dataset, spec in manifest.get("data", {}).items() + } + + +def to_binary_registry(manifest: dict) -> dict: + """Project ``manifest['binaries']`` onto the ``raven_python.binaries._REGISTRY`` shape.""" + return { + bundle: { + "version": spec["version"], + "provides": list(spec["provides"]), + "platforms": { + key: {"url": f["url"], "sha256": f["sha256"]} + for key, f in spec["platforms"].items() + }, + } + for bundle, spec in manifest.get("binaries", {}).items() + } + + +def load_into_registries( + source: str | os.PathLike | None = None, *, replace: bool = False +) -> dict: + """Load a manifest and merge it into the live data/binary registries. 
+ + Parameters + ---------- + source + Manifest path or URL; defaults to ``$RAVEN_PYTHON_MANIFEST``. + replace + If True, clear the existing registries first; otherwise merge (manifest wins). + + Returns + ------- + dict + The parsed manifest. + """ + manifest = load_manifest(source) + # Imported here (not at module top) so data/binaries can lazily call back + # into this module without an import cycle. + from raven_python import binaries as _binaries + from raven_python import data as _data + + if replace: + _data._DATA_REGISTRY.clear() + _binaries._REGISTRY.clear() + _data._DATA_REGISTRY.update(to_data_registry(manifest)) + _binaries._REGISTRY.update(to_binary_registry(manifest)) + return manifest diff --git a/tests/test_manifest.py b/tests/test_manifest.py new file mode 100644 index 0000000..71998dc --- /dev/null +++ b/tests/test_manifest.py @@ -0,0 +1,125 @@ +"""Tests for the data/binary manifest loader (manifest.py) and its wiring into the +resolvers. Uses file:// URLs + a tmp manifest to avoid the network.""" +import hashlib +import json +from pathlib import Path + +import pytest + +from raven_python import binaries, data, manifest + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +def _sha256(b: bytes) -> str: + return hashlib.sha256(b).hexdigest() + + +@pytest.fixture +def clean_registries(): + """Snapshot and restore the live registries so a test's loads don't leak.""" + data_snap = dict(data._DATA_REGISTRY) + bin_snap = dict(binaries._REGISTRY) + data._DATA_REGISTRY.clear() + binaries._REGISTRY.clear() + yield + data._DATA_REGISTRY.clear() + data._DATA_REGISTRY.update(data_snap) + binaries._REGISTRY.clear() + binaries._REGISTRY.update(bin_snap) + + +def _write_manifest(tmp_path: Path, payload: dict) -> Path: + p = tmp_path / "manifest.json" + p.write_text(json.dumps(payload), encoding="utf-8") + return p + + +def test_converters_strip_to_registry_shapes(): + m = { + "manifest_version": 1, + "data": { + "kegg": { + "version": "kegg116", + "license": 
"metadata that the registry should drop", + "files": {"x.gz": {"url": "https://e/x.gz", "sha256": "a" * 64, "bytes": 5}}, + } + }, + "binaries": { + "diamond": { + "version": "2.1.9", + "provides": ["diamond"], + "license": "GPL-3.0-only", + "platforms": {"linux-x86_64": {"url": "https://e/d.zip", "sha256": "b" * 64, "bytes": 9}}, + } + }, + } + assert manifest.to_data_registry(m) == { + "kegg": {"version": "kegg116", "files": {"x.gz": {"url": "https://e/x.gz", "sha256": "a" * 64}}} + } + assert manifest.to_binary_registry(m) == { + "diamond": { + "version": "2.1.9", + "provides": ["diamond"], + "platforms": {"linux-x86_64": {"url": "https://e/d.zip", "sha256": "b" * 64}}, + } + } + + +def test_load_manifest_rejects_unknown_version(tmp_path): + p = _write_manifest(tmp_path, {"manifest_version": 999}) + with pytest.raises(ValueError, match="manifest_version"): + manifest.load_manifest(p) + + +def test_load_manifest_requires_a_source(monkeypatch): + monkeypatch.delenv(manifest.ENV_MANIFEST, raising=False) + with pytest.raises(ValueError, match="No manifest source"): + manifest.load_manifest() + + +def test_load_into_registries_populates_both(tmp_path, clean_registries): + p = _write_manifest( + tmp_path, + { + "manifest_version": 1, + "data": {"kegg": {"version": "v1", "files": {"a": {"url": "https://e/a", "sha256": "0" * 64}}}}, + "binaries": {"diamond": {"version": "2", "provides": ["diamond"], "platforms": {}}}, + }, + ) + manifest.load_into_registries(p) + assert data._DATA_REGISTRY["kegg"]["version"] == "v1" + assert binaries._REGISTRY["diamond"]["provides"] == ["diamond"] + + +def test_resolver_lazy_autoload_via_env(tmp_path, monkeypatch, clean_registries): + # A real artefact file served over file://, registered through the manifest. 
+ artefact = tmp_path / "reference_model.yml.gz" + artefact.write_bytes(b"hello kegg") + payload = { + "manifest_version": 1, + "data": { + "kegg": { + "version": "kegg-test", + "files": { + artefact.name: {"url": artefact.as_uri(), "sha256": _sha256(artefact.read_bytes())} + }, + } + }, + } + manifest_path = _write_manifest(tmp_path, payload) + monkeypatch.setenv(manifest.ENV_MANIFEST, str(manifest_path)) + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache")) + + # _DATA_REGISTRY is empty; ensure_data_file must lazily load the manifest and fetch. + got = data.ensure_data_file("kegg", artefact.name) + assert got.read_bytes() == b"hello kegg" + assert data._DATA_REGISTRY["kegg"]["version"] == "kegg-test" + + +@pytest.mark.parametrize("name", ["manifest.json", "manifest.example.json"]) +def test_repo_manifests_are_valid(name): + m = manifest.load_manifest(REPO_ROOT / "data" / name) + # Both must convert cleanly to the runtime registry shapes. + manifest.to_data_registry(m) + manifest.to_binary_registry(m) From f5042b2643160199a2b5d2bda2e1807a87733ec9 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Sun, 31 May 2026 00:07:19 +0200 Subject: [PATCH 2/2] docs(data): host assets on existing-repo releases; KEGG redistribution permitted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reflect the chosen distribution model: GitHub release assets live outside the git tree, so a separate data repository is optional — attach assets to dedicated tags (e.g. kegg-kegg116, diamond-2.1.9) on an existing RAVEN repo and reuse the same URLs across raven-python and MATLAB RAVEN. Use Zenodo only for DOIs or files >2 GB. KEGG artefacts are redistributed with permission, so the prior 'confirm rights' caveat is removed. Example/schema URLs repointed from a hypothetical raven-data repo to raven-python. 
--- data/manifest.example.json | 20 +++++++------- data/manifest.schema.json | 4 +-- docs/maintenance/data_manifest.md | 45 ++++++++++++++++++++----------- 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/data/manifest.example.json b/data/manifest.example.json index 80f3e5f..2bc5e81 100644 --- a/data/manifest.example.json +++ b/data/manifest.example.json @@ -5,15 +5,15 @@ "kegg": { "version": "kegg116", "description": "KEGG reference model, KO/reaction tables, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.", - "license": "Derived from KEGG (subscription-licensed bulk dump) — confirm redistribution rights before publishing publicly.", + "license": "Derived from the KEGG database; redistributed with permission from KEGG.", "doi": "10.5281/zenodo.0000000", - "source": "https://github.com/SysBioChalmers/raven-data/releases/tag/kegg-kegg116", + "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/kegg-kegg116", "files": { - "reference_model.yml.gz": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116/reference_model.yml.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "ko_reaction.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116/ko_reaction.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "ko_names.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116/ko_names.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "organism_gene_ko.tsv.xz": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116/organism_gene_ko.tsv.xz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "rxn_flags.tsv.gz": { "url": 
"https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116/rxn_flags.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "reference_model.yml.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/reference_model.yml.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "ko_reaction.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/ko_reaction.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "ko_names.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/ko_names.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "organism_gene_ko.tsv.xz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/organism_gene_ko.tsv.xz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "rxn_flags.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/rxn_flags.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, "prokaryotes.hmm": { "url": "https://zenodo.org/records/0000000/files/prokaryotes.hmm", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 } } } @@ -25,9 +25,9 @@ "description": "DIAMOND protein aligner (homology-based reconstruction).", "license": "GPL-3.0-only — ship the upstream COPYING alongside each ZIP.", "platforms": { - "linux-x86_64": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/diamond-2.1.9/diamond-2.1.9-linux-x86_64.zip", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "macos-arm64": { "url": 
"https://github.com/SysBioChalmers/raven-data/releases/download/diamond-2.1.9/diamond-2.1.9-macos-arm64.zip", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "windows-x86_64": { "url": "https://github.com/SysBioChalmers/raven-data/releases/download/diamond-2.1.9/diamond-2.1.9-windows-x86_64.zip", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 } + "linux-x86_64": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/diamond-2.1.9/diamond-2.1.9-linux-x86_64.zip", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "macos-arm64": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/diamond-2.1.9/diamond-2.1.9-macos-arm64.zip", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "windows-x86_64": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/diamond-2.1.9/diamond-2.1.9-windows-x86_64.zip", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 } } } } diff --git a/data/manifest.schema.json b/data/manifest.schema.json index 15fccb8..a2b4718 100644 --- a/data/manifest.schema.json +++ b/data/manifest.schema.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://github.com/SysBioChalmers/raven-data/manifest.schema.json", - "title": "raven-data manifest", + "$id": "https://github.com/SysBioChalmers/raven-python/manifest.schema.json", + "title": "RAVEN data/binary manifest", "description": "Language-agnostic registry of downloadable raven-python / RAVEN data artefacts and external binary bundles. Consumed by the Python resolvers (raven_python.data / raven_python.binaries) and by MATLAB RAVEN. 
Every file carries a SHA256 so consumers verify integrity after download.", "type": "object", "required": ["manifest_version"], diff --git a/docs/maintenance/data_manifest.md b/docs/maintenance/data_manifest.md index 554f08c..1f1bc75 100644 --- a/docs/maintenance/data_manifest.md +++ b/docs/maintenance/data_manifest.md @@ -31,7 +31,7 @@ Point raven-python at a manifest and the resolvers populate themselves on first verifying each download's checksum: ```bash -export RAVEN_PYTHON_MANIFEST=https://github.com/SysBioChalmers/raven-data/releases/download/manifest-v1/manifest.json +export RAVEN_PYTHON_MANIFEST=https://github.com/SysBioChalmers/raven-python/releases/download/manifest-v1/manifest.json ``` ```python @@ -76,25 +76,39 @@ which computes each SHA256 and byte size: ```bash python scripts/make_registry_snippet.py manifest --manifest data/manifest.json \ --target data --dataset kegg --version kegg116 --dir artefacts \ - --base-url https://github.com/SysBioChalmers/raven-data/releases/download/kegg-kegg116 \ + --base-url https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116 \ --doi 10.5281/zenodo.0000000 --source https://zenodo.org/records/0000000 python scripts/make_registry_snippet.py manifest --manifest data/manifest.json \ --target binary --bundle diamond --version 2.1.9 --provides diamond --dir zips \ - --base-url https://github.com/SysBioChalmers/raven-data/releases/download/diamond-2.1.9 \ + --base-url https://github.com/SysBioChalmers/raven-python/releases/download/diamond-2.1.9 \ --license GPL-3.0-only ``` -## Where to host: GitHub Releases vs Zenodo +## Where to host -Both are just URLs in the manifest, so consumers don't care — choose per asset: +Release **assets are stored separately from the git tree** (GitHub keeps them in a blob +store), so attaching them to a release does **not** bloat the repository. 
A dedicated assets +repository is therefore **optional** — attach the assets to releases on an existing RAVEN +repo (this one, or MATLAB [RAVEN](https://github.com/SysBioChalmers/RAVEN)) and have **both +packages reuse the same release-asset URLs** via this manifest. -- **GitHub Releases** — simplest, free, language-agnostic, up to ~2 GB per file. Good default, - and you're already on GitHub for the code. -- **Zenodo** — adds a citable **DOI**, long-term archival, and handles files larger than 2 GB - (up to 50 GB/record). Right for the KEGG HMM bundle and anything you want citable. +Use **dedicated tags** for the assets — e.g. `kegg-kegg116`, `diamond-2.1.9` — rather than +attaching them to code-milestone releases like `v0.1.0a1`. KEGG data updates roughly yearly +while the code changes often; dedicated tags keep the two cadences decoupled while still +living in one repository. The manifest's per-dataset `version` does the rest (it namespaces +the download cache). -### Auto-publishing to Zenodo from GitHub +Both GitHub Releases and Zenodo are just URLs in the manifest, so consumers don't care — +mix them per file: + +- **GitHub Releases** — simplest, free, language-agnostic, up to **~2 GB per file**. The + default home for the manifest and most assets. +- **Zenodo** — adds a citable **DOI**, long-term archival, and handles files **larger than + 2 GB** (up to 50 GB/record). Use it for individual large HMM libraries or anything you want + citable; point just that file's `url` at the Zenodo record. + +### Auto-publishing to Zenodo from GitHub (only if you need DOIs / >2 GB files) :::{important} The **native GitHub↔Zenodo integration** (flip a switch, publish a Release → DOI) archives @@ -103,10 +117,11 @@ Release. So it only works for assets *committed into the repo*, which defeats th multi-GB binaries. Use it for a *software* DOI, not for the data assets. 
::: -For the data assets, keep everything GitHub-driven with a small **GitHub Action** that, on -release, uploads the assets to Zenodo via its REST API (e.g. [`zenodraft`](https://github.com/zenodraft/zenodraft)). -You cut a normal GitHub Release with the files attached; the Action mirrors them to Zenodo and -mints a new version DOI. Drop this in the data repo as `.github/workflows/zenodo.yml`: +If you do want Zenodo DOIs (or need to host files >2 GB), keep it GitHub-driven with a small +**GitHub Action** that, on release, uploads the assets to Zenodo via its REST API (e.g. +[`zenodraft`](https://github.com/zenodraft/zenodraft)). You cut a normal GitHub Release with +the files attached; the Action mirrors them to Zenodo and mints a new version DOI. (Files above GitHub's ~2 GB per-asset limit cannot be attached to a Release, so upload those to Zenodo directly and point that file's `url` at the Zenodo record.) Drop this +into whichever repo hosts the asset releases as `.github/workflows/zenodo.yml`: ```yaml name: Mirror release assets to Zenodo @@ -136,5 +151,5 @@ ever interact with GitHub Releases; Zenodo archiving + DOIs happen automatically | Asset | Home | Notes | | --- | --- | --- | | **Software binaries** (BLAST / DIAMOND / HMMER) | **bioconda** preferred; or release ZIPs via the resolver | DIAMOND is **GPL-3.0** — ship its license text in the ZIP; keep it as a separate asset, never bundled into the MIT wheel. | -| **KEGG HMMs / tables** | **Zenodo** (DOI, >2 GB, archival) | ⚠️ Derived from the subscription-licensed KEGG dump — **confirm redistribution rights with KEGG before publishing publicly**. If not permitted, keep access-gated and have users build from their own dump (the resolver supports a local dir). | +| **KEGG HMMs / tables** | GitHub release (dedicated `kegg-*` tag); Zenodo for libraries >2 GB | Derived from the KEGG dump and **redistributed with permission from KEGG**. Note the provenance in the release notes / manifest `license`. | | **Template models** (Human-GEM, yeast-GEM) | **Don't re-host** | Fetch from their canonical repos by pinned release tag — respects their licenses and avoids stale copies. |