2626import os
2727import sys
2828from pathlib import Path
29+ from typing import Optional
2930
3031from . import parse , run
3132from .projection .canonical import to_canonical
3233from .projection .cbor import canonical_cbor
33- from .runtime .errors import CodifideError
34- from .store import StoreError , SymbolStore , symbol_hash
34+ from .runtime .errors import CodifideError , ParseError
35+ from .store import StoreError , SymbolStore , symbol_hash , symbol_hash_json
36+
37+
38+ # Maximum size of a single .cod source file the CLI will read. Source
39+ # files are small by design; anything over this is either a mistake
40+ # (piped binary, /dev/zero) or hostile. The Rust canonical binary
41+ # already enforces the same bound for its JSON input (2026-05-10 CBOR
42+ # audit P1-7); this is the Python counterpart. See
43+ # dispatches/2026-05-11-cli-audit.md for the finding that motivated
44+ # adding it here.
45+ _MAX_SOURCE_BYTES = 16 * 1024 * 1024 # 16 MiB
3546
3647
def _read(path: str) -> str:
    """Return the text of a .cod source file, refusing oversized input.

    At most ``_MAX_SOURCE_BYTES + 1`` bytes are requested from the file;
    the one extra byte is what lets "exactly at the cap" be told apart
    from "over the cap" without reading an unbounded amount.

    Raises:
        ParseError: if the file cannot be opened or read, if it holds
            more than ``_MAX_SOURCE_BYTES`` bytes, or if its contents
            are not valid UTF-8. OS-level and decoding errors are
            chained as the cause so callers only handle Codifide errors.
    """
    limit = _MAX_SOURCE_BYTES
    try:
        with open(path, "rb") as source:
            raw = source.read(limit + 1)
    except OSError as err:
        raise ParseError(f"cannot read {path!r}: {err}") from err

    if len(raw) <= limit:
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError as err:
            raise ParseError(
                f"source file {path!r} is not valid UTF-8: {err}"
            ) from err

    # Only reachable when the extra sentinel byte was actually read.
    raise ParseError(
        f"source file {path!r} exceeds {_MAX_SOURCE_BYTES} bytes; "
        f"refuse to read more. Codifide source files are small by design."
    )
3973
4074
4175def cmd_run (args : argparse .Namespace ) -> int :
@@ -134,7 +168,7 @@ def cmd_verify(args: argparse.Namespace) -> int:
134168 from .projection .canonical import canonical_bytes as canon_json
135169 from .projection .cbor import canonical_cbor
136170 from .runtime .interpreter import _check_transitive_effects , _ResolvedImports
137- from .store import SymbolStore , symbol_hash , symbol_hash_cbor
171+ from .store import SymbolStore , symbol_hash , symbol_hash_cbor , symbol_hash_json
138172
139173 try :
140174 src = _read (args .file )
@@ -167,18 +201,20 @@ def cmd_verify(args: argparse.Namespace) -> int:
167201 return 1
168202
169203 # Per-symbol identity — what another agent would receive if they
170- # imported each symbol by content hash.
204+ # imported each symbol by content hash. CBOR is the primary form
205+ # post the 2026-05-11 migration; JSON is shown as a legacy
206+ # inspection aid.
171207 print (f"module: { module .name } " )
172208 print (f"symbols: { len (module .symbols )} " )
173209 print (f"imports: { len (module .imports )} " )
174210 print (f"bytes: JSON { len (j_bytes )} , CBOR { len (c_bytes )} " )
175211 print ()
176212 for defn in module .symbols :
177- h_json = symbol_hash (defn .name , defn )
178213 h_cbor = symbol_hash_cbor (defn .name , defn )
214+ h_json = symbol_hash_json (defn .name , defn )
179215 print (f" { defn .name } " )
180- print (f" json { h_json } " )
181- print (f" cbor { h_cbor } " )
216+ print (f" cbor { h_cbor } (primary) " )
217+ print (f" json { h_json } (legacy) " )
182218 return 0
183219
184220
@@ -201,7 +237,11 @@ def cmd_store_put(args: argparse.Namespace) -> int:
201237 try :
202238 module = parse (_read (args .file ))
203239 store = SymbolStore (_store_root (args ))
204- entries = store .put_module (module , cbor = args .cbor )
240+ # As of 2026-05-11, the primary put path is CBOR. ``--json``
241+ # opts into the legacy JSON identity; ``--cbor`` is accepted
242+ # but redundant. Either flag alone is explicit and fine.
243+ use_cbor = not args .json
244+ entries = store .put_module (module , cbor = use_cbor )
205245 for name , identity in entries :
206246 print (f"{ identity } \t { name } " )
207247 return 0
@@ -231,10 +271,13 @@ def cmd_store_list(args: argparse.Namespace) -> int:
231271def cmd_store_hash (args : argparse .Namespace ) -> int :
232272 # Compute identities without writing anything — useful for scripting
233273 # (e.g. seeing what a module would produce before committing to a put).
274+ # Defaults to the primary (CBOR) identity; ``--json`` prints the
275+ # legacy JSON identity for each symbol.
234276 try :
235277 module = parse (_read (args .file ))
278+ hash_fn = symbol_hash_json if args .json else symbol_hash
236279 for defn in module .symbols :
237- print (f"{ symbol_hash (defn .name , defn )} \t { defn .name } " )
280+ print (f"{ hash_fn (defn .name , defn )} \t { defn .name } " )
238281 return 0
239282 except CodifideError as e :
240283 print (f"codifide: { e } " , file = sys .stderr )
@@ -283,22 +326,27 @@ def cmd_store_index(args: argparse.Namespace) -> int:
283326 )
284327
285328 # The module's content identity is the hash of its canonical bytes.
286- # We compute and store it by writing the canonical JSON directly,
287- # bypassing the symbol store's per-definition envelope since an
288- # index has no definitions.
329+ # Post 2026-05-11 the primary identity is CBOR-over-bytes;
330+ # ``--json`` emits the legacy JSON-hashed identity.
289331 import hashlib
290332 from .projection .canonical import to_canonical
333+ from .projection .cbor import canonical_cbor
291334
292335 canonical_obj = to_canonical (index_module )
293- data = json .dumps (
294- canonical_obj , sort_keys = True , separators = ("," , ":" ), ensure_ascii = True
295- ).encode ("utf-8" )
336+ if getattr (args , "json" , False ):
337+ data = json .dumps (
338+ canonical_obj , sort_keys = True , separators = ("," , ":" ), ensure_ascii = True
339+ ).encode ("utf-8" )
340+ suffix = ".json"
341+ else :
342+ data = canonical_cbor (canonical_obj )
343+ suffix = ".cbor"
296344 identity = f"sha256:{ hashlib .sha256 (data ).hexdigest ()} "
297345
298346 # Write through the store's atomic-write path. We use the internal
299347 # method because the public API is per-symbol; indices are modules,
300- # not symbols. The on-disk layout is identical.
301- store ._write_atomic (identity , data )
348+ # not symbols. The on-disk layout is identical beyond suffix .
349+ store ._write_atomic (identity , data , suffix = suffix )
302350 print (f"{ identity } \t { index_module .name } " )
303351 return 0
304352
@@ -360,6 +408,103 @@ def cmd_store_verify(args: argparse.Namespace) -> int:
360408 return 0
361409
362410
def cmd_store_gc(args: argparse.Namespace) -> int:
    """Report or delete identities unreachable from the ROOTS file.

    Dry-run by default — safe to invoke; prints a plan without
    touching the store. Pass ``--execute`` to actually delete.
    ``--execute`` refuses to run if the ``ROOTS`` file is empty or
    missing; the footgun guard is deliberate.

    See ``dispatches/2026-05-11-store-gc-design.readout.md``.

    Args:
        args: parsed CLI namespace; reads ``args.execute`` plus whatever
            ``_store_root`` needs to locate the store.

    Returns:
        0 when the report was produced, 1 when the store or the
        collector raised an error (printed to stderr as ``codifide: ...``).
    """
    from .store.gc import GCError

    try:
        # Opening the store lives inside the ``try`` so a StoreError is
        # reported the same way cmd_store_roots_add reports it, instead
        # of escaping as a traceback.
        store = SymbolStore(_store_root(args))
        report = store.gc(execute=args.execute)
    except (GCError, StoreError) as exc:
        print(f"codifide: {exc}", file=sys.stderr)
        return 1

    # The summary line is printed for both dry-run and executed reports.
    print(report.summary())

    if report.executed:
        if report.deleted:
            for identity in report.deleted:
                print(f"  deleted {identity}")
        return 0

    # Dry-run: warn early that --execute would refuse with no roots.
    if report.roots_count == 0:
        print(
            "codifide: ROOTS is empty or missing; `--execute` will refuse.\n"
            "  Add roots with `codifide store roots add <identity>`.",
            file=sys.stderr,
        )
    if report.deleted:
        print("would delete:")
        for identity in report.deleted:
            print(f"  {identity}")
        print("Pass --execute to actually delete.")
    return 0
447+
448+
def cmd_store_roots_list(args: argparse.Namespace) -> int:
    """Print each identity returned by ``store.roots()``, one per line.

    Returns:
        0 on success, 1 if opening or reading the store raised
        ``StoreError`` (reported on stderr as ``codifide: ...``).
    """
    try:
        # Store construction is inside the ``try`` so its failures are
        # reported like those of cmd_store_roots_add, not as a traceback.
        store = SymbolStore(_store_root(args))
        for identity in store.roots():
            print(identity)
    except StoreError as exc:
        print(f"codifide: {exc}", file=sys.stderr)
        return 1
    return 0
454+
455+
def cmd_store_roots_add(args: argparse.Namespace) -> int:
    """Register ``args.identity`` as a root in the store.

    Returns:
        0 once the root has been added, 1 if the store raised
        ``StoreError`` (the message is printed to stderr).
    """
    try:
        SymbolStore(_store_root(args)).add_root(args.identity)
    except StoreError as err:
        print(f"codifide: {err}", file=sys.stderr)
        return 1
    else:
        return 0
464+
465+
def cmd_store_roots_remove(args: argparse.Namespace) -> int:
    """Remove ``args.identity`` from the store's roots.

    Returns:
        0 if ``store.remove_root`` reported a removal; 1 if the identity
        was not a root, or if the store raised ``StoreError``. Both
        failure cases print a ``codifide: ...`` message to stderr.
    """
    try:
        # Store construction is inside the ``try`` so its failures are
        # reported like those of cmd_store_roots_add, not as a traceback.
        store = SymbolStore(_store_root(args))
        removed = store.remove_root(args.identity)
    except StoreError as exc:
        print(f"codifide: {exc}", file=sys.stderr)
        return 1
    if not removed:
        print(
            f"codifide: {args.identity} was not in ROOTS; nothing to remove",
            file=sys.stderr,
        )
        return 1
    return 0
476+
477+
def cmd_dispatch_index(args: argparse.Namespace) -> int:
    """Regenerate or check-drift the dispatches/INDEX.md file.

    The dispatches directory is located relative to this module: the
    parent of the package directory is treated as the repository root.

    Args:
        args: parsed CLI namespace; ``args.check`` selects verify-only
            mode instead of rewriting the index.

    Returns:
        0 when the index was written, or when ``--check`` found it in
        sync; 1 when the dispatches directory is missing or ``--check``
        detected drift.
    """
    # Only the helpers this command calls are imported; ``Path`` comes
    # from the module-level import rather than a local alias.
    from .dispatch_index import check_index, write_index

    repo_root = Path(__file__).resolve().parent.parent
    dispatch_dir = repo_root / "dispatches"
    if not dispatch_dir.exists():
        print(
            f"codifide: no dispatches directory at {dispatch_dir}",
            file=sys.stderr,
        )
        return 1

    if args.check:
        if check_index(dispatch_dir):
            return 0
        print(
            "codifide: dispatches/INDEX.md is out of sync with "
            "dispatches/ contents.\n"
            "  Regenerate with `python3 -m codifide dispatch-index`.",
            file=sys.stderr,
        )
        return 1

    path = write_index(dispatch_dir)
    print(f"wrote {path.relative_to(repo_root)}")
    return 0
506+
507+
363508def _default_entry (module ) -> str :
364509 # If there's only one definition, use it; else prefer `main`.
365510 if len (module .symbols ) == 1 :
@@ -419,6 +564,17 @@ def main(argv=None) -> int:
419564 p_verify .add_argument ("file" )
420565 p_verify .set_defaults (func = cmd_verify )
421566
567+ p_dispatch_index = sub .add_parser (
568+ "dispatch-index" ,
569+ help = "regenerate dispatches/INDEX.md from the directory contents" ,
570+ )
571+ p_dispatch_index .add_argument (
572+ "--check" ,
573+ action = "store_true" ,
574+ help = "verify the checked-in INDEX.md matches what would be generated" ,
575+ )
576+ p_dispatch_index .set_defaults (func = cmd_dispatch_index )
577+
422578 # Symbol store. A store root can be passed via --store or the
423579 # CODIFIDE_STORE environment variable; defaults to ~/.codifide/store.
424580 p_store = sub .add_parser (
@@ -436,7 +592,12 @@ def main(argv=None) -> int:
436592 p_put .add_argument (
437593 "--cbor" ,
438594 action = "store_true" ,
439- help = "store in CBOR form (produces different identities than JSON)" ,
595+ help = "(default) store in CBOR form — primary identity since 2026-05-11" ,
596+ )
597+ p_put .add_argument (
598+ "--json" ,
599+ action = "store_true" ,
600+ help = "store in legacy JSON form (produces different identities than CBOR)" ,
440601 )
441602 p_put .set_defaults (func = cmd_store_put )
442603
@@ -452,6 +613,11 @@ def main(argv=None) -> int:
452613 help = "print the content hash of every symbol in a module without storing" ,
453614 )
454615 p_hash .add_argument ("file" )
616+ p_hash .add_argument (
617+ "--json" ,
618+ action = "store_true" ,
619+ help = "print legacy JSON hashes instead of primary CBOR hashes" ,
620+ )
455621 p_hash .set_defaults (func = cmd_store_hash )
456622
457623 p_index = store_sub .add_parser (
@@ -468,6 +634,11 @@ def main(argv=None) -> int:
468634 nargs = "+" ,
469635 help = "name=sha256:<hex> pairs defining the index's exports" ,
470636 )
637+ p_index .add_argument (
638+ "--json" ,
639+ action = "store_true" ,
640+ help = "publish the index as legacy JSON (default is CBOR since 2026-05-11)" ,
641+ )
471642 p_index .set_defaults (func = cmd_store_index )
472643
473644 p_verify = store_sub .add_parser (
@@ -477,6 +648,37 @@ def main(argv=None) -> int:
477648 p_verify .add_argument ("hash" , help = "identity of the module to verify" )
478649 p_verify .set_defaults (func = cmd_store_verify )
479650
651+ # -- Garbage collection (2026-05-11 design dispatch) --------------
652+ p_gc = store_sub .add_parser (
653+ "gc" ,
654+ help = "report or delete identities unreachable from the ROOTS file" ,
655+ )
656+ p_gc .add_argument (
657+ "--execute" ,
658+ action = "store_true" ,
659+ help = "actually delete (default is dry-run)" ,
660+ )
661+ p_gc .set_defaults (func = cmd_store_gc )
662+
663+ p_roots = store_sub .add_parser (
664+ "roots" ,
665+ help = "manage the ROOTS file that declares live identities for GC" ,
666+ )
667+ roots_sub = p_roots .add_subparsers (dest = "roots_cmd" , required = True )
668+
669+ p_roots_list = roots_sub .add_parser ("list" , help = "print current roots" )
670+ p_roots_list .set_defaults (func = cmd_store_roots_list )
671+
672+ p_roots_add = roots_sub .add_parser ("add" , help = "add an identity as a root" )
673+ p_roots_add .add_argument ("identity" )
674+ p_roots_add .set_defaults (func = cmd_store_roots_add )
675+
676+ p_roots_remove = roots_sub .add_parser (
677+ "remove" , help = "remove an identity from the roots"
678+ )
679+ p_roots_remove .add_argument ("identity" )
680+ p_roots_remove .set_defaults (func = cmd_store_roots_remove )
681+
480682 args = parser .parse_args (argv )
481683 return args .func (args )
482684
0 commit comments