@@ -497,6 +497,43 @@ def _order_columns_by_groups(columns, groups, within_group_sort="schema"):
497497 return ordered
498498
499499
500+ def _dtype_to_str (dtype ):
501+ """
502+ Convert a dtype to its string representation.
503+
504+ Handles:
505+ - numpy dtype instances: np.dtype('float32') → 'float32'
506+ - numpy type classes: np.float32 → 'float32'
507+ - strings: 'float32' → 'float32'
508+ - None: None → None
509+
510+ Parameters
511+ ----------
512+ dtype : various
513+ A dtype in various formats
514+
515+ Returns
516+ -------
517+ str or None
518+ String representation of dtype
519+ """
520+ if dtype is None :
521+ return None
522+ if isinstance (dtype , str ):
523+ return dtype
524+ # np.dtype instances have .name
525+ if hasattr (dtype , 'name' ):
526+ return dtype .name
527+ # numpy type classes like np.float32
528+ if hasattr (dtype , '__name__' ):
529+ return dtype .__name__
530+ # Fallback - try to convert via np.dtype
531+ try :
532+ return np .dtype (dtype ).name
533+ except (TypeError , AttributeError ):
534+ return str (dtype )
535+
536+
500537def _export_column_spec_v2 (col_name , col_info , df = None ):
501538 """
502539 Export a single column specification in v2 format.
@@ -552,16 +589,16 @@ def _export_column_spec_v2(col_name, col_info, df=None):
552589 return result
553590
554591
555- def _export_subframe_schema_v2 (subframe_entry , include_compression = False ):
592+ def _export_subframe_schema_v2 (subframe_entry , include_precision_stats = False ):
556593 """
557594 Export a subframe's full schema recursively.
558595
559596 Parameters
560597 ----------
561598 subframe_entry : dict
562599 Entry from SubframeRegistry: {'frame': adf, 'index': [...]}
563- include_compression : bool
564- Whether to include compression section
600+ include_precision_stats : bool
601+ Whether to include precision statistics in compression section
565602
566603 Returns
567604 -------
@@ -604,10 +641,34 @@ def _export_subframe_schema_v2(subframe_entry, include_compression=False):
604641 if sf_schema .get ('groups' ):
605642 result ['groups' ] = sf_schema ['groups' ]
606643
607- # Compression (optional)
608- if include_compression and sf_schema .get ('compression' ):
609- comp = sf_schema ['compression' ].copy ()
610- result ['compression' ] = comp
644+ # Compression - always include when present (required for data interpretation)
645+ if sf_schema .get ('compression' ):
646+ comp = {}
647+ for name , info in sf_schema ['compression' ].items ():
648+ if name == '__meta__' :
649+ comp ['__meta__' ] = copy .deepcopy (info )
650+ continue
651+
652+ entry = {}
653+ # Required fields
654+ for field in ['compressed_col' , 'compress_expr' , 'decompress_expr' ,
655+ 'state' , 'original_removed' ]:
656+ if field in info :
657+ entry [field ] = info [field ]
658+
659+ # Convert dtypes to strings using helper
660+ for dtype_field in ['compressed_dtype' , 'decompressed_dtype' ]:
661+ if dtype_field in info and info [dtype_field ] is not None :
662+ entry [dtype_field ] = _dtype_to_str (info [dtype_field ])
663+
664+ # Optional precision stats
665+ if include_precision_stats and 'precision' in info :
666+ entry ['precision' ] = copy .deepcopy (info ['precision' ])
667+
668+ comp [name ] = entry
669+
670+ if comp :
671+ result ['compression' ] = comp
611672
612673 return result
613674
@@ -1584,9 +1645,7 @@ def build_graph():
15841645 try :
15851646 ordered = list (nx .topological_sort (g .subgraph (expanded )))
15861647 result = [n for n in ordered if n in expanded ]
1587- # Change to:
1588- except (nx .NetworkXError , nx .NetworkXUnfeasible ):
1589- # Cycle detected or graph issue, return unordered
1648+ except nx .NetworkXError :
15901649 result = list (expanded )
15911650
15921651 return result
@@ -3046,7 +3105,7 @@ def compress_columns(self, compression_spec=None, columns=None, suffix='_c', dro
30463105
30473106 # Standard state validation for non-selective modes
30483107 if current_state == CompressionState .COMPRESSED :
3049- # Already compressed - skip (idempotent)
3108+ # Idempotent behavior: skip already compressed columns
30503109 continue
30513110 elif current_state == CompressionState .SCHEMA_ONLY :
30523111 # Valid transition: SCHEMA_ONLY → COMPRESSED
@@ -4358,7 +4417,7 @@ def export_schema(self):
43584417 dict
43594418 JSON-safe schema dictionary
43604419 """
4361- return self .export_schema_v2 (include_compression = True )
4420+ return self .export_schema_v2 (include_precision_stats = True )
43624421
43634422 def save_schema (self , path ):
43644423 """
@@ -4369,7 +4428,7 @@ def save_schema(self, path):
43694428 path : str
43704429 Path to save schema JSON file
43714430 """
4372- self .save_schema_v2 (path , include_compression = True )
4431+ self .save_schema_v2 (path , include_precision_stats = True )
43734432
43744433 @staticmethod
43754434 def load_schema (path ):
@@ -4510,23 +4569,32 @@ def get_column_metadata(self, column):
45104569 # Return all fields except dtype and expr
45114570 return {k : v for k , v in col_info .items () if k not in ('dtype' , 'expr' , 'constant' )}
45124571
4513- def export_schema_v2 (self , include_compression = True , include_subframes = True ,
4572+ def export_schema_v2 (self , include_precision_stats = False , include_subframes = True ,
45144573 within_group_sort = "schema" ):
45154574 """
45164575 Export schema as JSON-safe dictionary (v2 format).
45174576
45184577 Features:
45194578 - Uniform object format for all columns: {"dtype": "..."}
45204579 - No "expr": null for physical columns
4521- - Groups preserved
4580+ - Groups preserved (simple named lists of column names)
45224581 - Column metadata preserved (unit, axisLabel, etc.)
45234582 - Recursive subframe schemas (nested)
4524- - Compression optional
4583+ - Compression always included (definitions required for data interpretation)
4584+
4585+ Schema v2 canonical order:
4586+ 1. __meta__ - version, creation info
4587+ 2. columns - column definitions (physical + aliases)
4588+ 3. groups - logical column groupings (if present)
4589+ 4. compression - storage/compression configuration
4590+ 5. subframes - hierarchical structure (related tables)
45254591
45264592 Parameters
45274593 ----------
4528- include_compression : bool, default=False
4529- Whether to include compression section
4594+ include_precision_stats : bool, default=False
4595+ Whether to include precision statistics (RMSE, max_error, etc.) in
4596+ compression section. The compression definitions (expressions, dtypes,
4597+ state) are always included.
45304598 include_subframes : bool, default=True
45314599 Whether to include recursive subframe schemas
45324600 within_group_sort : str, default="schema"
@@ -4536,25 +4604,22 @@ def export_schema_v2(self, include_compression=True, include_subframes=True,
45364604 Returns
45374605 -------
45384606 dict
4539- JSON-safe schema dictionary
4607+ JSON-safe schema dictionary with canonical key ordering
45404608 """
4541- result = {}
4609+ from collections import OrderedDict
4610+ result = OrderedDict ()
45424611
4543- # __meta__ section
4612+ # 1. __meta__ section (always first)
45444613 result ['__meta__' ] = {
45454614 'schema_version' : SCHEMA_VERSION_V2 ,
45464615 'created_at' : datetime .now (timezone .utc ).isoformat (),
45474616 'schema_id' : self ._schema .get ('__meta__' , {}).get ('schema_id' )
45484617 }
45494618
4550- # Groups section (if present)
4551- groups = self ._schema .get ('groups' , {})
4552- if groups :
4553- result ['groups' ] = groups
4554-
4555- # Columns section
4556- columns = {}
4619+ # 2. Columns section
4620+ columns = OrderedDict ()
45574621 schema_columns = self ._schema .get ('columns' , {})
4622+ groups = self ._schema .get ('groups' , {})
45584623
45594624 # Physical columns from DataFrame first
45604625 for col in self .df .columns :
@@ -4570,63 +4635,96 @@ def export_schema_v2(self, include_compression=True, include_subframes=True,
45704635 columns = _order_columns_by_groups (columns , groups , within_group_sort )
45714636 result ['columns' ] = columns
45724637
4573- # Subframes section
4574- if include_subframes and hasattr (self , '_subframes' ):
4575- subframes = {}
4576- for name , entry in self ._subframes .items ():
4577- subframes [name ] = _export_subframe_schema_v2 (entry , include_compression )
4578- if subframes :
4579- result ['subframes' ] = subframes
4580- elif self ._schema .get ('subframes' ):
4581- # Just include index info if no registry
4582- result ['subframes' ] = {}
4583- for name , info in self ._schema .get ('subframes' , {}).items ():
4584- index_cols = info .get ('index' , [])
4585- if isinstance (index_cols , str ):
4586- index_cols = [index_cols ]
4587- result ['subframes' ][name ] = {'index' : index_cols }
4588-
4589- # Compression section (optional)
4590- if include_compression and self ._schema .get ('compression' ):
4591- comp = copy .deepcopy (self ._schema ['compression' ])
4592- # Remove verbose precision stats from export
4593- for name , info in comp .items ():
4594- if name != '__meta__' and 'precision' in info :
4595- del info ['precision' ]
4596- # Convert dtypes to strings
4597- for name , info in comp .items ():
4638+ # 3. Groups section (if present) - simple named lists of column names
4639+ if groups :
4640+ result ['groups' ] = groups
4641+
4642+ # 4. Compression section - ALWAYS included when present (required for data interpretation)
4643+ if self ._schema .get ('compression' ):
4644+ comp = OrderedDict ()
4645+ for name , info in self ._schema ['compression' ].items ():
45984646 if name == '__meta__' :
4647+ # Preserve compression metadata
4648+ comp ['__meta__' ] = copy .deepcopy (info )
45994649 continue
4650+
4651+ # Build compression entry with essential fields
4652+ entry = {}
4653+
4654+ # Required fields for data interpretation
4655+ if 'compressed_col' in info :
4656+ entry ['compressed_col' ] = info ['compressed_col' ]
4657+ if 'compress_expr' in info :
4658+ entry ['compress_expr' ] = info ['compress_expr' ]
4659+ if 'decompress_expr' in info :
4660+ entry ['decompress_expr' ] = info ['decompress_expr' ]
4661+
4662+ # Convert dtypes to strings using helper
46004663 for dtype_field in ['compressed_dtype' , 'decompressed_dtype' ]:
46014664 if dtype_field in info and info [dtype_field ] is not None :
4602- dtype = info [dtype_field ]
4603- if hasattr (dtype , 'name' ):
4604- info [dtype_field ] = dtype .name
4605- elif not isinstance (dtype , str ):
4606- info [dtype_field ] = str (dtype )
4607- # Remove non-serializable monitor functions
4665+ entry [dtype_field ] = _dtype_to_str (info [dtype_field ])
4666+
4667+ # State and flags
4668+ if 'state' in info :
4669+ entry ['state' ] = info ['state' ]
4670+ if 'original_removed' in info :
4671+ entry ['original_removed' ] = info ['original_removed' ]
4672+
4673+ # Optional: precision statistics (can be verbose)
4674+ if include_precision_stats and 'precision' in info :
4675+ entry ['precision' ] = copy .deepcopy (info ['precision' ])
4676+
4677+ # Optional: monitor info (without non-serializable func)
46084678 if 'monitor' in info and info ['monitor' ]:
46094679 monitor = info ['monitor' ]
46104680 if 'func' in monitor :
4611- info ['monitor' ] = {k : v for k , v in monitor .items () if k != 'func' }
4612- result ['compression' ] = comp
4681+ entry ['monitor' ] = {k : v for k , v in monitor .items () if k != 'func' }
4682+ else :
4683+ entry ['monitor' ] = copy .deepcopy (monitor )
4684+
4685+ comp [name ] = entry
4686+
4687+ if comp :
4688+ result ['compression' ] = comp
4689+
4690+ # 5. Subframes section (last - hierarchical structure)
4691+ if include_subframes :
4692+ subframes = OrderedDict ()
4693+
4694+ # Try subframe registry first
4695+ if hasattr (self , '_subframes' ):
4696+ for name , entry in self ._subframes .items ():
4697+ subframes [name ] = _export_subframe_schema_v2 (entry , include_precision_stats )
4698+
4699+ # Fall back to schema if registry is empty
4700+ if not subframes and self ._schema .get ('subframes' ):
4701+ for name , info in self ._schema .get ('subframes' , {}).items ():
4702+ index_cols = info .get ('index' , [])
4703+ if isinstance (index_cols , str ):
4704+ index_cols = [index_cols ]
4705+ subframes [name ] = {'index' : index_cols }
4706+
4707+ if subframes :
4708+ result ['subframes' ] = subframes
46134709
46144710 return result
46154711
4616- def save_schema_v2 (self , path , include_compression = True , include_subframes = True ,
4712+ def save_schema_v2 (self , path , include_precision_stats = False , include_subframes = True ,
46174713 indent = 2 , max_line_length = 100 , within_group_sort = "schema" ):
46184714 """
46194715 Save schema to JSON file (v2 format).
46204716
46214717 Short entries stay on one line, long entries are split.
46224718 Columns ordered by groups, then ungrouped columns.
4719+ Compression definitions are always saved (required for data interpretation).
46234720
46244721 Parameters
46254722 ----------
46264723 path : str
46274724 Output file path
4628- include_compression : bool, default=False
4629- Whether to include compression section
4725+ include_precision_stats : bool, default=False
4726+ Whether to include precision statistics (RMSE, max_error, etc.) in
4727+ compression section. Compression definitions are always included.
46304728 include_subframes : bool, default=True
46314729 Whether to include recursive subframe schemas
46324730 indent : int, default=2
@@ -4638,7 +4736,7 @@ def save_schema_v2(self, path, include_compression=True, include_subframes=True,
46384736 "alphabetic" - sort alphabetically within each group
46394737 """
46404738 schema = self .export_schema_v2 (
4641- include_compression = include_compression ,
4739+ include_precision_stats = include_precision_stats ,
46424740 include_subframes = include_subframes ,
46434741 within_group_sort = within_group_sort
46444742 )
@@ -4672,6 +4770,12 @@ def load_schema_v2(path):
46724770 # Detect version
46734771 meta = schema .get ('__meta__' , {})
46744772 version = meta .get ('schema_version' , 1 )
4773+ # Handle string versions like '2.0'
4774+ if isinstance (version , str ):
4775+ try :
4776+ version = float (version )
4777+ except (ValueError , TypeError ):
4778+ version = 1
46754779
46764780 if version >= 2 :
46774781 # Already v2 format
0 commit comments