Skip to content

Commit e3efbf5

Browse files
author
miranov25
committed
## Schema v2 Ordering Implementation - Review Request
### Summary We implemented the schema v2 canonical ordering as agreed in our previous review: **New order:** `__meta__` → `columns` → `groups` → `compression` → `subframes` This matches the ecosystem pattern (Arrow, Parquet, JSON Schema, HDF5): > meta → fields → semantic grouping → storage details → hierarchy ### Key Changes 1. **Canonical ordering** using Python's OrderedDict 2. **Groups** remain simple named lists of column names (no nested metadata) 3. **Compression** definitions now always included (required for data interpretation) 4. **Renamed parameter**: `include_compression` → `include_precision_stats` for clarity 5. **Bug fixes**: subframes fallback, string version handling, exception simplification ### Test Results - 368 tests passing - 6 new ordering tests added - All backward compatibility maintained ### Questions for Review 1. Is the implementation consistent with what we agreed? 2. Any concerns about the `include_precision_stats` parameter rename? 3. Should we document the canonical ordering in SCHEMA.md now or defer? ### Files Changed - `AliasDataFrame.py`: ~150 lines changed - `export_schema_v2()`: reordered with OrderedDict - `_dtype_to_str()`: new helper function - `_export_subframe_schema_v2()`: aligned with main export - `load_schema_v2()`: robust version parsing - `test_alias_dataframe.py`: +120 lines - `TestSchemaV2Ordering`: 6 new tests for ordering verification
1 parent 6546f49 commit e3efbf5

2 files changed

Lines changed: 359 additions & 65 deletions

File tree

UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py

Lines changed: 169 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,43 @@ def _order_columns_by_groups(columns, groups, within_group_sort="schema"):
497497
return ordered
498498

499499

500+
def _dtype_to_str(dtype):
501+
"""
502+
Convert a dtype to its string representation.
503+
504+
Handles:
505+
- numpy dtype instances: np.dtype('float32') → 'float32'
506+
- numpy type classes: np.float32 → 'float32'
507+
- strings: 'float32' → 'float32'
508+
- None: None → None
509+
510+
Parameters
511+
----------
512+
dtype : various
513+
A dtype in various formats
514+
515+
Returns
516+
-------
517+
str or None
518+
String representation of dtype
519+
"""
520+
if dtype is None:
521+
return None
522+
if isinstance(dtype, str):
523+
return dtype
524+
# np.dtype instances have .name
525+
if hasattr(dtype, 'name'):
526+
return dtype.name
527+
# numpy type classes like np.float32
528+
if hasattr(dtype, '__name__'):
529+
return dtype.__name__
530+
# Fallback - try to convert via np.dtype
531+
try:
532+
return np.dtype(dtype).name
533+
except (TypeError, AttributeError):
534+
return str(dtype)
535+
536+
500537
def _export_column_spec_v2(col_name, col_info, df=None):
501538
"""
502539
Export a single column specification in v2 format.
@@ -552,16 +589,16 @@ def _export_column_spec_v2(col_name, col_info, df=None):
552589
return result
553590

554591

555-
def _export_subframe_schema_v2(subframe_entry, include_compression=False):
592+
def _export_subframe_schema_v2(subframe_entry, include_precision_stats=False):
556593
"""
557594
Export a subframe's full schema recursively.
558595
559596
Parameters
560597
----------
561598
subframe_entry : dict
562599
Entry from SubframeRegistry: {'frame': adf, 'index': [...]}
563-
include_compression : bool
564-
Whether to include compression section
600+
include_precision_stats : bool
601+
Whether to include precision statistics in compression section
565602
566603
Returns
567604
-------
@@ -604,10 +641,34 @@ def _export_subframe_schema_v2(subframe_entry, include_compression=False):
604641
if sf_schema.get('groups'):
605642
result['groups'] = sf_schema['groups']
606643

607-
# Compression (optional)
608-
if include_compression and sf_schema.get('compression'):
609-
comp = sf_schema['compression'].copy()
610-
result['compression'] = comp
644+
# Compression - always include when present (required for data interpretation)
645+
if sf_schema.get('compression'):
646+
comp = {}
647+
for name, info in sf_schema['compression'].items():
648+
if name == '__meta__':
649+
comp['__meta__'] = copy.deepcopy(info)
650+
continue
651+
652+
entry = {}
653+
# Required fields
654+
for field in ['compressed_col', 'compress_expr', 'decompress_expr',
655+
'state', 'original_removed']:
656+
if field in info:
657+
entry[field] = info[field]
658+
659+
# Convert dtypes to strings using helper
660+
for dtype_field in ['compressed_dtype', 'decompressed_dtype']:
661+
if dtype_field in info and info[dtype_field] is not None:
662+
entry[dtype_field] = _dtype_to_str(info[dtype_field])
663+
664+
# Optional precision stats
665+
if include_precision_stats and 'precision' in info:
666+
entry['precision'] = copy.deepcopy(info['precision'])
667+
668+
comp[name] = entry
669+
670+
if comp:
671+
result['compression'] = comp
611672

612673
return result
613674

@@ -1584,9 +1645,7 @@ def build_graph():
15841645
try:
15851646
ordered = list(nx.topological_sort(g.subgraph(expanded)))
15861647
result = [n for n in ordered if n in expanded]
1587-
# Change to:
1588-
except (nx.NetworkXError, nx.NetworkXUnfeasible):
1589-
# Cycle detected or graph issue, return unordered
1648+
except nx.NetworkXError:
15901649
result = list(expanded)
15911650

15921651
return result
@@ -3046,7 +3105,7 @@ def compress_columns(self, compression_spec=None, columns=None, suffix='_c', dro
30463105

30473106
# Standard state validation for non-selective modes
30483107
if current_state == CompressionState.COMPRESSED:
3049-
# Already compressed - skip (idempotent)
3108+
# Idempotent behavior: skip already compressed columns
30503109
continue
30513110
elif current_state == CompressionState.SCHEMA_ONLY:
30523111
# Valid transition: SCHEMA_ONLY → COMPRESSED
@@ -4358,7 +4417,7 @@ def export_schema(self):
43584417
dict
43594418
JSON-safe schema dictionary
43604419
"""
4361-
return self.export_schema_v2(include_compression=True)
4420+
return self.export_schema_v2(include_precision_stats=True)
43624421

43634422
def save_schema(self, path):
43644423
"""
@@ -4369,7 +4428,7 @@ def save_schema(self, path):
43694428
path : str
43704429
Path to save schema JSON file
43714430
"""
4372-
self.save_schema_v2(path, include_compression=True)
4431+
self.save_schema_v2(path, include_precision_stats=True)
43734432

43744433
@staticmethod
43754434
def load_schema(path):
@@ -4510,23 +4569,32 @@ def get_column_metadata(self, column):
45104569
# Return all fields except dtype and expr
45114570
return {k: v for k, v in col_info.items() if k not in ('dtype', 'expr', 'constant')}
45124571

4513-
def export_schema_v2(self, include_compression=True, include_subframes=True,
4572+
def export_schema_v2(self, include_precision_stats=False, include_subframes=True,
45144573
within_group_sort="schema"):
45154574
"""
45164575
Export schema as JSON-safe dictionary (v2 format).
45174576
45184577
Features:
45194578
- Uniform object format for all columns: {"dtype": "..."}
45204579
- No "expr": null for physical columns
4521-
- Groups preserved
4580+
- Groups preserved (simple named lists of column names)
45224581
- Column metadata preserved (unit, axisLabel, etc.)
45234582
- Recursive subframe schemas (nested)
4524-
- Compression optional
4583+
- Compression always included (definitions required for data interpretation)
4584+
4585+
Schema v2 canonical order:
4586+
1. __meta__ - version, creation info
4587+
2. columns - column definitions (physical + aliases)
4588+
3. groups - logical column groupings (if present)
4589+
4. compression - storage/compression configuration
4590+
5. subframes - hierarchical structure (related tables)
45254591
45264592
Parameters
45274593
----------
4528-
include_compression : bool, default=False
4529-
Whether to include compression section
4594+
include_precision_stats : bool, default=False
4595+
Whether to include precision statistics (RMSE, max_error, etc.) in
4596+
compression section. The compression definitions (expressions, dtypes,
4597+
state) are always included.
45304598
include_subframes : bool, default=True
45314599
Whether to include recursive subframe schemas
45324600
within_group_sort : str, default="schema"
@@ -4536,25 +4604,22 @@ def export_schema_v2(self, include_compression=True, include_subframes=True,
45364604
Returns
45374605
-------
45384606
dict
4539-
JSON-safe schema dictionary
4607+
JSON-safe schema dictionary with canonical key ordering
45404608
"""
4541-
result = {}
4609+
from collections import OrderedDict
4610+
result = OrderedDict()
45424611

4543-
# __meta__ section
4612+
# 1. __meta__ section (always first)
45444613
result['__meta__'] = {
45454614
'schema_version': SCHEMA_VERSION_V2,
45464615
'created_at': datetime.now(timezone.utc).isoformat(),
45474616
'schema_id': self._schema.get('__meta__', {}).get('schema_id')
45484617
}
45494618

4550-
# Groups section (if present)
4551-
groups = self._schema.get('groups', {})
4552-
if groups:
4553-
result['groups'] = groups
4554-
4555-
# Columns section
4556-
columns = {}
4619+
# 2. Columns section
4620+
columns = OrderedDict()
45574621
schema_columns = self._schema.get('columns', {})
4622+
groups = self._schema.get('groups', {})
45584623

45594624
# Physical columns from DataFrame first
45604625
for col in self.df.columns:
@@ -4570,63 +4635,96 @@ def export_schema_v2(self, include_compression=True, include_subframes=True,
45704635
columns = _order_columns_by_groups(columns, groups, within_group_sort)
45714636
result['columns'] = columns
45724637

4573-
# Subframes section
4574-
if include_subframes and hasattr(self, '_subframes'):
4575-
subframes = {}
4576-
for name, entry in self._subframes.items():
4577-
subframes[name] = _export_subframe_schema_v2(entry, include_compression)
4578-
if subframes:
4579-
result['subframes'] = subframes
4580-
elif self._schema.get('subframes'):
4581-
# Just include index info if no registry
4582-
result['subframes'] = {}
4583-
for name, info in self._schema.get('subframes', {}).items():
4584-
index_cols = info.get('index', [])
4585-
if isinstance(index_cols, str):
4586-
index_cols = [index_cols]
4587-
result['subframes'][name] = {'index': index_cols}
4588-
4589-
# Compression section (optional)
4590-
if include_compression and self._schema.get('compression'):
4591-
comp = copy.deepcopy(self._schema['compression'])
4592-
# Remove verbose precision stats from export
4593-
for name, info in comp.items():
4594-
if name != '__meta__' and 'precision' in info:
4595-
del info['precision']
4596-
# Convert dtypes to strings
4597-
for name, info in comp.items():
4638+
# 3. Groups section (if present) - simple named lists of column names
4639+
if groups:
4640+
result['groups'] = groups
4641+
4642+
# 4. Compression section - ALWAYS included when present (required for data interpretation)
4643+
if self._schema.get('compression'):
4644+
comp = OrderedDict()
4645+
for name, info in self._schema['compression'].items():
45984646
if name == '__meta__':
4647+
# Preserve compression metadata
4648+
comp['__meta__'] = copy.deepcopy(info)
45994649
continue
4650+
4651+
# Build compression entry with essential fields
4652+
entry = {}
4653+
4654+
# Required fields for data interpretation
4655+
if 'compressed_col' in info:
4656+
entry['compressed_col'] = info['compressed_col']
4657+
if 'compress_expr' in info:
4658+
entry['compress_expr'] = info['compress_expr']
4659+
if 'decompress_expr' in info:
4660+
entry['decompress_expr'] = info['decompress_expr']
4661+
4662+
# Convert dtypes to strings using helper
46004663
for dtype_field in ['compressed_dtype', 'decompressed_dtype']:
46014664
if dtype_field in info and info[dtype_field] is not None:
4602-
dtype = info[dtype_field]
4603-
if hasattr(dtype, 'name'):
4604-
info[dtype_field] = dtype.name
4605-
elif not isinstance(dtype, str):
4606-
info[dtype_field] = str(dtype)
4607-
# Remove non-serializable monitor functions
4665+
entry[dtype_field] = _dtype_to_str(info[dtype_field])
4666+
4667+
# State and flags
4668+
if 'state' in info:
4669+
entry['state'] = info['state']
4670+
if 'original_removed' in info:
4671+
entry['original_removed'] = info['original_removed']
4672+
4673+
# Optional: precision statistics (can be verbose)
4674+
if include_precision_stats and 'precision' in info:
4675+
entry['precision'] = copy.deepcopy(info['precision'])
4676+
4677+
# Optional: monitor info (without non-serializable func)
46084678
if 'monitor' in info and info['monitor']:
46094679
monitor = info['monitor']
46104680
if 'func' in monitor:
4611-
info['monitor'] = {k: v for k, v in monitor.items() if k != 'func'}
4612-
result['compression'] = comp
4681+
entry['monitor'] = {k: v for k, v in monitor.items() if k != 'func'}
4682+
else:
4683+
entry['monitor'] = copy.deepcopy(monitor)
4684+
4685+
comp[name] = entry
4686+
4687+
if comp:
4688+
result['compression'] = comp
4689+
4690+
# 5. Subframes section (last - hierarchical structure)
4691+
if include_subframes:
4692+
subframes = OrderedDict()
4693+
4694+
# Try subframe registry first
4695+
if hasattr(self, '_subframes'):
4696+
for name, entry in self._subframes.items():
4697+
subframes[name] = _export_subframe_schema_v2(entry, include_precision_stats)
4698+
4699+
# Fall back to schema if registry is empty
4700+
if not subframes and self._schema.get('subframes'):
4701+
for name, info in self._schema.get('subframes', {}).items():
4702+
index_cols = info.get('index', [])
4703+
if isinstance(index_cols, str):
4704+
index_cols = [index_cols]
4705+
subframes[name] = {'index': index_cols}
4706+
4707+
if subframes:
4708+
result['subframes'] = subframes
46134709

46144710
return result
46154711

4616-
def save_schema_v2(self, path, include_compression=True, include_subframes=True,
4712+
def save_schema_v2(self, path, include_precision_stats=False, include_subframes=True,
46174713
indent=2, max_line_length=100, within_group_sort="schema"):
46184714
"""
46194715
Save schema to JSON file (v2 format).
46204716
46214717
Short entries stay on one line, long entries are split.
46224718
Columns ordered by groups, then ungrouped columns.
4719+
Compression definitions are always saved (required for data interpretation).
46234720
46244721
Parameters
46254722
----------
46264723
path : str
46274724
Output file path
4628-
include_compression : bool, default=False
4629-
Whether to include compression section
4725+
include_precision_stats : bool, default=False
4726+
Whether to include precision statistics (RMSE, max_error, etc.) in
4727+
compression section. Compression definitions are always included.
46304728
include_subframes : bool, default=True
46314729
Whether to include recursive subframe schemas
46324730
indent : int, default=2
@@ -4638,7 +4736,7 @@ def save_schema_v2(self, path, include_compression=True, include_subframes=True,
46384736
"alphabetic" - sort alphabetically within each group
46394737
"""
46404738
schema = self.export_schema_v2(
4641-
include_compression=include_compression,
4739+
include_precision_stats=include_precision_stats,
46424740
include_subframes=include_subframes,
46434741
within_group_sort=within_group_sort
46444742
)
@@ -4672,6 +4770,12 @@ def load_schema_v2(path):
46724770
# Detect version
46734771
meta = schema.get('__meta__', {})
46744772
version = meta.get('schema_version', 1)
4773+
# Handle string versions like '2.0'
4774+
if isinstance(version, str):
4775+
try:
4776+
version = float(version)
4777+
except (ValueError, TypeError):
4778+
version = 1
46754779

46764780
if version >= 2:
46774781
# Already v2 format

0 commit comments

Comments
 (0)