Skip to content

Commit 2d1c28f

Browse files
committed
[feat] add xarray coords support, auto-conversion in binType/minDims/maxDims
1 parent 6b01e98 commit 2d1c28f

6 files changed

Lines changed: 490 additions & 137 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
- Copyright: (C) Qianqian Fang (2019-2026) <q.fang at neu.edu>
66
- License: Apache License, Version 2.0
7-
- Version: 0.9.0
7+
- Version: 0.9.1
88
- URL: https://github.com/NeuroJSON/pyjdata
99
- Acknowledgement: This project is supported by US National Institute of Health (NIH)
1010
grant [U24-NS124027](https://reporter.nih.gov/project-details/10308329)

jdata/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@
123123
from .jschema import jsonschema
124124
from .neurojson import neuroj, neurojgui
125125

126-
__version__ = "0.9.0"
126+
__version__ = "0.9.1"
127127
__all__ = [
128128
"loadjson",
129129
"savejson",

jdata/jdict.py

Lines changed: 91 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def __getattr__(self, name):
149149
# Check for dimension-based indexing
150150
dims = _get_attr_value(attr, currentpath, "dims")
151151
if dims is not None and isinstance(dims, (list, tuple)) and name in dims:
152-
return _DimAccessor(self, name, dims.index(name))
152+
return _DimAccessor(self, name)
153153

154154
if data is None:
155155
val = None
@@ -583,36 +583,61 @@ def __str__(self):
583583

584584

585585
class _DimAccessor:
586-
"""Helper class for dimension-based indexing like jd.data.x(1:10)"""
586+
"""Helper class for dimension-based indexing like jd.data.x('label')"""
587587

588-
def __init__(self, parent, dimname, dimpos):
588+
__slots__ = ("_parent", "_dimname")
589+
590+
def __init__(self, parent, dimname):
589591
self._parent = parent
590592
self._dimname = dimname
591-
self._dimpos = dimpos
592-
593-
def __call__(self, indices):
594-
data = self._parent._data
595-
attr = self._parent._attr
596-
schema = self._parent._schema
597-
currentpath = self._parent._currentpath
598-
root = self._parent._root
599-
dims = _get_attr_value(attr, currentpath, "dims")
600593

601-
if isinstance(data, np.ndarray):
602-
nddata = len(dims) if dims else data.ndim
603-
idx = [slice(None)] * nddata
604-
idx[self._dimpos] = indices
605-
result = data[tuple(idx)]
594+
def __call__(self, sel):
595+
p = self._parent
596+
dims = _get_attr_value(p._attr, p._currentpath, "dims")
597+
data = p._data
598+
if not isinstance(data, np.ndarray) or not dims:
599+
return None
606600

607-
newobj = jdict.__new__(jdict)
608-
object.__setattr__(newobj, "_data", result)
609-
object.__setattr__(newobj, "_attr", attr)
610-
object.__setattr__(newobj, "_schema", schema)
611-
object.__setattr__(newobj, "_currentpath", currentpath)
612-
object.__setattr__(newobj, "_root", root)
613-
object.__setattr__(newobj, "_flags", {})
614-
return newobj
615-
return None
601+
# Get current position of this dim
602+
dimpos = dims.index(self._dimname)
603+
604+
# Build index tuple
605+
idx = [slice(None)] * data.ndim
606+
coords = _get_attr_value(p._attr, p._currentpath, "coords")
607+
idx[dimpos] = (
608+
_coordlookup(coords.get(self._dimname), sel, self._dimname)
609+
if coords
610+
else sel
611+
)
612+
613+
# Slice and build new jdict
614+
result = data[tuple(idx)]
615+
is_scalar = isinstance(idx[dimpos], (int, np.integer))
616+
617+
# Update dims/coords for cascade (remove dim if scalar selection)
618+
new_attr = {"$": {}}
619+
new_attr["$"]["dims"] = [
620+
d for d in dims if not (is_scalar and d == self._dimname)
621+
]
622+
if coords:
623+
new_attr["$"]["coords"] = {
624+
k: v
625+
for k, v in coords.items()
626+
if not (is_scalar and k == self._dimname)
627+
}
628+
629+
newobj = jdict.__new__(jdict)
630+
for attr, val in [
631+
("_data", result),
632+
("_attr", new_attr),
633+
("_schema", p._schema),
634+
("_currentpath", "$"),
635+
("_root", None),
636+
("_flags", {}),
637+
]:
638+
object.__setattr__(newobj, attr, val)
639+
object.__setattr__(newobj, "_root", newobj)
640+
return newobj
616641

617642

618643
def _get_attr_value(attr, path, name):
@@ -621,6 +646,46 @@ def _get_attr_value(attr, path, name):
621646
return None
622647

623648

649+
def _coordlookup(coords, sel, dimname):
650+
"""Convert coordinate labels to indices."""
651+
if coords is None:
652+
return sel
653+
654+
coords_arr = np.asarray(coords)
655+
is_numeric_coords = np.issubdtype(coords_arr.dtype, np.number)
656+
657+
# Numeric value(s) on numeric coords -> lookup
658+
if is_numeric_coords and isinstance(
659+
sel, (int, float, np.number, list, tuple, np.ndarray)
660+
):
661+
if isinstance(sel, (int, float, np.number)):
662+
idx = np.where(coords_arr == sel)[0]
663+
if len(idx) == 0:
664+
raise ValueError(f'Coord {sel} not found in "{dimname}"')
665+
return int(idx[0])
666+
elif all(isinstance(s, (int, float, np.number)) for s in sel):
667+
return [int(np.where(coords_arr == s)[0][0]) for s in sel]
668+
669+
# Int on non-numeric coords -> direct index
670+
if isinstance(sel, (int, np.integer)) and not is_numeric_coords:
671+
return sel
672+
673+
# Slice dict -> slice object
674+
if isinstance(sel, dict) and "start" in sel:
675+
coords_list = coords_arr.tolist()
676+
start = coords_list.index(sel["start"]) if sel.get("start") else 0
677+
stop = (
678+
coords_list.index(sel["stop"]) + 1 if sel.get("stop") else len(coords_list)
679+
)
680+
return slice(start, stop)
681+
682+
# String or list of strings -> index lookup
683+
coords_list = coords_arr.tolist()
684+
if isinstance(sel, (list, tuple)):
685+
return [coords_list.index(s) for s in sel]
686+
return coords_list.index(sel)
687+
688+
624689
def _esckey(key):
625690
"""Escape dots in key for JSONPath - Python compatible version."""
626691
if "." not in key:

jdata/jschema.py

Lines changed: 77 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,23 @@
2424
import numpy as np
2525
from typing import Any, Dict, List, Optional, Tuple, Union
2626

27+
_BINTYPES = {
28+
"uint8": np.uint8,
29+
"int8": np.int8,
30+
"uint16": np.uint16,
31+
"int16": np.int16,
32+
"uint32": np.uint32,
33+
"int32": np.int32,
34+
"uint64": np.uint64,
35+
"int64": np.int64,
36+
"float32": np.float32,
37+
"single": np.float32,
38+
"float64": np.float64,
39+
"double": np.float64,
40+
"bool": np.bool_,
41+
"logical": np.bool_,
42+
}
43+
2744

2845
def jsonschema(
2946
data: Any, schema: Any = None, **kwargs
@@ -142,7 +159,7 @@ def _validatedata(
142159
errors.extend(errmsg)
143160

144161
# numpy array validation
145-
if isinstance(data, np.ndarray):
162+
if isinstance(data, np.ndarray) or "binType" in schema:
146163
isvalid, errmsg = _validatebinary(data, schema, path)
147164
if not isvalid:
148165
valid = False
@@ -327,92 +344,48 @@ def _validatenumeric(
327344

328345

329346
def _validatebinary(data, schema: dict, path: str) -> Tuple[bool, List[str]]:
330-
valid = True
331-
errors = []
347+
"""Validate binary/array data against binType and dims."""
348+
valid, errors = True, []
332349

333350
if "binType" in schema:
334-
bintype = schema["binType"]
335-
typemap = {
336-
"uint8": np.uint8,
337-
"int8": np.int8,
338-
"uint16": np.uint16,
339-
"int16": np.int16,
340-
"uint32": np.uint32,
341-
"int32": np.int32,
342-
"uint64": np.uint64,
343-
"int64": np.int64,
344-
"float32": np.float32,
345-
"single": np.float32,
346-
"float64": np.float64,
347-
"double": np.float64,
348-
"bool": np.bool_,
349-
"logical": np.bool_,
350-
}
351-
if bintype not in typemap:
352-
valid = False
353-
errors.append(f'{path}: invalid binType "{bintype}"')
354-
elif data.dtype != typemap[bintype]:
355-
valid = False
356-
errors.append(f"{path}: expected {bintype}, got {data.dtype}")
357-
358-
actualsize = list(data.shape)
359-
360-
for dimtype in ["minDims", "maxDims"]:
361-
if dimtype in schema:
362-
dims = schema[dimtype]
363-
if isinstance(dims, (int, float)):
364-
dims = [int(dims)]
365-
elif isinstance(dims, (list, tuple)):
366-
dims = [int(d) for d in dims]
367-
368-
ismin = dimtype == "minDims"
369-
370-
if len(dims) == 1:
371-
# Vector check
372-
isvector = data.ndim == 1 or (data.ndim == 2 and 1 in data.shape)
373-
if not isvector and data.ndim > 1:
374-
errors.append(f"{path}: expected 1D array for {dimtype}")
375-
valid = False
376-
else:
377-
actual_len = max(data.shape) if data.ndim > 0 else 0
378-
if ismin and actual_len < dims[0]:
379-
valid = False
380-
errors.append(
381-
f"{path}: length {actual_len} < {dimtype} {dims[0]}"
382-
)
383-
elif not ismin and actual_len > dims[0]:
384-
valid = False
385-
errors.append(
386-
f"{path}: length {actual_len} > {dimtype} {dims[0]}"
387-
)
388-
else:
389-
if ismin:
390-
actualsize_ext = actualsize + [1] * max(
391-
0, len(dims) - len(actualsize)
392-
)
393-
checklen = len(dims)
394-
else:
395-
actualsize_ext = actualsize
396-
checklen = min(len(actualsize), len(dims))
397-
398-
for i in range(checklen):
399-
if ismin and actualsize_ext[i] < dims[i]:
400-
valid = False
401-
errors.append(
402-
f"{path}: dim {i} is {actualsize_ext[i]}, violates {dimtype} {dims[i]}"
403-
)
404-
elif not ismin and i < len(actualsize) and actualsize[i] > dims[i]:
405-
valid = False
406-
errors.append(
407-
f"{path}: dim {i} is {actualsize[i]}, violates {dimtype} {dims[i]}"
408-
)
409-
410-
if not ismin and len(actualsize) > len(dims):
411-
if any(s > 1 for s in actualsize[len(dims) :]):
412-
valid = False
413-
errors.append(
414-
f"{path}: has {len(actualsize)} dimensions, {dimtype} only specifies {len(dims)}"
415-
)
351+
dtype = _BINTYPES.get(schema["binType"])
352+
if dtype is None:
353+
return False, [f'{path}: invalid binType "{schema["binType"]}"']
354+
if not isinstance(data, np.ndarray):
355+
return False, [f"{path}: expected numpy array, got {type(data).__name__}"]
356+
if data.dtype != dtype:
357+
return False, [f"{path}: expected {schema['binType']}, got {data.dtype}"]
358+
359+
if not isinstance(data, np.ndarray):
360+
return valid, errors
361+
362+
# Validate minDims/maxDims
363+
for dimtype in ("minDims", "maxDims"):
364+
if dimtype not in schema:
365+
continue
366+
dims = schema[dimtype]
367+
dims = [int(dims)] if isinstance(dims, (int, float)) else [int(d) for d in dims]
368+
ismin = dimtype == "minDims"
369+
370+
if len(dims) == 1: # Vector check
371+
actual = (
372+
max(data.shape)
373+
if data.ndim <= 2 and (data.ndim == 1 or 1 in data.shape)
374+
else -1
375+
)
376+
if actual < 0:
377+
valid, errors = False, errors + [f"{path}: expected 1D array"]
378+
elif (ismin and actual < dims[0]) or (not ismin and actual > dims[0]):
379+
valid, errors = False, errors + [
380+
f"{path}: length {actual} violates {dimtype} {dims[0]}"
381+
]
382+
else: # ND check
383+
for i, d in enumerate(dims):
384+
actual = data.shape[i] if i < data.ndim else 1
385+
if (ismin and actual < d) or (not ismin and actual > d):
386+
valid, errors = False, errors + [
387+
f"{path}: dim {i} is {actual}, violates {dimtype} {d}"
388+
]
416389

417390
return valid, errors
418391

@@ -643,30 +616,13 @@ def _generatedata(schema: dict, opts: dict) -> Any:
643616
schematype = "array"
644617

645618
if "binType" in schema:
646-
bintype = schema["binType"]
619+
dtype = _BINTYPES.get(schema["binType"], np.float64)
647620
dims = schema.get("minDims", 1)
648-
if isinstance(dims, (int, float)):
649-
dims = (int(dims),)
650-
elif isinstance(dims, list):
651-
dims = tuple(int(d) for d in dims)
652-
653-
typemap = {
654-
"uint8": np.uint8,
655-
"int8": np.int8,
656-
"uint16": np.uint16,
657-
"int16": np.int16,
658-
"uint32": np.uint32,
659-
"int32": np.int32,
660-
"uint64": np.uint64,
661-
"int64": np.int64,
662-
"float32": np.float32,
663-
"single": np.float32,
664-
"float64": np.float64,
665-
"double": np.float64,
666-
"bool": np.bool_,
667-
"logical": np.bool_,
668-
}
669-
dtype = typemap.get(bintype, np.float64)
621+
dims = (
622+
(int(dims),)
623+
if isinstance(dims, (int, float))
624+
else tuple(int(d) for d in dims)
625+
)
670626
return np.zeros(dims, dtype=dtype)
671627

672628
if schematype == "null":
@@ -873,3 +829,16 @@ def _getsubschema(schema: dict, jsonpath: str) -> Optional[dict]:
873829
return None
874830

875831
return subschema
832+
833+
834+
def coerce(data: Any, schema: dict) -> Any:
835+
"""Coerce data to match schema's binType. For use before assignment."""
836+
if not isinstance(schema, dict) or "binType" not in schema:
837+
return data
838+
dtype = _BINTYPES.get(schema["binType"])
839+
if dtype is None or (isinstance(data, np.ndarray) and data.dtype == dtype):
840+
return data
841+
try:
842+
return np.asarray(data, dtype=dtype)
843+
except (ValueError, TypeError):
844+
return data

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
setup(
77
name="jdata",
88
packages=["jdata"],
9-
version="0.9.0",
9+
version="0.9.1",
1010
license="Apache license 2.0",
1111
description="JSON/binary JSON formats for exchanging Python and Numpy data",
1212
long_description=readme,

0 commit comments

Comments (0)