diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 052457ca3..9d1570c9d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,6 +18,7 @@ repos: # hooks: # - id: black - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.15.11 rev: v0.15.12 hooks: - id: ruff diff --git a/CHANGELOG.md b/CHANGELOG.md index 295483e1e..9f68d7523 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # HDMF Changelog +## Unreleased + +### Breaking changes +- `HDF5IO` `expandable` argument is now a list of data type names instead of a boolean. The default is `("VectorData", "ElementIdentifiers")`, so only `DynamicTable` columns and id are expandable out of the box — previously every dataset with a matching spec shape was expanded. Datasets of types outside this list that previously were expandable by default will now default to fixed-shape on-disk layout; add the relevant type to `expandable` to restore prior behavior. Replace `expandable=True` with an explicit list (e.g. `["VectorData", "ElementIdentifiers", "MyType"]`) and `expandable=False` with `[]`; passing `True`/`False` now raises a `TypeError`. @bendichter @rly [#1439](https://github.com/hdmf-dev/hdmf/pull/1439) + +### Enhancements +- Set sensible default chunk sizes (~4 MB, in the recommended 2-16 MB range for cloud-hosted files) when `chunks=True`, replacing h5py's much smaller defaults. Added public `HDF5IO.compute_default_chunk_shape()` so users can inspect or override the chunk shape that would be used. @bendichter [#1440](https://github.com/hdmf-dev/hdmf/pull/1440) + ## HDMF 6.0.1 (May 5, 2026) ### Fixed diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 5c1fad7fb..3ce44d999 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -1501,6 +1501,9 @@ def compute_default_chunk_shape(data_shape, dtype, target_chunk_bytes=4 * 1024 * The shape of the dataset. dtype : numpy.dtype or type The data type, used to determine bytes per element. + target_chunk_bytes : int, optional + Target chunk size in bytes. Default is 4 MB. + neurodata_type : str, optional target_chunk_bytes : int Target chunk size in bytes. Default is 4 MB. neurodata_type : str @@ -1509,6 +1512,7 @@ def compute_default_chunk_shape(data_shape, dtype, target_chunk_bytes=4 * 1024 * Returns ------- + tuple or True tuple or bool The computed chunk shape, or ``True`` to fall back to h5py auto-chunking when a shape cannot be computed (unsupported dtype or zero-length trailing dimension). diff --git a/src/hdmf/validate/errors.py b/src/hdmf/validate/errors.py index c1b4360a1..5e60f02fd 100644 --- a/src/hdmf/validate/errors.py +++ b/src/hdmf/validate/errors.py @@ -12,7 +12,9 @@ "MissingDataType", "IllegalLinkError", "IncorrectDataType", - "IncorrectQuantityError" + "IncorrectQuantityError", + "ExtraFieldWarning", + "ValidationWarning" ] @@ -212,3 +214,12 @@ def __init__(self, **kwargs): reason = "incorrect data_type - expected '%s', got '%s'" % (expected, received) loc = getargs('location', kwargs) super().__init__(name, reason, location=loc) + + +class ValidationWarning(UserWarning): + """Base class for validation warnings.""" + pass + +class ExtraFieldWarning(ValidationWarning): + """Warning for fields found in data but not defined in any applicable spec.""" + pass diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index c7709e80b..5bf5fb80f 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -6,8 +6,10 @@ import numpy as np -from .errors import Error, DtypeError, MissingError, MissingDataType, ShapeError, IllegalLinkError, IncorrectDataType -from .errors import ExpectedArrayError, IncorrectQuantityError +from .errors import ( + Error, DtypeError, MissingError, MissingDataType, ShapeError, + IllegalLinkError, IncorrectDataType, ValidationWarning, + ExtraFieldWarning, ExpectedArrayError, IncorrectQuantityError) from ..build import GroupBuilder, DatasetBuilder, LinkBuilder, ReferenceBuilder from ..build.builders import BaseBuilder from ..spec import Spec, AttributeSpec, GroupSpec, DatasetSpec, RefSpec, LinkSpec @@ -436,6 +438,7 @@ def validate(self, **kwargs): if not check_shape(spec.shape, shape): if shape is None: ret.append(ExpectedArrayError(self.get_spec_loc(self.spec), self.spec.shape, str(value))) + return ret else: ret.append(ShapeError(self.get_spec_loc(spec), spec.shape, shape)) @@ -459,17 +462,49 @@ def validate(self, **kwargs): builder = getargs('builder', kwargs) attributes = builder.attributes ret = list() + + spec_attr_names = set(self.__attribute_validators.keys()) + spec_attr_names.add(self.spec.type_key()) + spec_attr_names.update(['namespace', 'help', 'colnames', 'object_id', 'id', 'data_type']) + for attr, validator in self.__attribute_validators.items(): attr_val = attributes.get(attr) - if attr_val is None: - if validator.spec.required: - ret.append(MissingError(self.get_spec_loc(validator.spec), - location=self.get_builder_loc(builder))) - else: + if attr_val is not None: errors = validator.validate(attr_val) for err in errors: err.location = self.get_builder_loc(builder) + ".%s" % validator.spec.name ret.extend(errors) + + elif validator.spec.required: + ret.append(MissingError( + self.get_spec_loc(validator.spec), + location=self.get_builder_loc(builder) + )) + + for builder_attr in attributes: + if builder_attr not in spec_attr_names: + dt = attributes.get(self.spec.type_key()) + found = False + if dt is not None: + try: + types_to_check = [dt] + list(self.vmap.namespace.get_hierarchy(dt)) + for candidate_dt in types_to_check: + v = self.vmap.get_validator(candidate_dt) + if hasattr(v, "_BaseStorageValidator__attribute_validators"): + dt_attrs = set(v._BaseStorageValidator__attribute_validators.keys()) + if builder_attr in dt_attrs: + found = True + break + except Exception: + pass + + if not found: + ret.append(ExtraFieldWarning( + self.get_spec_loc(self.spec), + f"Unexpected attribute '{builder_attr}' encountered in {self.spec.name}", + location=self.get_builder_loc(builder) + )) + return ret @@ -586,6 +621,7 @@ def validate(self, **kwargs): builder = getargs('builder', kwargs) errors = super().validate(builder) errors.extend(self.__validate_children(builder)) + return self._remove_duplicates(errors) def __validate_children(self, parent_builder): @@ -608,6 +644,89 @@ def __validate_children(self, parent_builder): parent_builder.links.values()) matcher.assign_to_specs(builder_children) + seen = set() + effective_dt = self.spec.data_type or parent_builder.attributes.get(self.spec.type_key()) + if effective_dt is not None: + expected_child_types = set() + for child_spec in chain(self.spec.datasets, self.spec.groups, self.spec.links): + dt = _resolve_data_type(child_spec) + if dt is not None: + expected_child_types.add(dt) + try: + for ancestor_dt in self.vmap.namespace.get_hierarchy(effective_dt): + ancestor_validator = self.vmap.get_validator(ancestor_dt) + if hasattr(ancestor_validator, 'spec'): + for child_spec in chain(ancestor_validator.spec.datasets, + ancestor_validator.spec.groups, + ancestor_validator.spec.links): + dt = _resolve_data_type(child_spec) + if dt is not None: + expected_child_types.add(dt) + except Exception: + pass + + builder_effective_dt = parent_builder.attributes.get(self.spec.type_key()) + if builder_effective_dt is not None and builder_effective_dt != effective_dt: + try: + for ancestor_dt in self.vmap.namespace.get_hierarchy(builder_effective_dt): + ancestor_validator = self.vmap.get_validator(ancestor_dt) + if hasattr(ancestor_validator, 'spec'): + for child_spec in chain(ancestor_validator.spec.datasets, + ancestor_validator.spec.groups, + ancestor_validator.spec.links): + dt = _resolve_data_type(child_spec) + if dt is not None: + expected_child_types.add(dt) + except Exception: + pass + + spec_named_children = set() + for child_spec in chain(self.spec.datasets, self.spec.groups, self.spec.links): + if hasattr(child_spec, 'name') and child_spec.name is not None: + spec_named_children.add(child_spec.name) + + COLUMN_TYPES = {'VectorData', 'VectorIndex', 'ElementIdentifiers', 'DynamicTableRegion'} + + for extra_builder in matcher.unmatched_builders: + if isinstance(extra_builder, LinkBuilder): + extra_builder = extra_builder.builder + builder_dt = extra_builder.attributes.get(self.spec.type_key()) + if builder_dt is None: + if expected_child_types & COLUMN_TYPES: + continue + try: + parent_hierarchy = self.vmap.namespace.get_hierarchy(effective_dt) + if 'DynamicTable' in parent_hierarchy: + continue + except Exception: + pass + has_untyped_dataset_children = any( + _resolve_data_type(cs) is None + for cs in chain(self.spec.datasets, self.spec.groups, self.spec.links) + ) + if has_untyped_dataset_children: + continue + else: + try: + hierarchy = self.vmap.namespace.get_hierarchy(builder_dt) + if any(t in expected_child_types for t in hierarchy): + continue + except Exception: + pass + if extra_builder.name in spec_named_children: + continue + if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', + 'meaning', 'value', 'dtr', 'target'): + continue + if extra_builder.name in seen: + continue + seen.add(extra_builder.name) + yield ValidationWarning( + self.get_spec_loc(self.spec), + f"Unexpected element '{extra_builder.name}' encountered in {self.spec.name}", + location=self.get_builder_loc(parent_builder) + ) + for child_spec, matched_builders in matcher.spec_matches: yield from self.__validate_presence_and_quantity(child_spec, len(matched_builders), parent_builder) for child_builder in matched_builders: @@ -654,14 +773,52 @@ def __construct_incorrect_quantity_error(self, child_spec, parent_builder, n_bui def __validate_child_builder(self, child_spec, child_builder, parent_builder): """Validate a child builder against a child spec considering links""" + if isinstance(child_builder, LinkBuilder): if self.__cannot_be_link(child_spec): yield self.__construct_illegal_link_error(child_spec, parent_builder) return # do not validate illegally linked objects child_builder = child_builder.builder + child_builder_data_type = child_builder.attributes.get(self.spec.type_key()) - for child_validator in self.__get_child_validators(child_spec, child_builder_data_type): - yield from child_validator.validate(child_builder) + validators = list(self.__get_child_validators(child_spec, child_builder_data_type)) + + warnings_per_validator = [] + non_warning_errors = [] + + for v in validators: + current_warnings = [] + for result in v.validate(builder=child_builder): + if isinstance(result, ExtraFieldWarning): + current_warnings.append(result) + else: + non_warning_errors.append(result) + warnings_per_validator.append(current_warnings) + + yield from non_warning_errors + + if len(validators) == 1: + for w in warnings_per_validator[0]: + yield w + return + + def extract_attr(w): + match = re.search(r"'(.+?)'", w.reason) + return match.group(1) if match else None + + attr_sets = [] + for warning_list in warnings_per_validator: + attr_sets.append(set(extract_attr(w) for w in warning_list)) + + common_attrs = set.intersection(*attr_sets) if len(attr_sets) > 1 else (attr_sets[0] if attr_sets else set()) + + yielded_attrs = set() + for warning_list in warnings_per_validator: + for w in warning_list: + attr_name = extract_attr(w) + if attr_name in common_attrs and attr_name not in yielded_attrs: + yield w + yielded_attrs.add(attr_name) def __construct_illegal_link_error(self, child_spec, parent_builder): name_of_erroneous = self.get_spec_loc(child_spec) diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index 0717ef401..f7e310188 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -12,10 +12,10 @@ from hdmf.testing import TestCase, remove_test_file from hdmf.validate import ValidatorMap from hdmf.validate.errors import (DtypeError, MissingError, ExpectedArrayError, MissingDataType, - IncorrectQuantityError, IllegalLinkError, ShapeError, IncorrectDataType) + IncorrectQuantityError, IllegalLinkError, ShapeError, ValidationWarning, ExtraFieldWarning, Error, IncorrectDataType) from hdmf.backends.hdf5 import HDF5IO from hdmf.utils import ZARR_INSTALLED, StrDataset -from hdmf.validate.errors import Error +from collections import defaultdict, OrderedDict, Counter CORE_NAMESPACE = 'test_core' @@ -1652,6 +1652,7 @@ def test_scalar_instead_of_array(self): ] ) result = self.vmap.validate(builder) + self.assertEqual(len(result), 1) # Should be ExpectedArrayError, not ShapeError self.assertIsInstance(result[0], ExpectedArrayError) @@ -2043,3 +2044,89 @@ def test_isodatetime_no_time_component_fails(self): # This confirms it fails because it lacks the 'T' and timezone self.assertEqual(len(result), 1) self.assertIsInstance(result[0], Error) + + +class TestExtraFieldIntersection(ValidatorTestBase): + """Test that extra field warnings are only raised if unmatched by ALL applicable specs.""" + + def getSpecs(self): + base_spec = DatasetSpec( + doc='base type', + data_type_def='BaseType', + attributes=[] + ) + ext_spec = DatasetSpec( + doc='extended type', + data_type_def='ExtType', + data_type_inc='BaseType', + attributes=[ + AttributeSpec(name='resolution', dtype='float', doc='extra attribute') + ] + ) + group_dataset_spec = DatasetSpec( + name='data_column', + doc='a dataset of base type', + data_type_inc='BaseType', + attributes=[ + AttributeSpec(name='resolution', dtype='float', doc='extra attribute') + ] + ) + group_spec = GroupSpec( + doc='A group containing the base type', + data_type_def='GroupWithBase', + datasets=[group_dataset_spec] + ) + return (base_spec, ext_spec, group_spec) + + def test_attribute_intersection_no_warning(self): + """Test that 'resolution' triggers NO warning because ext_spec knows it.""" + dataset = DatasetBuilder( + name='data_column', + data=[1.0, 2.0], + attributes={'data_type': 'BaseType', 'resolution': 0.1} + ) + builder = GroupBuilder( + name='my_group', + attributes={'data_type': 'GroupWithBase'}, + datasets=[dataset] + ) + + result = self.vmap.validate(builder) + + extra_warnings = [r for r in result if isinstance(r, ExtraFieldWarning)] + self.assertEqual(len(extra_warnings), 0, f"False positive found: {extra_warnings}") + + def test_truly_extra_attribute_warning(self): + """Test that a completely unknown attribute DOES trigger an ExtraFieldWarning.""" + dataset = DatasetBuilder( + name='data_column', + data=[1.0], + attributes={'data_type': 'BaseType', 'completely_unknown': 'surprise'} + ) + builder = GroupBuilder( + name='my_group', + attributes={'data_type': 'GroupWithBase'}, + datasets=[dataset] + ) + + result = self.vmap.validate(builder) + + extra_warnings = [r for r in result if isinstance(r, ExtraFieldWarning)] + self.assertEqual(len(extra_warnings), 1) + self.assertIn("completely_unknown", str(extra_warnings)) + + def test_unexpected_element_warning(self): + """Test that an unknown dataset triggers a ValidationWarning.""" + extra_dataset = DatasetBuilder(name='extra_ds', data=[1]) + builder = GroupBuilder( + name='my_group', + attributes={'data_type': 'GroupWithBase'}, + datasets=[extra_dataset] + ) + + result = self.vmap.validate(builder) + + warnings = [r for r in result if isinstance(r, ValidationWarning) and "Unexpected element" in str(r)] + self.assertEqual(len(warnings), 1) + + self.assertIn("extra_ds", str(warnings))