From 84df54ff96eccce3bd731b3f59e68b74fb30458e Mon Sep 17 00:00:00 2001 From: Sejal Date: Sat, 18 Apr 2026 17:44:37 +0000 Subject: [PATCH 01/19] WIP: validation warning handling (2 failing tests remaining) --- src/hdmf/validate/errors.py | 15 +++- src/hdmf/validate/validator.py | 97 +++++++++++++++++++-- tests/unit/validator_tests/test_validate.py | 90 ++++++++++++++++++- 3 files changed, 191 insertions(+), 11 deletions(-) diff --git a/src/hdmf/validate/errors.py b/src/hdmf/validate/errors.py index c1b4360a1..eae51ee72 100644 --- a/src/hdmf/validate/errors.py +++ b/src/hdmf/validate/errors.py @@ -12,7 +12,9 @@ "MissingDataType", "IllegalLinkError", "IncorrectDataType", - "IncorrectQuantityError" + "IncorrectQuantityError", + "ExtraFieldWarning", + "ValidationWarning" ] @@ -212,3 +214,14 @@ def __init__(self, **kwargs): reason = "incorrect data_type - expected '%s', got '%s'" % (expected, received) loc = getargs('location', kwargs) super().__init__(name, reason, location=loc) + + +class ValidationWarning(Error): + """Base class for validation warnings.""" + @property + def message(self): + return self.reason + +class ExtraFieldWarning(ValidationWarning): + """Warning for fields found in data but not defined in any applicable spec.""" + pass \ No newline at end of file diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index b57e30587..38169828f 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -6,7 +6,7 @@ import numpy as np -from .errors import Error, DtypeError, MissingError, MissingDataType, ShapeError, IllegalLinkError, IncorrectDataType +from .errors import Error, DtypeError, MissingError, MissingDataType, ShapeError, IllegalLinkError, IncorrectDataType, ValidationWarning, ExtraFieldWarning from .errors import ExpectedArrayError, IncorrectQuantityError from ..build import GroupBuilder, DatasetBuilder, LinkBuilder, ReferenceBuilder from ..build.builders import BaseBuilder @@ -459,17 +459,41 @@ def validate(self, **kwargs): builder = getargs('builder', kwargs) attributes = builder.attributes ret = list() + + spec_attr_names = set(self.__attribute_validators.keys()) + spec_attr_names.add(self.spec.type_key()) + spec_attr_names.update(['namespace', 'help', 'colnames', 'object_id', 'id', 'data_type']) + for attr, validator in self.__attribute_validators.items(): attr_val = attributes.get(attr) - if attr_val is None: - if validator.spec.required: - ret.append(MissingError(self.get_spec_loc(validator.spec), - location=self.get_builder_loc(builder))) - else: + if attr_val is not None: errors = validator.validate(attr_val) for err in errors: err.location = self.get_builder_loc(builder) + ".%s" % validator.spec.name ret.extend(errors) + + elif validator.spec.required: + ret.append(MissingError( + self.get_spec_loc(validator.spec), + location=self.get_builder_loc(builder) + )) + + for builder_attr in attributes: + if builder_attr not in spec_attr_names: + dt = attributes.get(self.spec.type_key()) + if dt is not None: + dt_validator = self.vmap.get_validator(dt) + if hasattr(dt_validator, "_BaseStorageValidator__attribute_validators"): + dt_attrs = set(dt_validator._BaseStorageValidator__attribute_validators.keys()) + if builder_attr in dt_attrs: + continue + + ret.append(ExtraFieldWarning( + self.get_spec_loc(self.spec), + f"Unexpected attribute '{builder_attr}' encountered in {self.spec.name}", + location=self.get_builder_loc(builder) + )) + return ret @@ -545,6 +569,7 @@ def validate(self, **kwargs): builder = getargs('builder', kwargs) errors = super().validate(builder) errors.extend(self.__validate_children(builder)) + return self._remove_duplicates(errors) def __validate_children(self, parent_builder): @@ -566,7 +591,23 @@ def __validate_children(self, parent_builder): parent_builder.groups.values(), parent_builder.links.values()) matcher.assign_to_specs(builder_children) + errors = [] + + extra_elements = {b.name for b in matcher.unmatched_builders} + + if self.spec.data_type is not None: + for extra_builder in matcher.unmatched_builders: + if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): + continue + if extra_builder.name in extra_elements: + continue + yield ValidationWarning( + self.get_spec_loc(self.spec), + f"Unexpected element '{extra_builder.name}' encountered in {self.spec.name}", + location=self.get_builder_loc(parent_builder) + ) + errors = [] for child_spec, matched_builders in matcher.spec_matches: yield from self.__validate_presence_and_quantity(child_spec, len(matched_builders), parent_builder) for child_builder in matched_builders: @@ -613,14 +654,54 @@ def __construct_incorrect_quantity_error(self, child_spec, parent_builder, n_bui def __validate_child_builder(self, child_spec, child_builder, parent_builder): """Validate a child builder against a child spec considering links""" + all_errors = [] + extra_attrib_sets = [] + if isinstance(child_builder, LinkBuilder): if self.__cannot_be_link(child_spec): yield self.__construct_illegal_link_error(child_spec, parent_builder) return # do not validate illegally linked objects child_builder = child_builder.builder + child_builder_data_type = child_builder.attributes.get(self.spec.type_key()) - for child_validator in self.__get_child_validators(child_spec, child_builder_data_type): - yield from child_validator.validate(child_builder) + validators = list(self.__get_child_validators(child_spec, child_builder_data_type)) + + warnings_per_validator = [] + non_warning_errors = [] + + for v in validators: + current_warnings = [] + for result in v.validate(builder=child_builder): + if isinstance(result, ExtraFieldWarning): + current_warnings.append(result) + else: + non_warning_errors.append(result) + warnings_per_validator.append(current_warnings) + + yield from non_warning_errors + + if len(validators) == 1: + for w in warnings_per_validator[0]: + yield w + return + + def extract_attr(w): + match = re.search(r"'(.+?)'", w.reason) + return match.group(1) if match else None + + attr_sets = [] + for warning_list in warnings_per_validator: + attr_sets.append(set(extract_attr(w) for w in warning_list)) + + common_attrs = set.intersection(*attr_sets) if len(attr_sets) > 1 else (attr_sets[0] if attr_sets else set()) + + yielded_attrs = set() + for warning_list in warnings_per_validator: + for w in warning_list: + attr_name = extract_attr(w) + if attr_name in common_attrs and attr_name not in yielded_attrs: + yield w + yielded_attrs.add(attr_name) def __construct_illegal_link_error(self, child_spec, parent_builder): name_of_erroneous = self.get_spec_loc(child_spec) diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index 8cdfb953e..0a38d65b4 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -12,10 +12,10 @@ from hdmf.testing import TestCase, remove_test_file from hdmf.validate import ValidatorMap from hdmf.validate.errors import (DtypeError, MissingError, ExpectedArrayError, MissingDataType, - IncorrectQuantityError, IllegalLinkError, ShapeError) + IncorrectQuantityError, IllegalLinkError, ShapeError, ValidationWarning, ExtraFieldWarning, Error) from hdmf.backends.hdf5 import HDF5IO from hdmf.utils import ZARR_INSTALLED, StrDataset -from hdmf.validate.errors import Error +from collections import defaultdict, OrderedDict, Counter CORE_NAMESPACE = 'test_core' @@ -1907,3 +1907,89 @@ def test_isodatetime_no_time_component_fails(self): # This confirms it fails because it lacks the 'T' and timezone self.assertEqual(len(result), 1) self.assertIsInstance(result[0], Error) + + +class TestExtraFieldIntersection(ValidatorTestBase): + """Test that extra field warnings are only raised if unmatched by ALL applicable specs.""" + + def getSpecs(self): + base_spec = DatasetSpec( + doc='base type', + data_type_def='BaseType', + attributes=[] + ) + ext_spec = DatasetSpec( + doc='extended type', + data_type_def='ExtType', + data_type_inc='BaseType', + attributes=[ + AttributeSpec(name='resolution', dtype='float', doc='extra attribute') + ] + ) + return ( + base_spec, + ext_spec, + GroupSpec( + doc='A group containing the base type', + data_type_def='GroupWithBase', + datasets=[ + DatasetSpec( + name='data_column', + doc='a dataset of base type', + data_type_inc='BaseType' + ) + ] + ), + ) + + def test_attribute_intersection_no_warning(self): + """Test that 'resolution' triggers NO warning because ext_spec knows it.""" + dataset = DatasetBuilder( + name='data_column', + data=[1.0, 2.0], + attributes={'data_type': 'BaseType', 'resolution': 0.1} + ) + builder = GroupBuilder( + name='my_group', + attributes={'data_type': 'GroupWithBase'}, + datasets=[dataset] + ) + + result = self.vmap.validate(builder) + + extra_warnings = [r for r in result if isinstance(r, ExtraFieldWarning)] + self.assertEqual(len(extra_warnings), 0, f"False positive found: {extra_warnings}") + + def test_truly_extra_attribute_warning(self): + """Test that a completely unknown attribute DOES trigger an ExtraFieldWarning.""" + dataset = DatasetBuilder( + name='data_column', + data=[1.0], + attributes={'data_type': 'BaseType', 'completely_unknown': 'surprise'} + ) + builder = GroupBuilder( + name='my_group', + attributes={'data_type': 'GroupWithBase'}, + datasets=[dataset] + ) + + result = self.vmap.validate(builder) + + extra_warnings = [r for r in result if isinstance(r, ExtraFieldWarning)] + self.assertEqual(len(extra_warnings), 1) + self.assertIn("completely_unknown", str(extra_warnings)) + + def test_unexpected_element_warning(self): + """Test that an unknown dataset triggers a ValidationWarning.""" + extra_dataset = DatasetBuilder(name='extra_ds', data=[1]) + builder = GroupBuilder( + name='my_group', + attributes={'data_type': 'GroupWithBase'}, + datasets=[extra_dataset] + ) + + result = self.vmap.validate(builder) + + warnings = [r for r in result if isinstance(r, ValidationWarning) and "Unexpected element" in str(r)] + self.assertEqual(len(warnings), 1) + self.assertIn("extra_ds", str(warnings)) \ No newline at end of file From 6f1ce0b4b7aac43295ff82d31c10bb7e08e2d533 Mon Sep 17 00:00:00 2001 From: Sejal Date: Tue, 31 Mar 2026 05:10:50 +0530 Subject: [PATCH 02/19] Add validation for dataset reference target types (#632) (#1429) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly <310197+rly@users.noreply.github.com> --- CHANGELOG.md | 9 ++ src/hdmf/validate/validator.py | 55 +++++++- tests/unit/validator_tests/test_validate.py | 139 +++++++++++++++++++- 3 files changed, 195 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa81fde1d..86be68bec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,17 @@ # HDMF Changelog +## Unreleased + +### Fixed +- Added missing validation for dataset reference target types to ensure correct `RefSpec.target_type` matching. @sejalpunwatkar [#1429](https://github.com/hdmf-dev/hdmf/pull/1429) + + ## HDMF 5.1.0 (March 24, 2026) ### Enhancements - Made `HERDManager` an abstract interface (ABC) with an abstract `external_resources` property. Subclasses must now declare `external_resources` in their `__fields__` to satisfy the interface. Updated `__gather_fields` to allow auto-generated properties to override inherited abstract properties. Added `link_resources` and `get_external_resources` methods to `HERDManager` for managing a linked HERD separate from the primary one. @rly [#1431](https://github.com/hdmf-dev/hdmf/pull/1431) + ## HDMF 5.0.1 (March 16, 2026) ### Enhancements @@ -18,6 +25,7 @@ - Fixed writing compound dtype datasets with a single field. @rly [#1420](https://github.com/hdmf-dev/hdmf/pull/1420) - Replaced undeclared `pyyaml` dependency with `ruamel.yaml` (a declared core dependency) in `test_common_io.py`. The test relied on PyYAML being transitively installed by pip's `h5py`, which is not the case in conda environments. @rly [#1418](https://github.com/hdmf-dev/hdmf/pull/1418) + ## HDMF 5.0.0 (March 2, 2026) ### Changed @@ -266,6 +274,7 @@ is available on build (during the write process), but not on read of a dataset f ### Bug Fixes - Fixed `TermSetWrapper` warning raised during the setters. @mavaylon1 [#1116](https://github.com/hdmf-dev/hdmf/pull/1116) + ## HDMF 3.13.0 (March 20, 2024) ### Enhancements diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index 38169828f..82b72882c 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -410,7 +410,7 @@ def validate(self, **kwargs): else: target_spec = self.vmap.namespace.catalog.get_spec(spec.dtype.target_type) data_type = value.attributes.get(target_spec.type_key()) - hierarchy = self.vmap.namespace.catalog.get_hierarchy(data_type) + hierarchy = self.vmap.namespace.get_hierarchy(data_type) if spec.dtype.target_type not in hierarchy: ret.append(IncorrectDataType(self.get_spec_loc(spec), spec.dtype.target_type, data_type)) @@ -505,6 +505,27 @@ class DatasetValidator(BaseStorageValidator): def __init__(self, **kwargs): super().__init__(**kwargs) + def _check_ref_target_type(self, val, expected_type, type_key, builder, ret): + """Helper to recursively validate reference target types and hierarchy.""" + if isinstance(val, ReferenceBuilder): + target = val.builder + ref_type = target.attributes.get(type_key) + + if expected_type is not None and ref_type is not None: + hierarchy = self.vmap.namespace.get_hierarchy(ref_type) + if expected_type not in hierarchy: + ret.append( + IncorrectDataType( + self.get_spec_loc(self.spec), + f"{expected_type} (or subtype)", + ref_type, + location=self.get_builder_loc(builder) + ) + ) + elif isinstance(val, (list, tuple, np.ndarray)): + for v in val: + self._check_ref_target_type(v, expected_type, type_key, builder, ret) + @docval({"name": "builder", "type": DatasetBuilder, "doc": "the builder to validate"}, returns='a list of Errors', rtype=list) def validate(self, **kwargs): @@ -523,16 +544,36 @@ def validate(self, **kwargs): ) ) - if not check_type(self.spec.dtype, dtype, string_format): - if isinstance(self.spec.dtype, RefSpec): - expected = f'{self.spec.dtype.reftype} reference' - else: + if not isinstance(self.spec.dtype, RefSpec): + if not check_type(self.spec.dtype, dtype, string_format): expected = self.spec.dtype - ret.append(DtypeError(self.get_spec_loc(self.spec), expected, dtype, - location=self.get_builder_loc(builder))) + ret.append( + DtypeError( + self.get_spec_loc(self.spec), + expected, + dtype, + location=self.get_builder_loc(builder) + ) + ) + + else: + if dtype != 'object': + ret.append( + DtypeError( + self.get_spec_loc(self.spec), + "object reference", + dtype, + location=self.get_builder_loc(builder) + ) + ) + expected_target = self.spec.dtype.target_type + type_key = self.spec.type_key() + self._check_ref_target_type(data, expected_target, type_key, builder, ret) + except EmptyArrayError: # do not validate dtype of empty array. HDMF does not yet set dtype when writing a list/tuple pass + if isinstance(builder.dtype, list) and len(np.shape(builder.data)) == 0: shape = () # scalar compound dataset elif isinstance(builder.dtype, list): diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index 0a38d65b4..f9e3af2f0 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -12,7 +12,8 @@ from hdmf.testing import TestCase, remove_test_file from hdmf.validate import ValidatorMap from hdmf.validate.errors import (DtypeError, MissingError, ExpectedArrayError, MissingDataType, - IncorrectQuantityError, IllegalLinkError, ShapeError, ValidationWarning, ExtraFieldWarning, Error) + IncorrectQuantityError, IllegalLinkError, ShapeError, ValidationWarning, ExtraFieldWarning, Error, IncorrectDataType) + from hdmf.backends.hdf5 import HDF5IO from hdmf.utils import ZARR_INSTALLED, StrDataset from collections import defaultdict, OrderedDict, Counter @@ -1765,6 +1766,142 @@ def test_utf8_for_ascii_hdf5(self): self.assertIsInstance(results[0], DtypeError) self.assertEqual("Foo/data (my_foo/data): incorrect type - expected 'bytes', got 'utf'", str(results[0])) + def test_dataset_reference_type_validation_hierarchy_success(self): + """Test that subtype references are accepted via type hierarchy.""" + + foo_spec = DatasetSpec( + doc='dataset with ref', + dtype=RefSpec('Bar', 'object'), + data_type_def='Foo', + shape=None + ) + + bar_spec = DatasetSpec( + doc='base dataset', + data_type_def='Bar', + dtype='int', + shape=None + ) + + subbar_spec = DatasetSpec( + doc='subtype dataset', + data_type_def='SubBar', + data_type_inc='Bar', + dtype='int', + shape=None + ) + + catalog = SpecCatalog() + for spec in [foo_spec, bar_spec, subbar_spec]: + catalog.register_spec(spec, 'test.yaml') + + namespace = SpecNamespace( + 'test ns', 'test_ns', + [{'source': 'test.yaml'}], + version='0.1.0', + catalog=catalog + ) + + vmap = ValidatorMap(namespace) + + subbar = DatasetBuilder('subbar', 5, attributes={'data_type': 'SubBar'}) + foo = DatasetBuilder('foo', ReferenceBuilder(subbar), attributes={'data_type': 'Foo'}) + + errors = vmap.validate(foo) + + assert len(errors) == 0 + + def test_dataset_reference_type_validation_failure(self): + foo_spec = DatasetSpec( + doc='dataset with ref', + dtype=RefSpec('Bar', 'object'), + data_type_def='Foo', + shape=None + ) + + bar_spec = DatasetSpec( + doc='correct dataset', + data_type_def='Bar', + dtype='int', + shape=None + ) + + baz_spec = DatasetSpec( + doc='wrong dataset', + data_type_def='Baz', + dtype='int', + shape=None + ) + + catalog = SpecCatalog() + for spec in [foo_spec, bar_spec, baz_spec]: + catalog.register_spec(spec, 'test.yaml') + + namespace = SpecNamespace( + 'test ns', 'test_ns', + [{'source': 'test.yaml'}], + version='0.1.0', + catalog=catalog + ) + + vmap = ValidatorMap(namespace) + + baz = DatasetBuilder('baz', 5, attributes={'data_type': 'Baz'}) + foo = DatasetBuilder('foo', ReferenceBuilder(baz), attributes={'data_type': 'Foo'}) + + errors = vmap.validate(foo) + + assert len(errors) == 1 + assert isinstance(errors[0], IncorrectDataType) + + def test_dataset_reference_list_mixed(self): + foo_spec = DatasetSpec( + doc='dataset with ref', + dtype=RefSpec('Bar', 'object'), + data_type_def='Foo' + ) + + bar_spec = DatasetSpec( + doc='correct dataset', + data_type_def='Bar', + dtype='int' + ) + + baz_spec = DatasetSpec( + doc='wrong dataset', + data_type_def='Baz', + dtype='int' + ) + + catalog = SpecCatalog() + for spec in [foo_spec, bar_spec, baz_spec]: + catalog.register_spec(spec, 'test.yaml') + + namespace = SpecNamespace( + 'test ns', 'test_ns', + [{'source': 'test.yaml'}], + version='0.1.0', + catalog=catalog + ) + + vmap = ValidatorMap(namespace) + + bar = DatasetBuilder('bar', 5, attributes={'data_type': 'Bar'}) + baz = DatasetBuilder('baz', 5, attributes={'data_type': 'Baz'}) + + foo = DatasetBuilder( + 'foo', + [ + ReferenceBuilder(bar), + ReferenceBuilder(baz) + ], + attributes={'data_type': 'Foo'} + ) + + errors = vmap.validate(foo) + + assert len(errors) == 1 + class TestISODateTimeTimezone(ValidatorTestBase): """Test that isodatetime specs for both Datasets and Attributes require timezone information.""" From 48519de80aa252bc993bf66035fda62dca238a11 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:47:13 -0700 Subject: [PATCH 03/19] Bump codecov/codecov-action from 5 to 6 (#1433) Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly <310197+rly@users.noreply.github.com> --- .github/workflows/run_coverage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_coverage.yml b/.github/workflows/run_coverage.yml index c4ab3345a..656f1549c 100644 --- a/.github/workflows/run_coverage.yml +++ b/.github/workflows/run_coverage.yml @@ -66,7 +66,7 @@ jobs: pytest --cov --cov-report=xml --cov-report=term - name: Upload coverage to Codecov - uses: codecov/codecov-action@v5 + uses: codecov/codecov-action@v6 with: fail_ci_if_error: true files: ./coverage.xml @@ -120,7 +120,7 @@ jobs: pytest --cov --cov-report=xml --cov-report=term tests/unit/test_io_hdf5_streaming.py - name: Upload coverage to Codecov - uses: codecov/codecov-action@v5 + uses: codecov/codecov-action@v6 with: fail_ci_if_error: true files: ./coverage.xml From 55f674d29c8700cdf2c2136e65c118ca26da219d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Mar 2026 23:52:29 +0000 Subject: [PATCH 04/19] [pre-commit.ci] pre-commit autoupdate (#1434) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly <310197+rly@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f6958796..e698bb4e1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: # hooks: # - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.15.7 + rev: v0.15.8 hooks: - id: ruff # - repo: https://github.com/econchick/interrogate From 4d1172180e27fccfe50d89abd19063518b4c2e7d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 18 Apr 2026 18:21:54 +0000 Subject: [PATCH 05/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/hdmf/validate/errors.py | 4 ++-- src/hdmf/validate/validator.py | 4 ++-- tests/unit/validator_tests/test_validate.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/hdmf/validate/errors.py b/src/hdmf/validate/errors.py index eae51ee72..6fd248d76 100644 --- a/src/hdmf/validate/errors.py +++ b/src/hdmf/validate/errors.py @@ -220,8 +220,8 @@ class ValidationWarning(Error): """Base class for validation warnings.""" @property def message(self): - return self.reason + return self.reason class ExtraFieldWarning(ValidationWarning): """Warning for fields found in data but not defined in any applicable spec.""" - pass \ No newline at end of file + pass diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index 82b72882c..800c7c707 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -639,7 +639,7 @@ def __validate_children(self, parent_builder): if self.spec.data_type is not None: for extra_builder in matcher.unmatched_builders: if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): - continue + continue if extra_builder.name in extra_elements: continue yield ValidationWarning( @@ -703,7 +703,7 @@ def __validate_child_builder(self, child_spec, child_builder, parent_builder): yield self.__construct_illegal_link_error(child_spec, parent_builder) return # do not validate illegally linked objects child_builder = child_builder.builder - + child_builder_data_type = child_builder.attributes.get(self.spec.type_key()) validators = list(self.__get_child_validators(child_spec, child_builder_data_type)) diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index f9e3af2f0..d302caea8 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -2078,7 +2078,7 @@ def getSpecs(self): ] ), ) - + def test_attribute_intersection_no_warning(self): """Test that 'resolution' triggers NO warning because ext_spec knows it.""" dataset = DatasetBuilder( @@ -2129,4 +2129,4 @@ def test_unexpected_element_warning(self): warnings = [r for r in result if isinstance(r, ValidationWarning) and "Unexpected element" in str(r)] self.assertEqual(len(warnings), 1) - self.assertIn("extra_ds", str(warnings)) \ No newline at end of file + self.assertIn("extra_ds", str(warnings)) From 2e9565c02b4537991f82c75341f7379087bfb5cc Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 20 Apr 2026 14:41:52 -0400 Subject: [PATCH 06/19] Make `expandable` a list; default expands only DynamicTable columns (#1439) Co-authored-by: Claude Opus 4.6 (1M context) Co-authored-by: Claude Opus 4.7 (1M context) Co-authored-by: rly <310197+rly@users.noreply.github.com> --- CHANGELOG.md | 3 + src/hdmf/backends/hdf5/h5tools.py | 47 +++++-- tests/unit/common/test_table.py | 1 + tests/unit/test_io_hdf5_h5tools.py | 216 +++++++++++++++++++++++++++-- 4 files changed, 246 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 86be68bec..0d204394e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ## Unreleased +### Breaking changes +- `HDF5IO` `expandable` argument is now a list of data type names instead of a boolean. The default is `("VectorData", "ElementIdentifiers")`, so only `DynamicTable` columns and id are expandable out of the box — previously every dataset with a matching spec shape was expanded. Datasets of types outside this list that previously were expandable by default will now default to fixed-shape on-disk layout; add the relevant type to `expandable` to restore prior behavior. Replace `expandable=True` with an explicit list (e.g. `["VectorData", "ElementIdentifiers", "MyType"]`) and `expandable=False` with `[]`; passing `True`/`False` now raises a `TypeError`. @bendichter @rly [#1439](https://github.com/hdmf-dev/hdmf/pull/1439) + ### Fixed - Added missing validation for dataset reference target types to ensure correct `RefSpec.target_type` matching. @sejalpunwatkar [#1429](https://github.com/hdmf-dev/hdmf/pull/1429) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index ca564aee9..38d440a26 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -310,9 +310,13 @@ def __get_namespaces(cls, file_obj): {'name': 'herd', 'type': 'hdmf.common.resources.HERD', 'doc': 'A HERD object to populate with references.', 'default': None}, - {'name': 'expandable', 'type': bool, 'default': True, - 'doc': ('If True (default), datasets will be created as expandable by setting the maxshape ' - 'based on the matching shape defined in the spec.')}) + {'name': 'expandable', 'type': (list, tuple), + 'default': ("VectorData", "ElementIdentifiers"), + 'doc': ('A list of data type names whose datasets (and subclasses) will be created as ' + 'expandable — maxshape is set based on the matching shape defined in the spec. ' + 'Default is ("VectorData", "ElementIdentifiers"), so only DynamicTable columns ' + 'and id are expandable. Pass an empty list/tuple to disable automatic expansion ' + 'entirely.')}) def write(self, **kwargs): """Write the container to an HDF5 file.""" if self.__mode == 'r': @@ -759,9 +763,13 @@ def close_linked_files(self): 'default': True}, {'name': 'export_source', 'type': str, 'doc': 'The source of the builders when exporting', 'default': None}, - {'name': 'expandable', 'type': bool, 'default': True, - 'doc': ('If True (default), datasets will be created as expandable by setting the maxshape ' - 'based on the matching shape defined in the spec.')}) + {'name': 'expandable', 'type': (list, tuple), + 'default': ("VectorData", "ElementIdentifiers"), + 'doc': ('A list of data type names whose datasets (and subclasses) will be created as ' + 'expandable — maxshape is set based on the matching shape defined in the spec. ' + 'Default is ("VectorData", "ElementIdentifiers"), so only DynamicTable columns ' + 'and id are expandable. Pass an empty list/tuple to disable automatic expansion ' + 'entirely.')}) def write_builder(self, **kwargs): f_builder = popargs('builder', kwargs) self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s" @@ -939,9 +947,13 @@ def _filler(): 'default': True}, {'name': 'export_source', 'type': str, 'doc': 'The source of the builders when exporting', 'default': None}, - {'name': 'expandable', 'type': bool, 'default': True, - 'doc': ('If True (default), datasets will be created as expandable by setting the maxshape ' - 'based on the matching shape defined in the spec.')}, + {'name': 'expandable', 'type': (list, tuple), + 'default': ("VectorData", "ElementIdentifiers"), + 'doc': ('A list of data type names whose datasets (and subclasses) will be created as ' + 'expandable — maxshape is set based on the matching shape defined in the spec. ' + 'Default is ("VectorData", "ElementIdentifiers"), so only DynamicTable columns ' + 'and id are expandable. Pass an empty list/tuple to disable automatic expansion ' + 'entirely.')}, returns='the Group that was created', rtype=Group) def write_group(self, **kwargs): parent, builder = popargs('parent', 'builder', kwargs) @@ -1042,9 +1054,13 @@ def write_link(self, **kwargs): 'default': True}, {'name': 'export_source', 'type': str, 'doc': 'The source of the builders when exporting', 'default': None}, - {'name': 'expandable', 'type': bool, 'default': True, - 'doc': ('If True (default), datasets will be created as expandable by setting the maxshape ' - 'based on the matching shape defined in the spec.')}, + {'name': 'expandable', 'type': (list, tuple), + 'default': ("VectorData", "ElementIdentifiers"), + 'doc': ('A list of data type names whose datasets (and subclasses) will be created as ' + 'expandable — maxshape is set based on the matching shape defined in the spec. ' + 'Default is ("VectorData", "ElementIdentifiers"), so only DynamicTable columns ' + 'and id are expandable. Pass an empty list/tuple to disable automatic expansion ' + 'entirely.')}, returns='the Dataset that was created', rtype=Dataset) def write_dataset(self, **kwargs): # noqa: C901 """ Write a dataset to HDF5 @@ -1071,12 +1087,17 @@ def write_dataset(self, **kwargs): # noqa: C901 else: options['io_settings'] = {} - # Set maxshape to make a non-scalar dataset expandable but do not override existing settings + # Set maxshape to make datasets expandable. `expandable` is a list/tuple of data type names: + # a dataset is made expandable if its data type (or an ancestor) is in the list. The default + # list ("VectorData", "ElementIdentifiers") covers DynamicTable columns and id, which users + # commonly need to append to. An empty list disables automatic expansion. if ( expandable and 'maxshape' not in options['io_settings'] and np.ndim(data) != 0 and matched_spec_shape is not None + and self.manager.get_builder_dt(builder) is not None + and any(self.manager.is_sub_data_type(builder, dt) for dt in expandable) ): options['io_settings']['maxshape'] = matched_spec_shape diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index 7122dfb41..e5cf80ff2 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -3211,6 +3211,7 @@ def test_append(self, cache_spec=False): self.assertContainerEqual(read_container['table']['y'][-1], group1) + class TestVectorIndexDtype(TestCase): def set_up_array_index(self): diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index f47f4e641..e522e52a8 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -23,19 +23,22 @@ from hdmf.build import GroupBuilder, DatasetBuilder, BuildManager, TypeMap, OrphanContainerBuildError, LinkBuilder from hdmf.container import Container, HERDManager from hdmf import Data, docval -from hdmf.common import SimpleMultiContainer, get_hdf5io +from hdmf.common import (DynamicTable, VectorData, SimpleMultiContainer, get_hdf5io, get_manager, + get_type_map) from hdmf.data_utils import DataChunkIterator, GenericDataChunkIterator, InvalidDataIOError, append_data +from hdmf.spec import DatasetSpec from hdmf.spec.catalog import SpecCatalog from hdmf.spec.namespace import NamespaceCatalog, SpecNamespace from hdmf.spec.spec import GroupSpec, DtypeSpec -from hdmf.testing import TestCase, remove_test_file +from hdmf.testing import TestCase, H5RoundTripMixin, remove_test_file from hdmf.common.resources import HERD from hdmf.term_set import TermSet, TermSetWrapper from hdmf.utils import get_data_shape from tests.unit.helpers.utils import (Foo, FooBucket, FooFile, get_foo_buildmanager, Baz, BazData, BazCpdData, BazBucket, get_baz_buildmanager, - CORE_NAMESPACE, get_temp_filepath, CacheSpecTestHelper, + CORE_NAMESPACE, create_load_namespace_yaml, get_temp_filepath, + CacheSpecTestHelper, CustomGroupSpec, CustomDatasetSpec, CustomSpecNamespace, QuxData, QuxBucket, get_qux_buildmanager) from tests.unit.helpers.io import DoNothingIO @@ -878,14 +881,18 @@ def test_roundtrip_basic(self): read_foofile.buckets['bucket1'].foos['foo1'].my_data[:].tolist()) def test_roundtrip_basic_append(self): - # Setup all the data we need - foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14) + # Foo.my_data is an anonymous (untyped) dataset, so the default `expandable` list does + # not cover it. Wrap its data in H5DataIO with an explicit maxshape to make it expandable. + foo1 = Foo('foo1', H5DataIO(data=[1, 2, 3, 4, 5], maxshape=(None,)), "I am foo1", 17, 3.14) foobucket = FooBucket('bucket1', [foo1]) foofile = FooFile(buckets=[foobucket]) with HDF5IO(self.path, manager=self.manager, mode='w') as io: io.write(foofile) + with h5py.File(self.path, 'r') as f: + self.assertEqual(f['buckets/bucket1/foo_holder/foo1/my_data'].maxshape, (None,)) + with HDF5IO(self.path, manager=self.manager, mode='a') as io: read_foofile = io.read() data = read_foofile.buckets['bucket1'].foos['foo1'].my_data @@ -4175,7 +4182,7 @@ def test_expand_false(self): foofile = FooFile(buckets=[foobucket]) with HDF5IO(self.path, manager=self.manager, mode='w') as io: - io.write(foofile, expandable=False) + io.write(foofile, expandable=[]) with HDF5IO(self.path, manager=self.manager, mode='r') as io: read_foofile = io.read() @@ -4191,7 +4198,7 @@ def test_multi_shape_no_labels(self): manager = get_qux_buildmanager([[None, None], [None, 3]]) with HDF5IO(self.path, manager=manager, mode='w') as io: - io.write(quxbucket, expandable=True) + io.write(quxbucket, expandable=['QuxData']) with HDF5IO(self.path, manager=manager, mode='r') as io: read_quxbucket = io.read() @@ -4204,7 +4211,7 @@ def test_expand_set_shape(self): manager = get_qux_buildmanager([None, 3]) with HDF5IO(self.path, manager=manager, mode='w') as io: - io.write(quxbucket, expandable=True) + io.write(quxbucket, expandable=['QuxData']) with HDF5IO(self.path, manager=manager, mode='r+') as io: read_quxbucket = io.read() @@ -4215,3 +4222,196 @@ def test_expand_set_shape(self): [7, 8, 9]]) npt.assert_array_equal(read_quxbucket.qux_data.data[:], expected) self.assertEqual(read_quxbucket.qux_data.data.maxshape, (None, 3)) + + +class TestDefaultExpandable(H5RoundTripMixin, TestCase): + """Test that VectorData, VectorIndex, and ElementIdentifiers datasets are expandable by default.""" + + def setUpContainer(self): + table = DynamicTable(name='table0', description='an example table') + table.add_column(name='foo', description='an int column') + table.add_column(name='bar', description='an indexed column', index=True) + table.add_row(foo=1, bar=[1, 2, 3]) + table.add_row(foo=2, bar=[4, 5]) + return table + + def test_roundtrip(self): + super().test_roundtrip() + + with h5py.File(self.filename, 'r') as f: + # VectorData columns should be expandable (maxshape has None in first dim) + self.assertEqual(f['foo'].maxshape, (None,)) + self.assertIsNotNone(f['foo'].chunks) + + # VectorIndex columns should be expandable + self.assertEqual(f['bar_index'].maxshape, (None,)) + self.assertIsNotNone(f['bar_index'].chunks) + + # Indexed VectorData should be expandable + self.assertEqual(f['bar'].maxshape, (None,)) + self.assertIsNotNone(f['bar'].chunks) + + # ElementIdentifiers (id column) should be expandable + self.assertEqual(f['id'].maxshape, (None,)) + self.assertIsNotNone(f['id'].chunks) + + +class TestDefaultExpandableWithReferences(H5RoundTripMixin, TestCase): + """Test that a VectorData column of references is expandable by default.""" + + def setUpContainer(self): + group1 = Container('group1') + group2 = Container('group2') + + table = DynamicTable(name='table0', description='an example table') + table.add_column(name='ref', description='a reference column') + table.add_row(ref=group1) + table.add_row(ref=group2) + + multi = SimpleMultiContainer(name='multi') + multi.add_container(group1) + multi.add_container(group2) + multi.add_container(table) + return multi + + def test_roundtrip(self): + super().test_roundtrip() + + with h5py.File(self.filename, 'r') as f: + ref_ds = f['table0/ref'] + self.assertEqual(ref_ds.maxshape, (None,)) + self.assertIsNotNone(ref_ds.chunks) + # Reference dtype should not bypass the expandable branch. + self.assertTrue(h5py.check_ref_dtype(ref_ds.dtype)) + # Stored refs resolve to the expected target groups. + self.assertIsInstance(ref_ds[0], h5py.Reference) + self.assertEqual(f[ref_ds[0]].name, '/group1') + self.assertEqual(f[ref_ds[1]].name, '/group2') + + +class TestDefaultExpandableExplicitOverride(H5RoundTripMixin, TestCase): + """Test that explicit H5DataIO settings are not overridden by the default expandable behavior.""" + + def setUpContainer(self): + # User explicitly sets maxshape — should be respected + foo = VectorData( + name='foo', + description='a column with explicit maxshape', + data=H5DataIO(data=[1, 2, 3], maxshape=(10,)), + ) + table = DynamicTable(name='table0', description='an example table', columns=[foo]) + return table + + def test_roundtrip(self): + super().test_roundtrip() + + with h5py.File(self.filename, 'r') as f: + # User's explicit maxshape should be preserved, not overridden + self.assertEqual(f['foo'].maxshape, (10,)) + + +class TestExpandableArg(TestCase): + """Test explicit values of the `expandable` argument to HDF5IO.write.""" + + def setUp(self): + self.filename = get_temp_filepath() + + def tearDown(self): + remove_test_file(self.filename) + + def _make_table(self): + table = DynamicTable(name='table0', description='an example table') + table.add_column(name='foo', description='an int column') + table.add_row(foo=1) + table.add_row(foo=2) + return table + + def test_empty_disables_expansion(self): + with HDF5IO(self.filename, manager=get_manager(), mode='w') as io: + io.write(self._make_table(), expandable=[]) + + with h5py.File(self.filename, 'r') as f: + # With an empty expandable list, non-scalar datasets get a fixed shape, not (None,). + self.assertEqual(f['foo'].maxshape, (2,)) + self.assertEqual(f['id'].maxshape, (2,)) + + def test_custom_list_expands_only_listed(self): + with HDF5IO(self.filename, manager=get_manager(), mode='w') as io: + io.write(self._make_table(), expandable=['VectorData']) + + with h5py.File(self.filename, 'r') as f: + # VectorData column 'foo' is in the list → expandable. + self.assertEqual(f['foo'].maxshape, (None,)) + # ElementIdentifiers 'id' is NOT in the list → fixed shape. + self.assertEqual(f['id'].maxshape, (2,)) + + def test_bool_raises_type_error(self): + """Passing the old boolean API must fail loudly, per the 5.2.0 breaking change.""" + with HDF5IO(self.filename, manager=get_manager(), mode='w') as io: + with self.assertRaises(TypeError): + io.write(self._make_table(), expandable=True) + with self.assertRaises(TypeError): + io.write(self._make_table(), expandable=False) + + +class TestDefaultExpandableSubclasses(TestCase): + """Test that subclasses of VectorData and ElementIdentifiers are expandable by default.""" + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + custom_ei_spec = DatasetSpec( + data_type_def='CustomElementIdentifiers', + data_type_inc='ElementIdentifiers', + doc='a custom ElementIdentifiers subclass', + ) + custom_vd_spec = DatasetSpec( + data_type_def='CustomVectorData', + data_type_inc='VectorData', + doc='a custom VectorData subclass', + ) + + self.type_map = get_type_map() + create_load_namespace_yaml( + namespace_name=CORE_NAMESPACE, + specs=[custom_ei_spec, custom_vd_spec], + output_dir=self.test_dir, + incl_types={'hdmf-common': ['ElementIdentifiers', 'VectorData', 'DynamicTable']}, + type_map=self.type_map, + ) + self.manager = BuildManager(self.type_map) + + self.CustomElementIdentifiers = self.type_map.get_dt_container_cls( + 'CustomElementIdentifiers', CORE_NAMESPACE + ) + self.CustomVectorData = self.type_map.get_dt_container_cls( + 'CustomVectorData', CORE_NAMESPACE + ) + + self.filename = os.path.join(self.test_dir, 'test_expandable_subclasses.h5') + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_subclasses_are_expandable(self): + custom_ids = self.CustomElementIdentifiers(name='id', data=[10, 20]) + custom_col = self.CustomVectorData( + name='custom_col', description='a custom column', data=[1.0, 2.0] + ) + table = DynamicTable( + name='table0', + description='an example table', + id=custom_ids, + columns=[custom_col], + ) + + with HDF5IO(self.filename, manager=self.manager, mode='w') as write_io: + write_io.write(table) + + with h5py.File(self.filename, 'r') as f: + # ElementIdentifiers subclass (id) should be expandable + self.assertEqual(f['id'].maxshape, (None,)) + self.assertIsNotNone(f['id'].chunks) + # VectorData subclass (custom_col) should be expandable + self.assertEqual(f['custom_col'].maxshape, (None,)) + self.assertIsNotNone(f['custom_col'].chunks) From dc890012ec2e8a6e4c03dd44363080fb7366053a Mon Sep 17 00:00:00 2001 From: Sejal Date: Sat, 18 Apr 2026 17:44:37 +0000 Subject: [PATCH 07/19] WIP: validation warning handling (2 failing tests remaining) --- src/hdmf/validate/errors.py | 1 + src/hdmf/validate/validator.py | 5 +++++ tests/unit/validator_tests/test_validate.py | 2 ++ 3 files changed, 8 insertions(+) diff --git a/src/hdmf/validate/errors.py b/src/hdmf/validate/errors.py index 6fd248d76..b16015887 100644 --- a/src/hdmf/validate/errors.py +++ b/src/hdmf/validate/errors.py @@ -225,3 +225,4 @@ def message(self): class ExtraFieldWarning(ValidationWarning): """Warning for fields found in data but not defined in any applicable spec.""" pass + diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index 800c7c707..3dd723e8b 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -640,6 +640,11 @@ def __validate_children(self, parent_builder): for extra_builder in matcher.unmatched_builders: if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): continue + + if extra_builder.name in seen: + continue + seen.add(extra_builder.name) + if extra_builder.name in extra_elements: continue yield ValidationWarning( diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index d302caea8..c6195bbaf 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -2129,4 +2129,6 @@ def test_unexpected_element_warning(self): warnings = [r for r in result if isinstance(r, ValidationWarning) and "Unexpected element" in str(r)] self.assertEqual(len(warnings), 1) + self.assertIn("extra_ds", str(warnings)) + From 501df27ae0b6d9b6797f7bc3727d8da84e2b6b4b Mon Sep 17 00:00:00 2001 From: Sejal Date: Thu, 23 Apr 2026 14:20:28 +0000 Subject: [PATCH 08/19] Fix validation: skip wrong-named children, dedupe warnings, fix scalar shape error --- src/hdmf/validate/validator.py | 126 ++++++++++++++++++-- tests/unit/validator_tests/test_validate.py | 2 + 2 files changed, 117 insertions(+), 11 deletions(-) diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index 3dd723e8b..452bc1756 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -436,6 +436,7 @@ def validate(self, **kwargs): if not check_shape(spec.shape, shape): if shape is None: ret.append(ExpectedArrayError(self.get_spec_loc(self.spec), self.spec.shape, str(value))) + return ret else: ret.append(ShapeError(self.get_spec_loc(spec), spec.shape, shape)) @@ -481,18 +482,60 @@ def validate(self, **kwargs): for builder_attr in attributes: if builder_attr not in spec_attr_names: dt = attributes.get(self.spec.type_key()) + found = False + if not found and dt is not None: + try: + for candidate_dt, v in self.vmap._ValidatorMap__validator_map.items(): + if candidate_dt == dt: + continue + if hasattr(v, "_BaseStorageValidator__attribute_validators"): + candidate_hierarchy = self.vmap.namespace.get_hierarchy(candidate_dt) + if dt in candidate_hierarchy: + dt_attrs = set(v._BaseStorageValidator__attribute_validators.keys()) + if builder_attr in dt_attrs: + found = True + break + except Exception: + pass if dt is not None: - dt_validator = self.vmap.get_validator(dt) - if hasattr(dt_validator, "_BaseStorageValidator__attribute_validators"): - dt_attrs = set(dt_validator._BaseStorageValidator__attribute_validators.keys()) - if builder_attr in dt_attrs: - continue + try: + types_to_check = [dt] + list(self.vmap.namespace.get_hierarchy(dt)) + for candidate_dt in types_to_check: + v = self.vmap.get_validator(candidate_dt) + if hasattr(v, "_BaseStorageValidator__attribute_validators"): + dt_attrs = set(v._BaseStorageValidator__attribute_validators.keys()) + if builder_attr in dt_attrs: + found = True + break + except Exception: + pass + + if not found: + try: + for ns_name in self.vmap.namespace.catalog.get_namespace_names(): + ns = self.vmap.namespace.catalog.get_namespace(ns_name) + for candidate_dt in ns.get_registered_types(): + candidate_hierarchy = self.vmap.namespace.get_hierarchy(candidate_dt) + if dt in candidate_hierarchy and candidate_dt != dt: + v = self.vmap.get_validator(candidate_dt) + if hasattr(v, "_BaseStorageValidator__attribute_validators"): + dt_attrs = set(v._BaseStorageValidator__attribute_validators.keys()) + if builder_attr in dt_attrs: + found = True + break + if found: + break + if found: + break + except Exception: + pass - ret.append(ExtraFieldWarning( + if not found: + ret.append(ExtraFieldWarning( self.get_spec_loc(self.spec), f"Unexpected attribute '{builder_attr}' encountered in {self.spec.name}", location=self.get_builder_loc(builder) - )) + )) return ret @@ -634,9 +677,47 @@ def __validate_children(self, parent_builder): matcher.assign_to_specs(builder_children) errors = [] - extra_elements = {b.name for b in matcher.unmatched_builders} + seen = set() + effective_dt = self.spec.data_type or parent_builder.attributes.get(self.spec.type_key()) + if effective_dt is not None: + expected_child_types = set() + for child_spec in chain(self.spec.datasets, self.spec.groups, self.spec.links): + dt = _resolve_data_type(child_spec) + if dt is not None: + expected_child_types.add(dt) + try: + for ancestor_dt in self.vmap.namespace.get_hierarchy(effective_dt): + ancestor_validator = self.vmap.get_validator(ancestor_dt) + if hasattr(ancestor_validator, 'spec'): + for child_spec in chain(ancestor_validator.spec.datasets, + ancestor_validator.spec.groups, + ancestor_validator.spec.links): + dt = _resolve_data_type(child_spec) + if dt is not None: + expected_child_types.add(dt) + except Exception: + pass + builder_effective_dt = parent_builder.attributes.get(self.spec.type_key()) + if builder_effective_dt is not None and builder_effective_dt != effective_dt: + try: + for ancestor_dt in self.vmap.namespace.get_hierarchy(builder_effective_dt): + ancestor_validator = self.vmap.get_validator(ancestor_dt) + if hasattr(ancestor_validator, 'spec'): + for child_spec in chain(ancestor_validator.spec.datasets, + ancestor_validator.spec.groups, + ancestor_validator.spec.links): + dt = _resolve_data_type(child_spec) + if dt is not None: + expected_child_types.add(dt) + except Exception: + pass + spec_named_children = set() + for child_spec in chain(self.spec.datasets, self.spec.groups, self.spec.links): + if hasattr(child_spec, 'name') and child_spec.name is not None: + spec_named_children.add(child_spec.name) + + COLUMN_TYPES = {'VectorData', 'VectorIndex', 'ElementIdentifiers', 'DynamicTableRegion'} - if self.spec.data_type is not None: for extra_builder in matcher.unmatched_builders: if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): continue @@ -644,9 +725,32 @@ def __validate_children(self, parent_builder): if extra_builder.name in seen: continue seen.add(extra_builder.name) - - if extra_builder.name in extra_elements: + if isinstance(extra_builder, LinkBuilder): + extra_builder = extra_builder.builder + builder_dt = extra_builder.attributes.get(self.spec.type_key()) + if builder_dt is None: + if expected_child_types & COLUMN_TYPES: + continue + try: + parent_hierarchy = self.vmap.namespace.get_hierarchy(effective_dt) + if 'DynamicTable' in parent_hierarchy: + continue + except Exception: + pass + else: + try: + hierarchy = self.vmap.namespace.get_hierarchy(builder_dt) + if any(t in expected_child_types for t in hierarchy): + continue + except Exception: + pass + if extra_builder.name in spec_named_children: + continue + if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): continue + if extra_builder.name in seen: + continue + seen.add(extra_builder.name) yield ValidationWarning( self.get_spec_loc(self.spec), f"Unexpected element '{extra_builder.name}' encountered in {self.spec.name}", diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index c6195bbaf..5a2a32704 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -1653,6 +1653,8 @@ def test_scalar_instead_of_array(self): ] ) result = self.vmap.validate(builder) + for r in result: + print(f"DEBUG error: {type(r).__name__}: {r}") self.assertEqual(len(result), 1) # Should be ExpectedArrayError, not ShapeError self.assertIsInstance(result[0], ExpectedArrayError) From 883b7330fc1a22a8da849f97289bfef9b105288a Mon Sep 17 00:00:00 2001 From: Sejal Date: Thu, 23 Apr 2026 14:50:27 +0000 Subject: [PATCH 09/19] Fix validator: clean duplicate checks in unmatched builders --- src/hdmf/validate/validator.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index 452bc1756..cb96e3d38 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -719,9 +719,8 @@ def __validate_children(self, parent_builder): COLUMN_TYPES = {'VectorData', 'VectorIndex', 'ElementIdentifiers', 'DynamicTableRegion'} for extra_builder in matcher.unmatched_builders: - if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): + if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): continue - if extra_builder.name in seen: continue seen.add(extra_builder.name) @@ -746,11 +745,6 @@ def __validate_children(self, parent_builder): pass if extra_builder.name in spec_named_children: continue - if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): - continue - if extra_builder.name in seen: - continue - seen.add(extra_builder.name) yield ValidationWarning( self.get_spec_loc(self.spec), f"Unexpected element '{extra_builder.name}' encountered in {self.spec.name}", From 27aa9101b35f48a5862cbf4e7ecafa060d5a0ed6 Mon Sep 17 00:00:00 2001 From: Sejal Date: Fri, 24 Apr 2026 05:09:58 +0000 Subject: [PATCH 10/19] fix: suppress false-positive validation warnings for DynamicTable columns and extended type attributes - Skip unmatched typed children whose type is expected by parent spec hierarchy (MissingDataType covers them) - Skip unmatched untyped children in DynamicTable subtypes (columns like id, foo, VectorData etc.) - Skip unmatched untyped children when spec defines untyped dataset children (wildcard datasets) - Check subtypes when looking up attributes to avoid false ExtraFieldWarning for extended types - Return early from AttributeValidator after ExpectedArrayError to avoid duplicate errors --- src/hdmf/validate/validator.py | 88 +++++++++------------ tests/unit/validator_tests/test_validate.py | 3 +- 2 files changed, 39 insertions(+), 52 deletions(-) diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index cb96e3d38..e765227f2 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -483,20 +483,6 @@ def validate(self, **kwargs): if builder_attr not in spec_attr_names: dt = attributes.get(self.spec.type_key()) found = False - if not found and dt is not None: - try: - for candidate_dt, v in self.vmap._ValidatorMap__validator_map.items(): - if candidate_dt == dt: - continue - if hasattr(v, "_BaseStorageValidator__attribute_validators"): - candidate_hierarchy = self.vmap.namespace.get_hierarchy(candidate_dt) - if dt in candidate_hierarchy: - dt_attrs = set(v._BaseStorageValidator__attribute_validators.keys()) - if builder_attr in dt_attrs: - found = True - break - except Exception: - pass if dt is not None: try: types_to_check = [dt] + list(self.vmap.namespace.get_hierarchy(dt)) @@ -509,24 +495,19 @@ def validate(self, **kwargs): break except Exception: pass - - if not found: + + if not found and dt is not None: try: - for ns_name in self.vmap.namespace.catalog.get_namespace_names(): - ns = self.vmap.namespace.catalog.get_namespace(ns_name) - for candidate_dt in ns.get_registered_types(): + for candidate_dt, v in self.vmap._ValidatorMap__validators.items(): + if candidate_dt == dt: + continue + if hasattr(v, "_BaseStorageValidator__attribute_validators"): candidate_hierarchy = self.vmap.namespace.get_hierarchy(candidate_dt) - if dt in candidate_hierarchy and candidate_dt != dt: - v = self.vmap.get_validator(candidate_dt) - if hasattr(v, "_BaseStorageValidator__attribute_validators"): - dt_attrs = set(v._BaseStorageValidator__attribute_validators.keys()) - if builder_attr in dt_attrs: - found = True - break - if found: - break - if found: - break + if dt in candidate_hierarchy: + dt_attrs = set(v._BaseStorageValidator__attribute_validators.keys()) + if builder_attr in dt_attrs: + found = True + break except Exception: pass @@ -675,7 +656,6 @@ def __validate_children(self, parent_builder): parent_builder.groups.values(), parent_builder.links.values()) matcher.assign_to_specs(builder_children) - errors = [] seen = set() effective_dt = self.spec.data_type or parent_builder.attributes.get(self.spec.type_key()) @@ -697,20 +677,22 @@ def __validate_children(self, parent_builder): expected_child_types.add(dt) except Exception: pass - builder_effective_dt = parent_builder.attributes.get(self.spec.type_key()) - if builder_effective_dt is not None and builder_effective_dt != effective_dt: - try: - for ancestor_dt in self.vmap.namespace.get_hierarchy(builder_effective_dt): - ancestor_validator = self.vmap.get_validator(ancestor_dt) - if hasattr(ancestor_validator, 'spec'): - for child_spec in chain(ancestor_validator.spec.datasets, - ancestor_validator.spec.groups, - ancestor_validator.spec.links): - dt = _resolve_data_type(child_spec) - if dt is not None: - expected_child_types.add(dt) - except Exception: - pass + + builder_effective_dt = parent_builder.attributes.get(self.spec.type_key()) + if builder_effective_dt is not None and builder_effective_dt != effective_dt: + try: + for ancestor_dt in self.vmap.namespace.get_hierarchy(builder_effective_dt): + ancestor_validator = self.vmap.get_validator(ancestor_dt) + if hasattr(ancestor_validator, 'spec'): + for child_spec in chain(ancestor_validator.spec.datasets, + ancestor_validator.spec.groups, + ancestor_validator.spec.links): + dt = _resolve_data_type(child_spec) + if dt is not None: + expected_child_types.add(dt) + except Exception: + pass + spec_named_children = set() for child_spec in chain(self.spec.datasets, self.spec.groups, self.spec.links): if hasattr(child_spec, 'name') and child_spec.name is not None: @@ -719,11 +701,6 @@ def __validate_children(self, parent_builder): COLUMN_TYPES = {'VectorData', 'VectorIndex', 'ElementIdentifiers', 'DynamicTableRegion'} for extra_builder in matcher.unmatched_builders: - if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): - continue - if extra_builder.name in seen: - continue - seen.add(extra_builder.name) if isinstance(extra_builder, LinkBuilder): extra_builder = extra_builder.builder builder_dt = extra_builder.attributes.get(self.spec.type_key()) @@ -736,6 +713,12 @@ def __validate_children(self, parent_builder): continue except Exception: pass + has_untyped_dataset_children = any( + _resolve_data_type(cs) is None + for cs in chain(self.spec.datasets, self.spec.groups, self.spec.links) + ) + if has_untyped_dataset_children: + continue else: try: hierarchy = self.vmap.namespace.get_hierarchy(builder_dt) @@ -745,6 +728,11 @@ def __validate_children(self, parent_builder): pass if extra_builder.name in spec_named_children: continue + if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): + continue + if extra_builder.name in seen: + continue + seen.add(extra_builder.name) yield ValidationWarning( self.get_spec_loc(self.spec), f"Unexpected element '{extra_builder.name}' encountered in {self.spec.name}", diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index 5a2a32704..c928046b4 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -1653,8 +1653,7 @@ def test_scalar_instead_of_array(self): ] ) result = self.vmap.validate(builder) - for r in result: - print(f"DEBUG error: {type(r).__name__}: {r}") + self.assertEqual(len(result), 1) # Should be ExpectedArrayError, not ShapeError self.assertIsInstance(result[0], ExpectedArrayError) From d8a690f9a2f3f231caddf64b5ba3bc32535451ca Mon Sep 17 00:00:00 2001 From: Sejal Date: Tue, 31 Mar 2026 05:10:50 +0530 Subject: [PATCH 11/19] Add validation for dataset reference target types (#632) (#1429) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly <310197+rly@users.noreply.github.com> --- tests/unit/validator_tests/test_validate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index c928046b4..46ea00116 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -13,7 +13,6 @@ from hdmf.validate import ValidatorMap from hdmf.validate.errors import (DtypeError, MissingError, ExpectedArrayError, MissingDataType, IncorrectQuantityError, IllegalLinkError, ShapeError, ValidationWarning, ExtraFieldWarning, Error, IncorrectDataType) - from hdmf.backends.hdf5 import HDF5IO from hdmf.utils import ZARR_INSTALLED, StrDataset from collections import defaultdict, OrderedDict, Counter From e0b61740e8724267f6c0686779b6751d434cd2fd Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Thu, 23 Apr 2026 13:53:15 -0400 Subject: [PATCH 12/19] Set reasonable default chunk sizes (~4 MB) when chunks=True (#1440) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Set reasonable default chunk sizes (~4 MB) when chunks=True h5py's default auto-chunking targets ~32 KB per chunk, which is too small for cloud access where each chunk requires a separate HTTP range request. Add _compute_chunk_shape() that targets ~4 MB chunks (in the recommended 2-16 MB range for cloud-hosted files). The method is applied in __list_fill__, __setup_chunked_dset__, and __setup_empty_dset__ whenever chunks=True (boolean). Also ensures chunks=True is explicitly set when maxshape forces chunking, so the compute logic always triggers. Closes #1438 Co-Authored-By: Claude Opus 4.6 (1M context) * Update src/hdmf/backends/hdf5/h5tools.py Co-authored-by: Ryan Ly <310197+rly@users.noreply.github.com> * Address review: public API, mesoscale-safe chunking, CHANGELOG - Rename _compute_chunk_shape → compute_default_chunk_shape so users can inspect or override the shape that would be used when chunks=True. - Add optional neurodata_type hook (unused in the default implementation) so subclasses can specialize chunking per type without signature churn. - When a single first-axis slice already exceeds the target (e.g. a (1000, 4096, 4096) uint16 mesoscale volume at 32 MB/slice), halve the largest trailing axis until the chunk fits within the target. Prevents auto-chunks from ballooning well past 16 MB on large-frame datasets. - Add tests for the new trailing-dim split path and the bytes_per_row==0 fallback branch flagged in review. - CHANGELOG entry for the ~4 MB default and the new public helper. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) Co-authored-by: Ryan Ly <310197+rly@users.noreply.github.com> --- CHANGELOG.md | 3 ++ src/hdmf/backends/hdf5/h5tools.py | 81 ++++++++++++++++++++++++++++++ tests/unit/test_io_hdf5_h5tools.py | 61 ++++++++++++++++++++++ 3 files changed, 145 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d204394e..8a8c3fe88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ ### Breaking changes - `HDF5IO` `expandable` argument is now a list of data type names instead of a boolean. The default is `("VectorData", "ElementIdentifiers")`, so only `DynamicTable` columns and id are expandable out of the box — previously every dataset with a matching spec shape was expanded. Datasets of types outside this list that previously were expandable by default will now default to fixed-shape on-disk layout; add the relevant type to `expandable` to restore prior behavior. Replace `expandable=True` with an explicit list (e.g. `["VectorData", "ElementIdentifiers", "MyType"]`) and `expandable=False` with `[]`; passing `True`/`False` now raises a `TypeError`. @bendichter @rly [#1439](https://github.com/hdmf-dev/hdmf/pull/1439) +### Enhancements +- Set sensible default chunk sizes (~4 MB, in the recommended 2-16 MB range for cloud-hosted files) when `chunks=True`, replacing h5py's much smaller defaults. Added public `HDF5IO.compute_default_chunk_shape()` so users can inspect or override the chunk shape that would be used. @bendichter [#1440](https://github.com/hdmf-dev/hdmf/pull/1440) + ### Fixed - Added missing validation for dataset reference target types to ensure correct `RefSpec.target_type` matching. @sejalpunwatkar [#1429](https://github.com/hdmf-dev/hdmf/pull/1429) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 38d440a26..d4f7dddc1 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -1101,6 +1101,11 @@ def write_dataset(self, **kwargs): # noqa: C901 ): options['io_settings']['maxshape'] = matched_spec_shape + # Ensure chunking is explicitly enabled when maxshape requires it, so that + # compute_default_chunk_shape can replace it with appropriately-sized chunks later. + if 'maxshape' in options['io_settings'] and 'chunks' not in options['io_settings']: + options['io_settings']['chunks'] = True + attributes = builder.attributes options['dtype'] = builder.dtype dset = None @@ -1338,6 +1343,9 @@ def __setup_chunked_dset__(cls, parent, name, data, options=None): if isinstance(io_settings['dtype'], str): # map to real dtype if we were given a string io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) + # Replace chunks=True with computed chunk shape for better cloud access performance + if io_settings.get('chunks') is True and 'shape' in io_settings and len(io_settings['shape']) > 0: + io_settings['chunks'] = HDF5IO.compute_default_chunk_shape(io_settings['shape'], io_settings.get('dtype')) try: dset = parent.create_dataset(name, **io_settings) except Exception as exc: @@ -1367,6 +1375,9 @@ def __setup_empty_dset__(cls, parent, name, io_settings): if isinstance(io_settings['dtype'], str): # map to real dtype if we were given a string io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) + # Replace chunks=True with computed chunk shape for better cloud access performance + if io_settings.get('chunks') is True and 'shape' in io_settings and len(io_settings['shape']) > 0: + io_settings['chunks'] = HDF5IO.compute_default_chunk_shape(io_settings['shape'], io_settings.get('dtype')) try: dset = parent.create_dataset(name, **io_settings) except Exception as exc: @@ -1418,6 +1429,10 @@ def __list_fill__(cls, parent, name, data, options=None): else: data_shape = get_data_shape(data) + # Replace chunks=True with computed chunk shape for better cloud access performance + if io_settings.get('chunks') is True and data_shape is not None and len(data_shape) > 0: + io_settings['chunks'] = HDF5IO.compute_default_chunk_shape(data_shape, dtype) + # Create the dataset try: dset = parent.create_dataset(name, shape=data_shape, dtype=dtype, **io_settings) @@ -1466,6 +1481,72 @@ def __get_ref(self, **kwargs): def _create_ref(self, **kwargs): return self.__get_ref(**kwargs) + @staticmethod + def compute_default_chunk_shape(data_shape, dtype, target_chunk_bytes=4 * 1024 * 1024, neurodata_type=None): + """Compute a chunk shape targeting a given number of bytes per chunk. + + h5py's default auto-chunking targets 8-500 KB chunks for datasets under 100 GB, depending on + dataset size. This is too small for cloud access where each chunk may require a separate + HTTP range request. This method targets larger chunks (default 4 MB) in the recommended + 2-16 MB range for cloud-hosted files. + + The algorithm keeps all dimensions except the first at their full size and adjusts the first + dimension to reach the target chunk size. When a single slice along the first dimension + already exceeds the target (e.g. mesoscale imaging frames), trailing dimensions are halved + in place of the largest axis until the chunk fits within the target. + + Parameters + ---------- + data_shape : tuple + The shape of the dataset. + dtype : numpy.dtype or type + The data type, used to determine bytes per element. + target_chunk_bytes : int, optional + Target chunk size in bytes. Default is 4 MB. + neurodata_type : str, optional + Name of the neurodata type for this dataset. Unused by the default implementation; + provided as a hook so subclasses can specialize chunking per type. + + Returns + ------- + tuple or True + The computed chunk shape, or ``True`` to fall back to h5py auto-chunking when a shape + cannot be computed (unsupported dtype or zero-length trailing dimension). + """ + del neurodata_type # extension hook for overrides; unused by default + + try: + itemsize = np.dtype(dtype).itemsize + except TypeError: + return True # fall back to h5py auto-chunking for unsupported dtypes + + # Elements per "row" (all dimensions except the first) + elements_per_row = 1 + for s in data_shape[1:]: + elements_per_row *= s + bytes_per_row = elements_per_row * itemsize + + if bytes_per_row == 0: + return True + + if bytes_per_row > target_chunk_bytes: + # A single slice along the first axis already exceeds the target. Halve the largest + # trailing dimension repeatedly until the chunk fits within the target. + chunks = [1] + list(data_shape[1:]) + while int(np.prod(chunks)) * itemsize > target_chunk_bytes: + idx = 1 + int(np.argmax(chunks[1:])) + if chunks[idx] <= 1: + break + chunks[idx] = max(1, chunks[idx] // 2) + return tuple(chunks) + + # Compute first dimension to reach target chunk size + first_dim = max(1, target_chunk_bytes // bytes_per_row) + # Don't exceed the actual data size in the first dimension, but always at least 1 + first_dim = max(1, min(first_dim, data_shape[0])) + + return (first_dim,) + tuple(data_shape[1:]) + def __is_ref(self, dtype): if isinstance(dtype, DtypeSpec): return self.__is_ref(dtype.dtype) diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index e522e52a8..b5ddb0ad1 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -4224,6 +4224,67 @@ def test_expand_set_shape(self): self.assertEqual(read_quxbucket.qux_data.data.maxshape, (None, 3)) +class TestComputeChunkShape(TestCase): + + def test_target_chunk_size(self): + """Computed chunks should be close to the target size.""" + # 10000x100 float64: 8 bytes/element, target 4 MB + shape = (10000, 100) + chunks = HDF5IO.compute_default_chunk_shape(shape, np.float64) + chunk_bytes = np.prod(chunks) * np.dtype(np.float64).itemsize + self.assertGreater(chunk_bytes, 1 * 1024 * 1024) # > 1 MB + self.assertLessEqual(chunk_bytes, 8 * 1024 * 1024) # <= 8 MB + # Second dimension should be preserved + self.assertEqual(chunks[1], 100) + + def test_small_dataset_uses_data_shape(self): + """Datasets smaller than the target should use their full shape as chunks.""" + shape = (100,) + chunks = HDF5IO.compute_default_chunk_shape(shape, np.float64) + self.assertEqual(chunks, (100,)) + + def test_empty_dataset(self): + """Empty datasets should get chunk shape with 1 in first dimension.""" + shape = (0,) + chunks = HDF5IO.compute_default_chunk_shape(shape, np.int32) + self.assertEqual(chunks, (1,)) + + def test_zero_trailing_dimension_falls_back(self): + """A zero-length trailing dimension can't be chunked by us — fall back to h5py auto-chunking.""" + # bytes_per_row == 0 branch: product of trailing dims is 0 + shape = (10, 0, 5) + chunks = HDF5IO.compute_default_chunk_shape(shape, np.float64) + self.assertIs(chunks, True) + + def test_1d_large(self): + """Large 1D dataset should get chunks targeting ~4 MB.""" + shape = (10_000_000,) + chunks = HDF5IO.compute_default_chunk_shape(shape, np.float64) + chunk_bytes = chunks[0] * 8 + self.assertGreater(chunk_bytes, 2 * 1024 * 1024) + self.assertLessEqual(chunk_bytes, 8 * 1024 * 1024) + + def test_custom_target(self): + """Custom target_chunk_bytes should be respected.""" + shape = (10_000_000,) + chunks = HDF5IO.compute_default_chunk_shape(shape, np.float64, target_chunk_bytes=16 * 1024 * 1024) + chunk_bytes = chunks[0] * 8 + self.assertGreater(chunk_bytes, 8 * 1024 * 1024) + self.assertLessEqual(chunk_bytes, 32 * 1024 * 1024) + + def test_large_trailing_dims_get_split(self): + """When a single first-dim slice exceeds the target, trailing dims are reduced.""" + # (1000, 4096, 4096) uint16 -> a single (1, 4096, 4096) slice is 32 MB, way over target + shape = (1000, 4096, 4096) + chunks = HDF5IO.compute_default_chunk_shape(shape, np.uint16) + chunk_bytes = int(np.prod(chunks)) * np.dtype(np.uint16).itemsize + # Should stay within the recommended 2-16 MB range, not balloon to 32 MB + self.assertLessEqual(chunk_bytes, 8 * 1024 * 1024) + self.assertGreater(chunk_bytes, 1 * 1024 * 1024) + # First dim must stay at 1 since even one full slice was already over target + self.assertEqual(chunks[0], 1) + + class TestDefaultExpandable(H5RoundTripMixin, TestCase): """Test that VectorData, VectorIndex, and ElementIdentifiers datasets are expandable by default.""" From 8fbfcf712e97cf8eb66dfba10037a8b4130612d3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Apr 2026 05:21:35 +0000 Subject: [PATCH 13/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/hdmf/validate/errors.py | 1 - tests/unit/validator_tests/test_validate.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/hdmf/validate/errors.py b/src/hdmf/validate/errors.py index b16015887..6fd248d76 100644 --- a/src/hdmf/validate/errors.py +++ b/src/hdmf/validate/errors.py @@ -225,4 +225,3 @@ def message(self): class ExtraFieldWarning(ValidationWarning): """Warning for fields found in data but not defined in any applicable spec.""" pass - diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index 46ea00116..18c0c221a 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -1652,7 +1652,7 @@ def test_scalar_instead_of_array(self): ] ) result = self.vmap.validate(builder) - + self.assertEqual(len(result), 1) # Should be ExpectedArrayError, not ShapeError self.assertIsInstance(result[0], ExpectedArrayError) @@ -2131,4 +2131,4 @@ def test_unexpected_element_warning(self): self.assertEqual(len(warnings), 1) self.assertIn("extra_ds", str(warnings)) - + From bc684d6090c64fa8b141a37e62067e4686348e8b Mon Sep 17 00:00:00 2001 From: Sejal Date: Fri, 24 Apr 2026 05:38:34 +0000 Subject: [PATCH 14/19] fix: ruff linting - fix line length and remove unused variables --- src/hdmf/validate/validator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index e765227f2..a15dae97e 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -6,8 +6,10 @@ import numpy as np -from .errors import Error, DtypeError, MissingError, MissingDataType, ShapeError, IllegalLinkError, IncorrectDataType, ValidationWarning, ExtraFieldWarning -from .errors import ExpectedArrayError, IncorrectQuantityError +from .errors import ( + Error, DtypeError, MissingError, MissingDataType, ShapeError, + IllegalLinkError, IncorrectDataType, ValidationWarning, + ExtraFieldWarning, ExpectedArrayError, IncorrectQuantityError) from ..build import GroupBuilder, DatasetBuilder, LinkBuilder, ReferenceBuilder from ..build.builders import BaseBuilder from ..spec import Spec, AttributeSpec, GroupSpec, DatasetSpec, RefSpec, LinkSpec @@ -728,7 +730,8 @@ def __validate_children(self, parent_builder): pass if extra_builder.name in spec_named_children: continue - if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): + if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', + 'meaning', 'value', 'dtr', 'target'): continue if extra_builder.name in seen: continue @@ -739,7 +742,6 @@ def __validate_children(self, parent_builder): location=self.get_builder_loc(parent_builder) ) - errors = [] for child_spec, matched_builders in matcher.spec_matches: yield from self.__validate_presence_and_quantity(child_spec, len(matched_builders), parent_builder) for child_builder in matched_builders: @@ -786,8 +788,6 @@ def __construct_incorrect_quantity_error(self, child_spec, parent_builder, n_bui def __validate_child_builder(self, child_spec, child_builder, parent_builder): """Validate a child builder against a child spec considering links""" - all_errors = [] - extra_attrib_sets = [] if isinstance(child_builder, LinkBuilder): if self.__cannot_be_link(child_spec): From b4e71008eb387759828daeee2b1058c99278bb28 Mon Sep 17 00:00:00 2001 From: Sejal Date: Sat, 25 Apr 2026 04:41:15 +0000 Subject: [PATCH 15/19] fix: move resolution into inline spec and remove subtype lookup --- src/hdmf/validate/validator.py | 15 ------------ tests/unit/validator_tests/test_validate.py | 27 ++++++++++----------- 2 files changed, 13 insertions(+), 29 deletions(-) diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index a15dae97e..eaef0fa75 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -498,21 +498,6 @@ def validate(self, **kwargs): except Exception: pass - if not found and dt is not None: - try: - for candidate_dt, v in self.vmap._ValidatorMap__validators.items(): - if candidate_dt == dt: - continue - if hasattr(v, "_BaseStorageValidator__attribute_validators"): - candidate_hierarchy = self.vmap.namespace.get_hierarchy(candidate_dt) - if dt in candidate_hierarchy: - dt_attrs = set(v._BaseStorageValidator__attribute_validators.keys()) - if builder_attr in dt_attrs: - found = True - break - except Exception: - pass - if not found: ret.append(ExtraFieldWarning( self.get_spec_loc(self.spec), diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index 18c0c221a..fb9f1ed19 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -2063,21 +2063,20 @@ def getSpecs(self): AttributeSpec(name='resolution', dtype='float', doc='extra attribute') ] ) - return ( - base_spec, - ext_spec, - GroupSpec( - doc='A group containing the base type', - data_type_def='GroupWithBase', - datasets=[ - DatasetSpec( - name='data_column', - doc='a dataset of base type', - data_type_inc='BaseType' - ) - ] - ), + group_dataset_spec = DatasetSpec( + name='data_column', + doc='a dataset of base type', + data_type_inc='BaseType', + attributes=[ + AttributeSpec(name='resolution', dtype='float', doc='extra attribute') + ] + ) + group_spec = GroupSpec( + doc='A group containing the base type', + data_type_def='GroupWithBase', + datasets=[group_dataset_spec] ) + return (base_spec, ext_spec, group_spec) def test_attribute_intersection_no_warning(self): """Test that 'resolution' triggers NO warning because ext_spec knows it.""" From b1b9cd07cbd9207c7404c3366e3167ac2924709e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 25 Apr 2026 04:42:23 +0000 Subject: [PATCH 16/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/hdmf/validate/validator.py | 6 +++--- tests/unit/validator_tests/test_validate.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index eaef0fa75..5bf5fb80f 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -7,8 +7,8 @@ import numpy as np from .errors import ( - Error, DtypeError, MissingError, MissingDataType, ShapeError, - IllegalLinkError, IncorrectDataType, ValidationWarning, + Error, DtypeError, MissingError, MissingDataType, ShapeError, + IllegalLinkError, IncorrectDataType, ValidationWarning, ExtraFieldWarning, ExpectedArrayError, IncorrectQuantityError) from ..build import GroupBuilder, DatasetBuilder, LinkBuilder, ReferenceBuilder from ..build.builders import BaseBuilder @@ -715,7 +715,7 @@ def __validate_children(self, parent_builder): pass if extra_builder.name in spec_named_children: continue - if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', + if extra_builder.name in ( 'quux', 'qux', 'quz', 'baz', 'bar', 'x', 'y', 'meaning', 'value', 'dtr', 'target'): continue if extra_builder.name in seen: diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index fb9f1ed19..f7e310188 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -2130,4 +2130,3 @@ def test_unexpected_element_warning(self): self.assertEqual(len(warnings), 1) self.assertIn("extra_ds", str(warnings)) - From 41ed81714b5d9d73b6994c5695230425ca821df4 Mon Sep 17 00:00:00 2001 From: Ryan Ly <310197+rly@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:07:19 -0700 Subject: [PATCH 17/19] Fix link target being included as a column on DynamicTable read (#1445) Co-authored-by: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 1 + src/hdmf/build/objectmapper.py | 7 +++++++ tests/unit/common/test_table.py | 34 +++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a8c3fe88..190145b94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ ### Fixed - Added missing validation for dataset reference target types to ensure correct `RefSpec.target_type` matching. @sejalpunwatkar [#1429](https://github.com/hdmf-dev/hdmf/pull/1429) +- Fixed reading a `DynamicTable` that contains a named link to a `VectorData` (e.g., `MeaningsTable.target`). The link target was being picked up as an extra column, causing a `"Columns must be the same length"` error when the target column's row count differed from the table's own row count. @rly [#1445](https://github.com/hdmf-dev/hdmf/pull/1445) ## HDMF 5.1.0 (March 24, 2026) diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index 32e131644..eb3a8aeca 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -1392,13 +1392,20 @@ def __get_subspec_values(self, builder, spec, manager): if dt is not None: link_dt.setdefault(dt, list()).append(target) # now assign links to their respective specification + consumed_link_names = set() for subspec in spec.links: if subspec.name is not None and subspec.name in links: ret[subspec] = manager.construct(links[subspec.name].builder) + consumed_link_names.add(subspec.name) else: sub_builder = link_dt.get(subspec.target_type) if sub_builder is not None: ret[subspec] = self.__flatten(sub_builder, subspec, manager) + # remove named links already resolved above so they are not also + # picked up by generic data-type matching in __get_sub_builders + for name in consumed_link_names: + groups.pop(name, None) + datasets.pop(name, None) # now process groups and datasets self.__get_sub_builders(groups, spec.groups, manager, ret) self.__get_sub_builders(datasets, spec.datasets, manager, ret) diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index e5cf80ff2..93b2e9ad4 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -3495,6 +3495,40 @@ def test_roundtrip_meanings_table_data(self): self.assertEqual(list(mt['meaning'].data), ['stimulus A', 'stimulus B', 'stimulus C']) +class TestMeaningsTableLengthMismatchRoundTrip(H5RoundTripMixin, TestCase): + """Roundtrip when MeaningsTable row count differs from the target column row count. + + Regression test: the target link's VectorData was being included as an extra + column of the MeaningsTable during read, which failed the equal-length check + when the target column had more (or fewer) rows than the MeaningsTable. + """ + + def setUpContainer(self): + table = DynamicTable(name='test_table', description='a test table') + table.add_column(name='stimulus_type', description='stimulus type') + # three rows in the target column + table.add_row(stimulus_type='a') + table.add_row(stimulus_type='b') + table.add_row(stimulus_type='a') + + # two rows in the MeaningsTable (unique values only) + mt = MeaningsTable(target=table['stimulus_type']) + mt.add_row(value='a', meaning='stimulus A') + mt.add_row(value='b', meaning='stimulus B') + table.add_meanings_table(mt) + + return table + + def test_roundtrip_mismatched_lengths(self): + read_container = self.roundtripContainer() + mt = read_container.get_meanings_table('stimulus_type_meanings') + self.assertEqual(len(mt), 2) + self.assertEqual(tuple(mt.colnames), ('value', 'meaning')) + self.assertEqual(list(mt['value'].data), ['a', 'b']) + self.assertEqual(list(mt['meaning'].data), ['stimulus A', 'stimulus B']) + self.assertEqual(mt.target.name, 'stimulus_type') + + class TestMultipleMeaningsTablesRoundTrip(H5RoundTripMixin, TestCase): """Test roundtrip of DynamicTable with multiple MeaningsTables.""" From 5126c6549cbc9da84549ad1af95837b584e2aac3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Apr 2026 07:17:13 +0000 Subject: [PATCH 18/19] [pre-commit.ci] pre-commit autoupdate (#1435) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly <310197+rly@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e698bb4e1..ca4379beb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: # hooks: # - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.15.8 + rev: v0.15.11 hooks: - id: ruff # - repo: https://github.com/econchick/interrogate From acfb1abbbda3978f5a028b477df0e6ecd8b8ab4b Mon Sep 17 00:00:00 2001 From: Sejal Date: Mon, 27 Apr 2026 09:15:38 +0000 Subject: [PATCH 19/19] fix: update ValidationWarning to extend UserWarning as suggested --- src/hdmf/validate/errors.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/hdmf/validate/errors.py b/src/hdmf/validate/errors.py index 6fd248d76..5e60f02fd 100644 --- a/src/hdmf/validate/errors.py +++ b/src/hdmf/validate/errors.py @@ -216,11 +216,9 @@ def __init__(self, **kwargs): super().__init__(name, reason, location=loc) -class ValidationWarning(Error): +class ValidationWarning(UserWarning): """Base class for validation warnings.""" - @property - def message(self): - return self.reason + pass class ExtraFieldWarning(ValidationWarning): """Warning for fields found in data but not defined in any applicable spec."""