From 133ffbcbd3d195ba47173532b33dcc3985eb33a8 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Thu, 14 May 2026 16:46:45 +0200 Subject: [PATCH 1/2] fixed issue reporting for DDF00125 --- .../json_schema_check_dataset_builder.py | 47 ++++++- cdisc_rules_engine/rules_engine.py | 4 +- .../test_json_schema_check_dataset_builder.py | 124 ++++++++++++++++++ 3 files changed, 172 insertions(+), 3 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py index 6838fb610..9c8668479 100644 --- a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py @@ -83,6 +83,37 @@ def get_instance_by_path(self, instance: dict, path_list: list) -> dict: def get_parent_path(self, path_list: list): return list(path_list)[0 : (-1 - int(isinstance(path_list[-1], int)))] + def _get_ancestor_instance_type(self, path_list: list) -> str: + """Walk up the path to find the nearest ancestor's instanceType.""" + current_path = list(path_list) + while current_path: + current_path = current_path[:-1] + try: + ancestor = self.get_instance_by_path( + self.data_service.json, current_path + ) + if isinstance(ancestor, dict) and "instanceType" in ancestor: + return ancestor["instanceType"] + except (KeyError, IndexError, TypeError): + pass + return "" + + def _get_schema_class_name(self, error: exceptions.ValidationError) -> str: + """Extract class name from error's schema title or $ref when instanceType is unavailable. + + Only uses the schema ``title`` field or the last segment of a ``$ref`` — + avoids inspecting ``absolute_schema_path`` which can return JSON-Schema + keywords (e.g. ``"type"``, ``"items"``) as false positives. + """ + schema = error.schema if error.schema else {} + # Prefer explicit title (e.g. "AliasCode", "StudyVersion") + if title := schema.get("title", ""): + return title + # Fall back to the class name embedded in a $ref + if ref := schema.get("$ref", ""): + return ref.split("/")[-1] + return "" + def parse_error( self, error: exceptions.ValidationError, @@ -95,8 +126,9 @@ def parse_error( if error.validator in ["required", "additionalProperties"] else ( "{}[{}]".format(error.absolute_path[-2], error.absolute_path[-1]) - if isinstance(error.absolute_path[-1], int) - else error.absolute_path[-1] + if len(error.absolute_path) >= 2 + and isinstance(error.absolute_path[-1], int) + else (error.absolute_path[-1] if error.absolute_path else "") ) ) errlist["json_path"].append(error.json_path) @@ -110,6 +142,17 @@ def parse_error( and str(error.instance) in error.message else error.message ) + if errctx: + instance_type = ( + errctx.get("instanceType") + or self._get_schema_class_name(error) + or self._get_ancestor_instance_type(errpath) + ) + else: + instance_type = self._get_schema_class_name( + error + ) or self._get_ancestor_instance_type(errpath) + errlist["dataset"].append(instance_type) errlist["dataset"].append(errctx.get("instanceType", "") if errctx else "") errlist["id"].append(errctx.get("id", "") if errctx else "") errlist["_path"].append("/" + "/".join(map(str, errpath))) diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 05bba3b91..d4edf3df1 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -149,7 +149,9 @@ def validate_single_rule(self, rule: dict): ) break if dataset_metadata.unsplit_name in results and "domains" in rule: - include_split = rule["domains"].get("include_split_datasets", False) + include_split = (rule["domains"] or {}).get( + "include_split_datasets", False + ) if not include_split: continue # handling split datasets dataset_results = self.validate_single_dataset( diff --git a/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py b/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py index dee39886f..7136324a6 100644 --- a/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py @@ -172,3 +172,127 @@ def test_json_schema_check_dataset_builder_invalid(): # Verify that the cache_service.add method was called cache_service.add.assert_called_once() + + +def test_json_schema_missing_instance_type_is_reported(): + """ + When an entity in a discriminated anyOf union is missing its instanceType, + the error must be captured and reported rather than silently dropped + (regression for GitHub issue #1546). + + The dataset assignment fallback chain is: + 1. instanceType from data → 2. schema title/ref → 3. nearest ancestor + DDF00125 checks for validator == "required" | "additionalProperties", so the + inner `required` context error (not the top-level anyOf) must be surfaced with + a meaningful dataset assignment. + """ + schema = { + "type": "object", + "properties": { + "study": {"$ref": "#/$defs/Study"}, + }, + "$defs": { + "Study": { + "title": "Study", + "type": "object", + "properties": { + "instanceType": {"const": "Study"}, + "id": {"type": "string"}, + "items": { + "type": "array", + "items": { + "anyOf": [ + {"$ref": "#/$defs/TypeA"}, + {"$ref": "#/$defs/TypeB"}, + ] + }, + }, + }, + "required": ["instanceType", "id"], + }, + "TypeA": { + # title is intentionally present so the dataset assignment falls + # back to "TypeA" (schema title) rather than "Study" (ancestor). + "title": "TypeA", + "type": "object", + "properties": { + "instanceType": {"const": "TypeA"}, + "value": {"type": "string"}, + }, + "required": ["instanceType", "value"], + }, + "TypeB": { + "title": "TypeB", + "type": "object", + "properties": { + "instanceType": {"const": "TypeB"}, + "data": {"type": "integer"}, + }, + "required": ["instanceType", "data"], + }, + }, + } + + # instanceType is intentionally missing from the nested item + instance = { + "study": { + "instanceType": "Study", + "id": "study-1", + "items": [ + {"value": "some-value"}, # missing instanceType + ], + } + } + + data_service = MagicMock() + data_service.json = instance + data_service.dataset_implementation = PandasDataset + data_service.dataset_path = "dummy.json" + + cache_service = MagicMock() + cache_service.get.return_value = None + + # Filter to the entity that owns the problematic object. + # Because TypeA.title == "TypeA", the dataset column will be set to "TypeA" + # (schema title wins over ancestor "Study"). This is the correct value for + # DDF00125 to report the issue under the right entity. + dataset_metadata = MagicMock() + dataset_metadata.name = "TypeA" + + builder = JsonSchemaCheckDatasetBuilder( + rule={}, + data_service=data_service, + cache_service=cache_service, + rule_processor=MagicMock(), + data_processor=MagicMock(), + dataset_metadata=dataset_metadata, + define_xml_path=None, + standard="USDM", + standard_version="4.0", + standard_substandard=None, + library_metadata=LibraryMetadataContainer(standard_schema_definition=schema), + ) + + ds = builder.get_dataset() + rows = ds.data.to_dict(orient="records") + + # 1. Error must not be silently dropped + assert len(rows) >= 1, "Missing instanceType error must produce at least one row" + + # 2. dataset must be set to the schema class name ("TypeA"), not empty string. + # This ensures DDF00125 associates the error with the correct entity. + assert any(row.get("dataset") == "TypeA" for row in rows), ( + "Error with missing instanceType must be associated with the schema class " + "('TypeA' via schema title), not silently dropped with dataset=''" + ) + + # 3. The error must have validator == "required" so DDF00125's check condition + # (validator is_contained_by ['required', 'additionalProperties']) fires. + assert any( + row.get("validator") == "required" for row in rows + ), "The surfaced error must have validator='required' so DDF00125 can match it" + + # 4. The path should identify where in the document the problem is + assert any( + "/study/items/0" in (row.get("_path") or "") for row in rows + ), "Path should point to the entity missing instanceType" From efe39571468bd0400895f8c3e307a58f8a831827 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Fri, 15 May 2026 11:25:51 +0200 Subject: [PATCH 2/2] fix duplicated error --- .../dataset_builders/json_schema_check_dataset_builder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py index 9c8668479..7563e7ee5 100644 --- a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py @@ -153,7 +153,6 @@ def parse_error( error ) or self._get_ancestor_instance_type(errpath) errlist["dataset"].append(instance_type) - errlist["dataset"].append(errctx.get("instanceType", "") if errctx else "") errlist["id"].append(errctx.get("id", "") if errctx else "") errlist["_path"].append("/" + "/".join(map(str, errpath)))