Skip to content

Commit 32dc824

Browse files
1546: fixed issue reporting for DDF00125 (#1736)
* fixed issue reporting for DDF00125 * fix duplicated error
1 parent bc6ec1d commit 32dc824

3 files changed

Lines changed: 172 additions & 4 deletions

File tree

cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,37 @@ def get_instance_by_path(self, instance: dict, path_list: list) -> dict:
8383
def get_parent_path(self, path_list: list):
8484
return list(path_list)[0 : (-1 - int(isinstance(path_list[-1], int)))]
8585

86+
def _get_ancestor_instance_type(self, path_list: list) -> str:
87+
"""Walk up the path to find the nearest ancestor's instanceType."""
88+
current_path = list(path_list)
89+
while current_path:
90+
current_path = current_path[:-1]
91+
try:
92+
ancestor = self.get_instance_by_path(
93+
self.data_service.json, current_path
94+
)
95+
if isinstance(ancestor, dict) and "instanceType" in ancestor:
96+
return ancestor["instanceType"]
97+
except (KeyError, IndexError, TypeError):
98+
pass
99+
return ""
100+
101+
def _get_schema_class_name(self, error: exceptions.ValidationError) -> str:
102+
"""Extract class name from error's schema title or $ref when instanceType is unavailable.
103+
104+
Only uses the schema ``title`` field or the last segment of a ``$ref`` —
105+
avoids inspecting ``absolute_schema_path`` which can return JSON-Schema
106+
keywords (e.g. ``"type"``, ``"items"``) as false positives.
107+
"""
108+
schema = error.schema if error.schema else {}
109+
# Prefer explicit title (e.g. "AliasCode", "StudyVersion")
110+
if title := schema.get("title", ""):
111+
return title
112+
# Fall back to the class name embedded in a $ref
113+
if ref := schema.get("$ref", ""):
114+
return ref.split("/")[-1]
115+
return ""
116+
86117
def parse_error(
87118
self,
88119
error: exceptions.ValidationError,
@@ -95,8 +126,9 @@ def parse_error(
95126
if error.validator in ["required", "additionalProperties"]
96127
else (
97128
"{}[{}]".format(error.absolute_path[-2], error.absolute_path[-1])
98-
if isinstance(error.absolute_path[-1], int)
99-
else error.absolute_path[-1]
129+
if len(error.absolute_path) >= 2
130+
and isinstance(error.absolute_path[-1], int)
131+
else (error.absolute_path[-1] if error.absolute_path else "")
100132
)
101133
)
102134
errlist["json_path"].append(error.json_path)
@@ -110,7 +142,17 @@ def parse_error(
110142
and str(error.instance) in error.message
111143
else error.message
112144
)
113-
errlist["dataset"].append(errctx.get("instanceType", "") if errctx else "")
145+
if errctx:
146+
instance_type = (
147+
errctx.get("instanceType")
148+
or self._get_schema_class_name(error)
149+
or self._get_ancestor_instance_type(errpath)
150+
)
151+
else:
152+
instance_type = self._get_schema_class_name(
153+
error
154+
) or self._get_ancestor_instance_type(errpath)
155+
errlist["dataset"].append(instance_type)
114156
errlist["id"].append(errctx.get("id", "") if errctx else "")
115157
errlist["_path"].append("/" + "/".join(map(str, errpath)))
116158

cdisc_rules_engine/rules_engine.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,9 @@ def validate_single_rule(self, rule: dict):
149149
)
150150
break
151151
if dataset_metadata.unsplit_name in results and "domains" in rule:
152-
include_split = rule["domains"].get("include_split_datasets", False)
152+
include_split = (rule["domains"] or {}).get(
153+
"include_split_datasets", False
154+
)
153155
if not include_split:
154156
continue # handling split datasets
155157
dataset_results = self.validate_single_dataset(

tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,3 +172,127 @@ def test_json_schema_check_dataset_builder_invalid():
172172

173173
# Verify that the cache_service.add method was called
174174
cache_service.add.assert_called_once()
175+
176+
177+
def test_json_schema_missing_instance_type_is_reported():
178+
"""
179+
When an entity in a discriminated anyOf union is missing its instanceType,
180+
the error must be captured and reported rather than silently dropped
181+
(regression for GitHub issue #1546).
182+
183+
The dataset assignment fallback chain is:
184+
1. instanceType from data → 2. schema title/ref → 3. nearest ancestor
185+
DDF00125 checks for validator == "required" | "additionalProperties", so the
186+
inner `required` context error (not the top-level anyOf) must be surfaced with
187+
a meaningful dataset assignment.
188+
"""
189+
schema = {
190+
"type": "object",
191+
"properties": {
192+
"study": {"$ref": "#/$defs/Study"},
193+
},
194+
"$defs": {
195+
"Study": {
196+
"title": "Study",
197+
"type": "object",
198+
"properties": {
199+
"instanceType": {"const": "Study"},
200+
"id": {"type": "string"},
201+
"items": {
202+
"type": "array",
203+
"items": {
204+
"anyOf": [
205+
{"$ref": "#/$defs/TypeA"},
206+
{"$ref": "#/$defs/TypeB"},
207+
]
208+
},
209+
},
210+
},
211+
"required": ["instanceType", "id"],
212+
},
213+
"TypeA": {
214+
# title is intentionally present so the dataset assignment falls
215+
# back to "TypeA" (schema title) rather than "Study" (ancestor).
216+
"title": "TypeA",
217+
"type": "object",
218+
"properties": {
219+
"instanceType": {"const": "TypeA"},
220+
"value": {"type": "string"},
221+
},
222+
"required": ["instanceType", "value"],
223+
},
224+
"TypeB": {
225+
"title": "TypeB",
226+
"type": "object",
227+
"properties": {
228+
"instanceType": {"const": "TypeB"},
229+
"data": {"type": "integer"},
230+
},
231+
"required": ["instanceType", "data"],
232+
},
233+
},
234+
}
235+
236+
# instanceType is intentionally missing from the nested item
237+
instance = {
238+
"study": {
239+
"instanceType": "Study",
240+
"id": "study-1",
241+
"items": [
242+
{"value": "some-value"}, # missing instanceType
243+
],
244+
}
245+
}
246+
247+
data_service = MagicMock()
248+
data_service.json = instance
249+
data_service.dataset_implementation = PandasDataset
250+
data_service.dataset_path = "dummy.json"
251+
252+
cache_service = MagicMock()
253+
cache_service.get.return_value = None
254+
255+
# Filter to the entity that owns the problematic object.
256+
# Because TypeA.title == "TypeA", the dataset column will be set to "TypeA"
257+
# (schema title wins over ancestor "Study"). This is the correct value for
258+
# DDF00125 to report the issue under the right entity.
259+
dataset_metadata = MagicMock()
260+
dataset_metadata.name = "TypeA"
261+
262+
builder = JsonSchemaCheckDatasetBuilder(
263+
rule={},
264+
data_service=data_service,
265+
cache_service=cache_service,
266+
rule_processor=MagicMock(),
267+
data_processor=MagicMock(),
268+
dataset_metadata=dataset_metadata,
269+
define_xml_path=None,
270+
standard="USDM",
271+
standard_version="4.0",
272+
standard_substandard=None,
273+
library_metadata=LibraryMetadataContainer(standard_schema_definition=schema),
274+
)
275+
276+
ds = builder.get_dataset()
277+
rows = ds.data.to_dict(orient="records")
278+
279+
# 1. Error must not be silently dropped
280+
assert len(rows) >= 1, "Missing instanceType error must produce at least one row"
281+
282+
# 2. dataset must be set to the schema class name ("TypeA"), not empty string.
283+
# This ensures DDF00125 associates the error with the correct entity.
284+
assert any(row.get("dataset") == "TypeA" for row in rows), (
285+
"Error with missing instanceType must be associated with the schema class "
286+
"('TypeA' via schema title), not silently dropped with dataset=''"
287+
)
288+
289+
# 3. The error must have validator == "required" so DDF00125's check condition
290+
# (validator is_contained_by ['required', 'additionalProperties']) fires.
291+
assert any(
292+
row.get("validator") == "required" for row in rows
293+
), "The surfaced error must have validator='required' so DDF00125 can match it"
294+
295+
# 4. The path should identify where in the document the problem is
296+
assert any(
297+
"/study/items/0" in (row.get("_path") or "") for row in rows
298+
), "Path should point to the entity missing instanceType"

0 commit comments

Comments
 (0)