Skip to content

Commit c7c0a62

Browse files
#1616: User-friendly errors for missing/wrong Datasets tab and columns, standard validation, and library metadata lookup failures (#1626)
* #1616: User-friendly errors for missing/wrong Datasets tab and columns, standard validation, and library metadata lookup failures * Add USDM to StandardTypes enum * Do not raise LibraryMetadataNotFoundError for USDM when standard metadata is absent in cache * Remove datasets key reporting, inline Datasets guidance, add CT package validation (script_utils + run_single_rule_validation) * Improve Library/Datasets/CT validation errors; remove hint text, move _get_datasets_or_raise to script_utils, unify CT check after Define * Remove dataset wrapper, centralize CT check, add cached_worksheet to get_raw_dataset_metadata * cache & cdash * moved script out of exceptions * error text * test --------- Co-authored-by: Samuel Johnson <sfjohnson24@gmail.com> Co-authored-by: Samuel Johnson <96841389+SFJohnson24@users.noreply.github.com>
1 parent 7c820c6 commit c7c0a62

12 files changed

Lines changed: 512 additions & 82 deletions

File tree

TestRule/__init__.py

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@
66
from cdisc_rules_engine.services.cdisc_library_service import CDISCLibraryService
77
from cdisc_rules_engine.services.cache.cache_populator_service import CachePopulator
88
from scripts.run_validation import run_single_rule_validation
9+
from cdisc_rules_engine.exceptions.custom_exceptions import (
10+
CTPackageNotFoundError,
11+
LibraryMetadataNotFoundError,
12+
)
13+
from scripts.script_utils import library_metadata_not_found_message
14+
from cdisc_library_client.custom_exceptions import (
15+
ResourceNotFoundException as LibraryResourceNotFoundException,
16+
)
917
import json
1018
import os
1119
import asyncio
@@ -17,11 +25,13 @@ class BadRequestError(Exception):
1725
pass
1826

1927

28+
_REQUIRED_DATASET_KEYS = {"filename", "label", "domain", "records", "variables"}
29+
30+
2031
def validate_datasets_payload(datasets):
21-
required_keys = {"filename", "label", "domain", "records", "variables"}
2232
missing_keys = set()
2333
for dataset in datasets:
24-
for key in required_keys:
34+
for key in _REQUIRED_DATASET_KEYS:
2535
if key not in dataset:
2636
missing_keys.add(key)
2737

@@ -32,19 +42,39 @@ def validate_datasets_payload(datasets):
3242
)
3343

3444
if missing_keys:
35-
raise KeyError(
36-
f"one or more datasets missing the following keys {missing_keys}"
37-
)
45+
raise BadRequestError("Test data is incorrect and missing required formatting.")
3846

3947

4048
def handle_exception(e: Exception):
41-
if isinstance(e, KeyError):
49+
if isinstance(e, BadRequestError):
50+
return func.HttpResponse(
51+
json.dumps({"error": "BadRequestError", "message": str(e)}),
52+
status_code=400,
53+
)
54+
if isinstance(e, LibraryMetadataNotFoundError):
55+
msg = getattr(e, "message", None) or getattr(e, "description", None) or str(e)
56+
return func.HttpResponse(
57+
json.dumps(
58+
{
59+
"error": "LibraryMetadataNotFoundError",
60+
"message": msg,
61+
}
62+
),
63+
status_code=400,
64+
)
65+
if isinstance(e, CTPackageNotFoundError):
66+
msg = getattr(e, "message", None) or getattr(e, "description", None) or str(e)
4267
return func.HttpResponse(
43-
json.dumps({"error": "KeyError", "message": str(e)}), status_code=400
68+
json.dumps({"error": "CTPackageNotFoundError", "message": msg}),
69+
status_code=400,
4470
)
45-
elif isinstance(e, BadRequestError):
71+
if isinstance(e, KeyError):
72+
msg = str(e)
73+
if "rule" in msg.lower() or "datasets" in msg.lower():
74+
msg = f"{msg} Ensure the request body includes the required JSON keys."
4675
return func.HttpResponse(
47-
json.dumps({"error": "BadRequestError", "message": str(e)}), status_code=400
76+
json.dumps({"error": "BadRequestError", "message": msg}),
77+
status_code=400,
4878
)
4979
else:
5080
return func.HttpResponse(
@@ -97,12 +127,25 @@ def main(req: func.HttpRequest, context: func.Context) -> func.HttpResponse: #
97127
asyncio.run(cache_populator.load_available_ct_packages())
98128
if standards_data or codelists:
99129
if standards_data:
100-
asyncio.run(
101-
cache_populator.load_standard(
102-
standard, standard_version, standard_substandard
130+
try:
131+
asyncio.run(
132+
cache_populator.load_standard(
133+
standard, standard_version, standard_substandard
134+
)
135+
)
136+
except LibraryResourceNotFoundException:
137+
raise LibraryMetadataNotFoundError(
138+
library_metadata_not_found_message(
139+
standard, standard_version, standard_substandard
140+
)
103141
)
142+
try:
143+
asyncio.run(cache_populator.load_codelists(codelists or []))
144+
except LibraryResourceNotFoundException:
145+
raise CTPackageNotFoundError(
146+
"Controlled terminology package(s) not found: "
147+
f"{', '.join(str(c) for c in (codelists or []))}."
104148
)
105-
asyncio.run(cache_populator.load_codelists(codelists))
106149
if not rule:
107150
raise KeyError("'rule' required in request")
108151
datasets = json_data.get("datasets")
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from cdisc_rules_engine.enums.base_enum import BaseEnum
2+
3+
4+
class ExcelDataSheets(BaseEnum):
5+
DATASETS_SHEET_NAME = "Datasets"
6+
DATASET_FILENAME_COLUMN = "Filename"
7+
DATASET_LABEL_COLUMN = "Label"
8+
DATASETS_SHEET_REQUIRED_COLUMNS = ("Filename", "Label")
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from cdisc_rules_engine.enums.base_enum import BaseEnum
2+
3+
4+
class StandardTypes(BaseEnum):
5+
"""Standards supported by CDISC Library; used for CLI validation when not using --custom-standard."""
6+
7+
SDTMIG = "sdtmig"
8+
SENDIG = "sendig"
9+
ADAM = "adam"
10+
TIG = "tig"
11+
USDM = "usdm"

cdisc_rules_engine/exceptions/custom_exceptions.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,13 @@ class VariableMetadataNotFoundError(EngineError):
4040
)
4141

4242

43+
class LibraryMetadataNotFoundError(EngineError):
44+
code = 400
45+
description = (
46+
"Library metadata not found for the provided standard and version combination."
47+
)
48+
49+
4350
class DomainNotFoundError(EngineError):
4451
"""Raised when a required domain is not found in the dataset"""
4552

@@ -62,6 +69,19 @@ class InvalidJSONFormat(EngineError):
6269
description = "JSON data is malformed."
6370

6471

72+
class ExcelTestDataError(EngineError):
73+
code = 400
74+
description = (
75+
"Excel test data file is missing required sheets or column headers. "
76+
"Sheet and column names are case-sensitive."
77+
)
78+
79+
80+
class CTPackageNotFoundError(EngineError):
81+
code = 400
82+
description = "Controlled terminology package(s) not found"
83+
84+
6585
class NumberOfAttemptsExceeded(EngineError):
6686
pass
6787

cdisc_rules_engine/services/data_services/excel_data_service.py

Lines changed: 58 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
from io import IOBase
3+
import functools
34
from typing import List, Sequence
45
from datetime import datetime
56
import re
@@ -14,16 +15,14 @@
1415
from cdisc_rules_engine.models.variable_metadata_container import (
1516
VariableMetadataContainer,
1617
)
17-
from cdisc_rules_engine.services import logger
18+
from cdisc_rules_engine.exceptions.custom_exceptions import ExcelTestDataError
1819
from cdisc_rules_engine.services.data_readers.data_reader_factory import (
1920
DataReaderFactory,
2021
)
2122
from .base_data_service import BaseDataService, cached_dataset
22-
23-
DATASETS_SHEET_NAME = "Datasets"
24-
DATASET_FILENAME_COLUMN = "Filename"
25-
DATASET_LABEL_COLUMN = "Label"
26-
DATASET_NAME_COLUMN = "Dataset Name"
23+
from cdisc_rules_engine.enums.excel_test_sheets import (
24+
ExcelDataSheets,
25+
)
2726

2827

2928
class ExcelDataService(BaseDataService):
@@ -112,34 +111,43 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface:
112111
def _get_dataset_name(
113112
self, metadata: pd.DataFrame, first_record: dict, dataset_filename: str
114113
) -> str:
115-
if DATASET_NAME_COLUMN in metadata.columns and not metadata.empty:
116-
return metadata[DATASET_NAME_COLUMN].iloc[0]
117114
if self.standard == "usdm":
118115
return first_record.get("instanceType", dataset_filename.split(".")[0])
119116
return dataset_filename.split(".")[0].upper()
120117

118+
@functools.lru_cache(maxsize=None)
119+
def _get_datasets_worksheet(self) -> pd.DataFrame:
120+
return pd.read_excel(
121+
self.dataset_path,
122+
sheet_name=ExcelDataSheets.DATASETS_SHEET_NAME.value,
123+
na_values=[""],
124+
keep_default_na=False,
125+
)
126+
121127
@cached_dataset(DatasetTypes.RAW_METADATA.value)
122128
def get_raw_dataset_metadata(
123-
self, dataset_name: str, **kwargs
129+
self,
130+
dataset_name: str,
131+
**kwargs,
124132
) -> SDTMDatasetMetadata:
125133
"""
126134
Returns dataset metadata as DatasetMetadata instance.
127135
"""
128-
datasets_worksheet = pd.read_excel(
129-
self.dataset_path,
130-
sheet_name=DATASETS_SHEET_NAME,
131-
na_values=[""],
132-
keep_default_na=False,
133-
)
136+
datasets_worksheet = self._get_datasets_worksheet()
134137
metadata = datasets_worksheet[
135-
datasets_worksheet[DATASET_FILENAME_COLUMN] == dataset_name
138+
datasets_worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value]
139+
== dataset_name
136140
]
137141
dataset = self.get_dataset(dataset_name=dataset_name)
138142
first_record = dataset.data.iloc[0].to_dict() if not dataset.empty else {}
139143
return SDTMDatasetMetadata(
140144
name=self._get_dataset_name(metadata, first_record, dataset_name),
141145
first_record=first_record,
142-
label=metadata[DATASET_LABEL_COLUMN].iloc[0] if not metadata.empty else "",
146+
label=(
147+
metadata[ExcelDataSheets.DATASET_LABEL_COLUMN.value].iloc[0]
148+
if not metadata.empty
149+
else ""
150+
),
143151
modification_date=datetime.fromtimestamp(
144152
os.path.getmtime(self.dataset_path)
145153
).isoformat(),
@@ -199,23 +207,41 @@ def read_data(self, file_path: str) -> IOBase:
199207

200208
def get_datasets(self) -> List[dict]:
201209
try:
202-
worksheet = pd.read_excel(
203-
self.dataset_path,
204-
sheet_name=DATASETS_SHEET_NAME,
205-
na_values=[""],
206-
keep_default_na=False,
207-
)
208-
except TypeError as e:
209-
logger.error(
210-
f"Failed to read datasets from the Excel file at {self.dataset_path}. "
211-
f"Ensure the file is in the correct format. "
212-
f"Try opening and saving the file in Microsoft Excel. "
213-
f"Error: {str(e)}"
214-
)
210+
with pd.ExcelFile(self.dataset_path) as xl:
211+
sheet_names = xl.sheet_names
212+
if ExcelDataSheets.DATASETS_SHEET_NAME.value not in sheet_names:
213+
available = ", ".join(repr(s) for s in sheet_names) or "(none)"
214+
raise ExcelTestDataError(
215+
f"The workbook does not contain a '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet. "
216+
f"Submitted sheet names: {available}."
217+
)
218+
worksheet = xl.parse(
219+
ExcelDataSheets.DATASETS_SHEET_NAME.value,
220+
na_values=[""],
221+
keep_default_na=False,
222+
)
223+
except ExcelTestDataError:
215224
raise
225+
except Exception as e:
226+
raise ExcelTestDataError(
227+
f"Cannot read the Excel file. Ensure it is a valid .xlsx workbook. "
228+
f"Details: {e}"
229+
) from e
230+
231+
missing_cols = sorted(
232+
set(ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value)
233+
- set(worksheet.columns)
234+
)
235+
if missing_cols:
236+
raise ExcelTestDataError(
237+
f"The '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet is missing a "
238+
f"required {ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value} column(s): "
239+
f"{missing_cols}. Column headers are case-sensitive. "
240+
)
241+
216242
datasets = [
217-
self.get_raw_dataset_metadata(dataset_name=dataset_filename)
218-
for dataset_filename in worksheet[DATASET_FILENAME_COLUMN]
243+
self.get_raw_dataset_metadata(dataset_name=fn)
244+
for fn in worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value]
219245
]
220246
return datasets
221247

cdisc_rules_engine/services/data_services/local_data_service.py

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@
2424
convert_file_size,
2525
extract_file_name_from_path_string,
2626
)
27+
from cdisc_rules_engine.exceptions.custom_exceptions import InvalidDatasetFormat
2728
from .base_data_service import BaseDataService, cached_dataset
2829
from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes
2930
from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface
3031
from cdisc_rules_engine.models.dataset import PandasDataset
31-
from cdisc_rules_engine.services import logger
3232
import re
3333

3434

@@ -244,28 +244,12 @@ def get_datasets(self) -> List[dict]:
244244
dataset_name=dataset_path
245245
)
246246
datasets.append(dataset_metadata)
247+
except InvalidDatasetFormat:
248+
raise
247249
except Exception as e:
248-
logger.error(
249-
f"Failed to read metadata for dataset {dataset_path}. "
250-
f"Error: {type(e).__name__}: {e}. Skipping this dataset."
251-
)
252-
file_name = extract_file_name_from_path_string(dataset_path)
253-
datasets.append(
254-
SDTMDatasetMetadata(
255-
name=(
256-
file_name.split(".")[0].upper()
257-
if "." in file_name
258-
else file_name.upper()
259-
),
260-
first_record={},
261-
label="",
262-
modification_date="",
263-
filename=file_name,
264-
full_path=dataset_path,
265-
file_size=0,
266-
record_count=0,
267-
)
268-
)
250+
raise InvalidDatasetFormat(
251+
f"Your data file could not be read: {dataset_path}."
252+
) from e
269253
return datasets
270254

271255
@staticmethod

core.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from cdisc_rules_engine.enums.default_file_paths import DefaultFilePaths
2424
from cdisc_rules_engine.enums.progress_parameter_options import ProgressParameterOptions
2525
from cdisc_rules_engine.enums.report_types import ReportTypes
26+
from cdisc_rules_engine.enums.standard_types import StandardTypes
2627
from cdisc_rules_engine.models.external_dictionaries_container import (
2728
DictionaryTypes,
2829
ExternalDictionariesContainer,
@@ -478,6 +479,15 @@ def validate( # noqa
478479

479480
if not custom_standard:
480481
standard = standard.lower()
482+
supported_standards = StandardTypes.values()
483+
if standard not in supported_standards:
484+
supported_list = ", ".join(sorted(supported_standards))
485+
logger.error(
486+
f"Standard '{standard}' is not a supported standard. "
487+
f"Supported standards: {supported_list}. "
488+
f"Use --custom-standard flag for custom standards."
489+
)
490+
ctx.exit(2)
481491

482492
if raw_report is True:
483493
if not (len(output_format) == 1 and output_format[0] == ReportTypes.JSON.value):

0 commit comments

Comments
 (0)