|
1 | 1 | import os |
2 | 2 | from io import IOBase |
| 3 | +import functools |
3 | 4 | from typing import List, Sequence |
4 | 5 | from datetime import datetime |
5 | 6 | import re |
|
14 | 15 | from cdisc_rules_engine.models.variable_metadata_container import ( |
15 | 16 | VariableMetadataContainer, |
16 | 17 | ) |
17 | | -from cdisc_rules_engine.services import logger |
| 18 | +from cdisc_rules_engine.exceptions.custom_exceptions import ExcelTestDataError |
18 | 19 | from cdisc_rules_engine.services.data_readers.data_reader_factory import ( |
19 | 20 | DataReaderFactory, |
20 | 21 | ) |
21 | 22 | from .base_data_service import BaseDataService, cached_dataset |
22 | | - |
23 | | -DATASETS_SHEET_NAME = "Datasets" |
24 | | -DATASET_FILENAME_COLUMN = "Filename" |
25 | | -DATASET_LABEL_COLUMN = "Label" |
26 | | -DATASET_NAME_COLUMN = "Dataset Name" |
| 23 | +from cdisc_rules_engine.enums.excel_test_sheets import ( |
| 24 | + ExcelDataSheets, |
| 25 | +) |
27 | 26 |
|
28 | 27 |
|
29 | 28 | class ExcelDataService(BaseDataService): |
@@ -112,34 +111,43 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: |
112 | 111 | def _get_dataset_name( |
113 | 112 | self, metadata: pd.DataFrame, first_record: dict, dataset_filename: str |
114 | 113 | ) -> str: |
115 | | - if DATASET_NAME_COLUMN in metadata.columns and not metadata.empty: |
116 | | - return metadata[DATASET_NAME_COLUMN].iloc[0] |
117 | 114 | if self.standard == "usdm": |
118 | 115 | return first_record.get("instanceType", dataset_filename.split(".")[0]) |
119 | 116 | return dataset_filename.split(".")[0].upper() |
120 | 117 |
|
| 118 | + @functools.lru_cache(maxsize=None) |
| 119 | + def _get_datasets_worksheet(self) -> pd.DataFrame: |
| 120 | + return pd.read_excel( |
| 121 | + self.dataset_path, |
| 122 | + sheet_name=ExcelDataSheets.DATASETS_SHEET_NAME.value, |
| 123 | + na_values=[""], |
| 124 | + keep_default_na=False, |
| 125 | + ) |
| 126 | + |
121 | 127 | @cached_dataset(DatasetTypes.RAW_METADATA.value) |
122 | 128 | def get_raw_dataset_metadata( |
123 | | - self, dataset_name: str, **kwargs |
| 129 | + self, |
| 130 | + dataset_name: str, |
| 131 | + **kwargs, |
124 | 132 | ) -> SDTMDatasetMetadata: |
125 | 133 | """ |
126 | 134 | Returns dataset metadata as DatasetMetadata instance. |
127 | 135 | """ |
128 | | - datasets_worksheet = pd.read_excel( |
129 | | - self.dataset_path, |
130 | | - sheet_name=DATASETS_SHEET_NAME, |
131 | | - na_values=[""], |
132 | | - keep_default_na=False, |
133 | | - ) |
| 136 | + datasets_worksheet = self._get_datasets_worksheet() |
134 | 137 | metadata = datasets_worksheet[ |
135 | | - datasets_worksheet[DATASET_FILENAME_COLUMN] == dataset_name |
| 138 | + datasets_worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value] |
| 139 | + == dataset_name |
136 | 140 | ] |
137 | 141 | dataset = self.get_dataset(dataset_name=dataset_name) |
138 | 142 | first_record = dataset.data.iloc[0].to_dict() if not dataset.empty else {} |
139 | 143 | return SDTMDatasetMetadata( |
140 | 144 | name=self._get_dataset_name(metadata, first_record, dataset_name), |
141 | 145 | first_record=first_record, |
142 | | - label=metadata[DATASET_LABEL_COLUMN].iloc[0] if not metadata.empty else "", |
| 146 | + label=( |
| 147 | + metadata[ExcelDataSheets.DATASET_LABEL_COLUMN.value].iloc[0] |
| 148 | + if not metadata.empty |
| 149 | + else "" |
| 150 | + ), |
143 | 151 | modification_date=datetime.fromtimestamp( |
144 | 152 | os.path.getmtime(self.dataset_path) |
145 | 153 | ).isoformat(), |
@@ -199,23 +207,41 @@ def read_data(self, file_path: str) -> IOBase: |
199 | 207 |
|
200 | 208 | def get_datasets(self) -> List[dict]: |
201 | 209 | try: |
202 | | - worksheet = pd.read_excel( |
203 | | - self.dataset_path, |
204 | | - sheet_name=DATASETS_SHEET_NAME, |
205 | | - na_values=[""], |
206 | | - keep_default_na=False, |
207 | | - ) |
208 | | - except TypeError as e: |
209 | | - logger.error( |
210 | | - f"Failed to read datasets from the Excel file at {self.dataset_path}. " |
211 | | - f"Ensure the file is in the correct format. " |
212 | | - f"Try opening and saving the file in Microsoft Excel. " |
213 | | - f"Error: {str(e)}" |
214 | | - ) |
| 210 | + with pd.ExcelFile(self.dataset_path) as xl: |
| 211 | + sheet_names = xl.sheet_names |
| 212 | + if ExcelDataSheets.DATASETS_SHEET_NAME.value not in sheet_names: |
| 213 | + available = ", ".join(repr(s) for s in sheet_names) or "(none)" |
| 214 | + raise ExcelTestDataError( |
| 215 | + f"The workbook does not contain a '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet. " |
| 216 | + f"Submitted sheet names: {available}." |
| 217 | + ) |
| 218 | + worksheet = xl.parse( |
| 219 | + ExcelDataSheets.DATASETS_SHEET_NAME.value, |
| 220 | + na_values=[""], |
| 221 | + keep_default_na=False, |
| 222 | + ) |
| 223 | + except ExcelTestDataError: |
215 | 224 | raise |
| 225 | + except Exception as e: |
| 226 | + raise ExcelTestDataError( |
| 227 | + f"Cannot read the Excel file. Ensure it is a valid .xlsx workbook. " |
| 228 | + f"Details: {e}" |
| 229 | + ) from e |
| 230 | + |
| 231 | + missing_cols = sorted( |
| 232 | + set(ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value) |
| 233 | + - set(worksheet.columns) |
| 234 | + ) |
| 235 | + if missing_cols: |
| 236 | + raise ExcelTestDataError( |
| 237 | + f"The '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet is missing a " |
| 238 | + f"required {ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value} column(s): " |
| 239 | + f"{missing_cols}. Column headers are case-sensitive. " |
| 240 | + ) |
| 241 | + |
216 | 242 | datasets = [ |
217 | | - self.get_raw_dataset_metadata(dataset_name=dataset_filename) |
218 | | - for dataset_filename in worksheet[DATASET_FILENAME_COLUMN] |
| 243 | + self.get_raw_dataset_metadata(dataset_name=fn) |
| 244 | + for fn in worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value] |
219 | 245 | ] |
220 | 246 | return datasets |
221 | 247 |
|
|
0 commit comments