Skip to content

Commit 0420158

Browse files
authored
Fixed published open rules for usdm (#1759)
* usdm running now * add json path and fix unit tests * only add path for usdm * handle empty ints as floats * fix extra csv extension in tests * Fix datasetbuilder caching
1 parent e5fb933 commit 0420158

4 files changed

Lines changed: 85 additions & 82 deletions

File tree

cdisc_rules_engine/dataset_builders/base_dataset_builder.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,8 @@ def __init__(
4747
self.library_metadata = library_metadata
4848
self.dataset_implementation = self.data_service.dataset_implementation
4949
if isinstance(dataset_metadata, SDTMDatasetMetadata):
50-
self.domain = (
51-
f"SUPP{dataset_metadata.rdomain}"
52-
if dataset_metadata.rdomain
53-
else dataset_metadata.domain
54-
)
55-
self.dataset_name = dataset_metadata.name
50+
# This is created to support the get_dataset cached decorator
51+
self.domain = dataset_metadata.unsplit_name
5652
self.name = self.__class__.__name__
5753

5854
@abstractmethod

cdisc_rules_engine/services/csv_metadata_reader.py

Lines changed: 74 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def __init__(
1919
):
2020
self.file_path = file_path
2121
self.file_name = file_name
22+
self.dataset = Path(file_name).stem
2223
self.encoding = encoding
2324
self.variables_csv_path = (
2425
Path(variables_csv_path)
@@ -32,59 +33,87 @@ def __init__(
3233
)
3334

3435
def read(self) -> dict:
35-
dataset_name = Path(self.file_name).stem.lower()
36+
metadata = {}
37+
metadata.update(self.__dataset_metadata())
38+
metadata.update(
39+
{
40+
"dataset_modification_date": datetime.fromtimestamp(
41+
Path(self.file_path).stat().st_mtime
42+
).isoformat(),
43+
"adam_info": {
44+
"categorization_scheme": {},
45+
"w_indexes": {},
46+
"period": {},
47+
"selection_algorithm": {},
48+
},
49+
}
50+
)
51+
metadata.update(self.__variable_metadata())
52+
metadata.update(self.__data_metadata())
53+
return metadata
3654

37-
if not self.variables_csv_path.exists():
38-
logger = logging.getLogger("validator")
39-
logger.info("No variables file found for %s", dataset_name)
40-
variables_meta = {}
41-
else:
42-
variables_meta = self.__get_variable_metadata(
43-
dataset_name, self.variables_csv_path
55+
def __dataset_metadata(self) -> dict:
56+
logger = logging.getLogger("validator")
57+
58+
if not self.datasets_csv_path.exists():
59+
logger.info("No datasets file found for %s", self.dataset)
60+
return {"dataset_name": self.dataset}
61+
62+
try:
63+
datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding)
64+
except (UnicodeDecodeError, UnicodeError) as e:
65+
logger.error(
66+
f"\n Error reading CSV from: {self.file_path}"
67+
f"\n Failed to decode with {self.encoding} encoding: {e}"
68+
f"\n Please specify the correct encoding using the -e flag."
4469
)
70+
return {}
71+
except Exception as e:
72+
logger.error("Error reading CSV file %s. %s", self.file_path, e)
73+
return {}
74+
75+
if "Filename" not in datasets_df.columns:
76+
return {}
77+
78+
match = datasets_df[datasets_df["Filename"] == self.dataset]
79+
80+
if match.empty or len(match) > 1:
81+
return {}
82+
83+
single_match = match.iloc[0]
4584

46-
metadata = {
47-
"dataset_name": dataset_name.upper(),
48-
"dataset_modification_date": datetime.fromtimestamp(
49-
Path(self.file_path).stat().st_mtime
50-
).isoformat(),
51-
"adam_info": {
52-
"categorization_scheme": {},
53-
"w_indexes": {},
54-
"period": {},
55-
"selection_algorithm": {},
56-
},
85+
return {
86+
"dataset_name": (
87+
single_match["Dataset Name"]
88+
if "Dataset Name" in datasets_df.columns
89+
else str(single_match["Filename"]).upper()
90+
),
91+
"dataset_label": str(single_match["Label"]),
5792
}
58-
metadata.update(variables_meta)
59-
metadata.update(self.__data_meta())
60-
metadata.update(self.__dataset_label())
61-
return metadata
6293

63-
def __get_variable_metadata(
64-
self, dataset_name: str, variables_file_path: Path
94+
def __variable_metadata(
95+
self,
6596
) -> dict:
6697
logger = logging.getLogger("validator")
98+
if not self.variables_csv_path.exists():
99+
logger.info("No variables file found for %s", self.dataset)
100+
return {}
67101
try:
68-
meta_df = pd.read_csv(variables_file_path, encoding=self.encoding)
102+
meta_df = pd.read_csv(self.variables_csv_path, encoding=self.encoding)
69103
except (UnicodeDecodeError, UnicodeError) as e:
70104
logger.error(
71-
f"Could not decode CSV file {variables_file_path} with {self.encoding} encoding: {e}. "
105+
f"Could not decode CSV file {self.variables_csv_path} with {self.encoding} encoding: {e}. "
72106
f"Please specify the correct encoding using the -e flag."
73107
)
74108
return {}
75109
except Exception as e:
76110
logger.error("Error reading CSV file %s. %s", self.file_path, e)
77111
return {}
78112

79-
meta_df["dataset"] = meta_df["dataset"].apply(
80-
lambda x: Path(str(x)).stem.lower()
81-
)
82-
83-
dataset_meta_df = meta_df[meta_df["dataset"] == dataset_name]
113+
dataset_meta_df = meta_df[meta_df["dataset"] == self.dataset]
84114

85115
if dataset_meta_df.empty:
86-
logger = logging.getLogger("validator")
87-
logger.info("No dataset metadata found for %s", dataset_name)
116+
logger.info("No dataset metadata found for %s", self.dataset)
88117
return {}
89118

90119
variable_names = dataset_meta_df["variable"].tolist()
@@ -95,7 +124,16 @@ def __get_variable_metadata(
95124
zip(variable_names, dataset_meta_df["type"])
96125
)
97126
variable_name_to_size_map = {
98-
var: (int(length) if pd.notna(length) else None)
127+
var: (
128+
int(length)
129+
if pd.notna(length)
130+
and (
131+
# Because NaN is a float, pandas forces an array of integers with any missing values to become floating point
132+
isinstance(length, int | float)
133+
or (isinstance(length, str) and length.isdigit())
134+
)
135+
else None
136+
)
99137
for var, length in zip(variable_names, dataset_meta_df["length"])
100138
}
101139
return {
@@ -108,41 +146,7 @@ def __get_variable_metadata(
108146
"number_of_variables": len(variable_names),
109147
}
110148

111-
def __dataset_label(self) -> dict:
112-
logger = logging.getLogger("validator")
113-
114-
if not self.datasets_csv_path.exists():
115-
return {}
116-
117-
try:
118-
datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding)
119-
except (UnicodeDecodeError, UnicodeError) as e:
120-
logger.error(
121-
f"\n Error reading CSV from: {self.file_path}"
122-
f"\n Failed to decode with {self.encoding} encoding: {e}"
123-
f"\n Please specify the correct encoding using the -e flag."
124-
)
125-
return {}
126-
except Exception as e:
127-
logger.error("Error reading CSV file %s. %s", self.file_path, e)
128-
return {}
129-
130-
if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns:
131-
return {}
132-
133-
datasets_df["dataset"] = datasets_df["Filename"].apply(
134-
lambda x: Path(str(x)).stem.lower()
135-
)
136-
137-
current_dataset = Path(self.file_name).stem.lower()
138-
match = datasets_df[datasets_df["dataset"] == current_dataset]
139-
140-
if match.empty:
141-
return {}
142-
143-
return {"dataset_label": str(match.iloc[0]["Label"])}
144-
145-
def __data_meta(self):
149+
def __data_metadata(self):
146150
logger = logging.getLogger("validator")
147151
result = {
148152
"dataset_length": 0,

cdisc_rules_engine/services/data_services/local_data_service.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,9 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface:
144144
basename(full_path).split(".")[1].upper()
145145
)
146146
df = reader.from_file(full_path)
147+
# Build a simulated json pointer for the case where we are simulating json data.
148+
if self.standard == "usdm":
149+
df["_path"] = [f"/{dataset_name}/{i}" for i in range(len(df))]
147150
return df
148151

149152
@cached_dataset(DatasetTypes.VARIABLES_METADATA.value)

tests/unit/test_csv_reader.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -178,9 +178,9 @@ def test_duplicate_paths_removed(self, tmp_path):
178178

179179
VARIABLES_CSV = textwrap.dedent("""\
180180
dataset,variable,label,type,length
181-
patients.csv,id,Patient ID,integer,10
182-
patients.csv,name,Patient Name,string,50
183-
patients.csv,age,Patient Age,integer,3
181+
patients,id,Patient ID,integer,10
182+
patients,name,Patient Name,string,50
183+
patients,age,Patient Age,integer,3
184184
""")
185185

186186
DATA_CSV = textwrap.dedent("""\
@@ -192,7 +192,7 @@ def test_duplicate_paths_removed(self, tmp_path):
192192

193193
DATASETS_CSV = textwrap.dedent("""\
194194
Filename,Label
195-
patients.csv,Patient Dataset
195+
patients,Patient Dataset
196196
""")
197197

198198

@@ -299,7 +299,7 @@ def test_variable_name_to_size_map_with_values(self):
299299
def test_variable_name_to_size_map_with_nan_length(self):
300300
variables_with_nan = textwrap.dedent("""\
301301
dataset,variable,label,type,length
302-
patients.csv,id,Patient ID,integer,
302+
patients,id,Patient ID,integer,
303303
""")
304304
_write(self._variables_path(), variables_with_nan)
305305
reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv")
@@ -308,7 +308,7 @@ def test_variable_name_to_size_map_with_nan_length(self):
308308

309309
def test_dataset_name_lookup_is_case_insensitive(self):
310310
"""File name with mixed case should still match _variables.csv entry."""
311-
variables_upper = VARIABLES_CSV.replace("patients.csv", "PATIENTS.CSV")
311+
variables_upper = VARIABLES_CSV.replace("patients", "PATIENTS")
312312
_write(self._variables_path(), variables_upper)
313313
reader = DatasetCSVMetadataReader(str(self.data_path), "PATIENTS.CSV")
314314
result = reader.read()

0 commit comments

Comments
 (0)