Skip to content

Commit 66f5e2d

Browse files
authored
missing all outcome vars logic changed (#1110)
* missing all outcome vars logic changed
* updated logic
* update
* updated logic
* updated logic
1 parent 9cbb09a commit 66f5e2d

3 files changed

Lines changed: 89 additions & 24 deletions

File tree

cdisc_rules_engine/exceptions/custom_exceptions.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,6 @@ class InvalidMatchKeyError(EngineError):
3333
description = "Invalid match key provided"
3434

3535

36-
class InvalidOutputVariables(EngineError):
37-
code = 400
38-
description = "Invalid output variables"
39-
40-
4136
class VariableMetadataNotFoundError(EngineError):
4237
code = 400
4338
description = (

cdisc_rules_engine/models/actions.py

Lines changed: 89 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
SOURCE_ROW_NUMBER,
1111
)
1212
from cdisc_rules_engine.enums.sensitivity import Sensitivity
13-
from cdisc_rules_engine.exceptions.custom_exceptions import InvalidOutputVariables
1413
from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata
1514
from cdisc_rules_engine.models.dataset_variable import DatasetVariable
1615
from cdisc_rules_engine.models.validation_error_container import (
@@ -120,26 +119,41 @@ def generate_targeted_error_object(
120119
df_columns: set = set(data)
121120
targets_in_dataset = targets.intersection(df_columns)
122121
targets_not_in_dataset = targets.difference(df_columns)
123-
errors_df = data[list(targets_in_dataset)]
122+
all_targets_missing = (
123+
len(targets_in_dataset) == 0 and len(targets_not_in_dataset) > 0
124+
)
125+
if targets_in_dataset:
126+
errors_df = data[list(targets_in_dataset)]
127+
else:
128+
errors_df = data
124129
if not targets:
125130
errors_df = data
126-
if errors_df.empty:
127-
raise InvalidOutputVariables(
128-
f"Output variables: {list(targets)} not found in dataset"
129-
)
131+
130132
if self.rule.get("sensitivity") == Sensitivity.DATASET.value:
131133
# Only generate one error for rules with dataset sensitivity
134+
missing_vars = {
135+
target: "Not in dataset" for target in targets_not_in_dataset
136+
}
137+
138+
# Create the initial error
139+
error_value = (
140+
dict(errors_df.iloc[0].to_dict()) if not all_targets_missing else {}
141+
)
142+
143+
# Add missing variables to the error value
144+
if missing_vars:
145+
error_value = {**error_value, **missing_vars}
146+
132147
errors_list = [
133148
ValidationErrorEntity(
134-
value=dict(errors_df.iloc[0].to_dict()),
149+
value=error_value,
135150
dataset=self._get_dataset_name(data),
136151
)
137152
]
138153
elif self.rule.get("sensitivity") == Sensitivity.RECORD.value:
139-
errors_series: pd.Series = errors_df.apply(
140-
lambda df_row: self._create_error_object(df_row, data), axis=1
154+
errors_list = self._generate_errors_by_target_presence(
155+
data, targets_not_in_dataset, all_targets_missing, errors_df
141156
)
142-
errors_list: List[ValidationErrorEntity] = errors_series.tolist()
143157
elif (
144158
self.rule.get("sensitivity") is not None
145159
): # rule sensitivity is incorrectly defined
@@ -163,14 +177,9 @@ def generate_targeted_error_object(
163177
errors=[error_entity],
164178
)
165179
else: # rule sensitivity is undefined
166-
errors_series: pd.Series = errors_df.apply(
167-
lambda df_row: self._create_error_object(df_row, data), axis=1
180+
errors_list = self._generate_errors_by_target_presence(
181+
data, targets_not_in_dataset, all_targets_missing, errors_df
168182
)
169-
errors_list: List[ValidationErrorEntity] = errors_series.tolist()
170-
missing_vars = {target: "Not in dataset" for target in targets_not_in_dataset}
171-
if missing_vars:
172-
for error in errors_list:
173-
error.value = {**error.value, **missing_vars}
174183
return ValidationErrorContainer(
175184
**{
176185
"domain": (
@@ -187,6 +196,69 @@ def generate_targeted_error_object(
187196
}
188197
)
189198

199+
def _generate_errors_by_target_presence(
    self,
    data: pd.DataFrame,
    targets_not_in_dataset: Set[str],
    all_targets_missing: bool,
    errors_df: pd.DataFrame,
) -> List[ValidationErrorEntity]:
    """
    Generate the error list based on presence of target variables in the dataset.

    Handles two cases:
      1. All targets are missing: one error is built per row of ``data`` whose
         value maps every missing target to "Not in dataset".
      2. At least one target is present: one error is built per row of
         ``errors_df`` via ``_create_error_object`` and the missing targets
         (if any) are merged into each error's value.

    Args:
        data: The original dataframe
        targets_not_in_dataset: Set of target variables not found in the dataset
        all_targets_missing: Boolean indicating if all targets are missing
        errors_df: DataFrame subset with only the target variables (if any exist)

    Returns:
        List of ValidationErrorEntity objects
    """
    missing_vars = {target: "Not in dataset" for target in targets_not_in_dataset}

    if not all_targets_missing:
        errors_series: pd.Series = errors_df.apply(
            lambda df_row: self._create_error_object(df_row, data), axis=1
        )
        errors_list: List[ValidationErrorEntity] = errors_series.tolist()
        if missing_vars:
            for error in errors_list:
                error.value = {**error.value, **missing_vars}
        return errors_list

    # The sequence column name does not depend on the row, so build it once.
    seq_column = f"{self.dataset_metadata.domain or ''}SEQ"

    errors_list = []
    for idx, row in data.iterrows():
        # Only fall back to the index label when the row-number column is
        # absent; computing ``idx + 1`` unconditionally would raise for
        # non-numeric index labels even though the fallback is not needed.
        if SOURCE_ROW_NUMBER in row:
            row_number = int(row[SOURCE_ROW_NUMBER])
        else:
            row_number = int(idx + 1)

        has_usubjid = "USUBJID" in row and not pd.isna(row["USUBJID"])
        # _sequence_exists is given a one-element Series keyed by this row's
        # index label, i.e. it is asked about this row's sequence value only.
        has_sequence = seq_column in row and self._sequence_exists(
            pd.Series({idx: row[seq_column]}), idx
        )

        errors_list.append(
            ValidationErrorEntity(
                # Copy so every error owns an independent value dict.
                value=dict(missing_vars),
                # NOTE(review): the dataset name is resolved from a one-row
                # frame, presumably so each row can report its own source
                # file; confirm before hoisting this out of the loop.
                dataset=self._get_dataset_name(pd.DataFrame([row])),
                row=row_number,
                usubjid=str(row["USUBJID"]) if has_usubjid else None,
                sequence=int(row[seq_column]) if has_sequence else None,
            )
        )
    return errors_list
261+
190262
def _get_dataset_name(self, data: pd.DataFrame) -> str:
191263
source_pathnames = data.get(SOURCE_FILENAME, [])
192264
source_filenames = [

cdisc_rules_engine/operations/base_operation.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
RuleExecutionError,
3131
RuleFormatError,
3232
InvalidMatchKeyError,
33-
InvalidOutputVariables,
3433
VariableMetadataNotFoundError,
3534
DomainNotFoundInDefineXMLError,
3635
InvalidDatasetFormat,
@@ -79,7 +78,6 @@ def execute(self) -> DatasetInterface:
7978
RuleExecutionError,
8079
RuleFormatError,
8180
InvalidMatchKeyError,
82-
InvalidOutputVariables,
8381
VariableMetadataNotFoundError,
8482
DomainNotFoundInDefineXMLError,
8583
InvalidDatasetFormat,

0 commit comments

Comments
 (0)