Skip to content

Commit c81784a

Browse files
Variable is null (#1627)
* update * operator * wildcard tests * changes * value error test case --------- Co-authored-by: RamilCDISC <113539111+RamilCDISC@users.noreply.github.com>
1 parent 4d4343a commit c81784a

16 files changed

Lines changed: 398 additions & 83 deletions

cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ def build(self):
1313
variable_label
1414
variable_size
1515
variable_data_type
16+
variable_is_empty
17+
variable_has_empty_values
1618
define_variable_name,
1719
define_variable_label,
1820
define_variable_data_type,
@@ -29,7 +31,6 @@ def build(self):
2931
define_variable_codelist_coded_values,
3032
define_variable_codelist_coded_codes,
3133
define_variable_mandatory,
32-
variable_has_empty_values
3334
library_variable_name,
3435
library_variable_label,
3536
library_variable_data_type,
@@ -82,24 +83,22 @@ def build(self):
8283
right_on="library_variable_name",
8384
).fillna("")
8485

85-
final_dataframe["variable_has_empty_values"] = final_dataframe.apply(
86-
lambda row: self.variable_has_null_values(
87-
(
88-
row["variable_name"]
89-
if row["variable_name"] != ""
90-
else row["library_variable_name"]
86+
final_dataframe[["variable_has_empty_values", "variable_is_empty"]] = (
87+
final_dataframe.apply(
88+
lambda row: self.get_variable_null_stats(
89+
row["variable_name"], dataset_contents
9190
),
92-
dataset_contents,
93-
),
94-
axis=1,
91+
axis=1,
92+
result_type="expand",
93+
)
9594
)
9695

9796
return final_dataframe
9897

99-
def variable_has_null_values(
98+
def get_variable_null_stats(
10099
self, variable: str, content: DatasetInterface
101-
) -> bool:
100+
) -> tuple[bool, bool]:
102101
if variable not in content:
103-
return True
104-
series = content[variable]
105-
return series.mask(series == "").isnull().any()
102+
return True, True
103+
series = content[variable].mask(content[variable] == "")
104+
return series.isnull().any(), series.isnull().all()

cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def build(self):
1313
variable_size
1414
variable_data_type
1515
variable_has_empty_values
16+
variable_is_empty
1617
library_variable_name,
1718
library_variable_label,
1819
library_variable_data_type,
@@ -57,18 +58,20 @@ def build(self):
5758
right_on="library_variable_name",
5859
).fillna("")
5960

60-
data["variable_has_empty_values"] = data.apply(
61-
lambda row: self.variable_has_null_values(
61+
data[["variable_has_empty_values", "variable_is_empty"]] = data.apply(
62+
lambda row: self.get_variable_null_stats(
6263
row["variable_name"], dataset_contents
6364
),
6465
axis=1,
66+
result_type="expand",
6567
)
68+
6669
return data
6770

68-
def variable_has_null_values(
71+
def get_variable_null_stats(
6972
self, variable: str, content: DatasetInterface
70-
) -> bool:
73+
) -> tuple[bool, bool]:
7174
if variable not in content:
72-
return True
73-
series = content[variable]
74-
return series.mask(series == "").isnull().any()
75+
return True, True
76+
series = content[variable].mask(content[variable] == "")
77+
return series.isnull().any(), series.isnull().all()

cdisc_rules_engine/models/operation_params.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ class OperationParams:
5656
original_target: str = None
5757
regex: str = None
5858
returntype: str = None
59+
source: str = None
5960
target: str = None
6061
value_is_reference: bool = False
6162
namespace: str = None

cdisc_rules_engine/operations/base_operation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,10 @@ def _filter_data(self, data):
173173
def _is_wildcard_pattern(self, value: str) -> bool:
174174
if not isinstance(value, str):
175175
return False
176-
return value.endswith("%")
176+
return value.endswith("&")
177177

178178
def _apply_wildcard_filter(self, series: pd.Series, pattern: str) -> pd.Series:
179-
prefix = pattern.rstrip("%")
179+
prefix = pattern.rstrip("&")
180180
result = series.str.startswith(prefix, na=False)
181181
return result
182182

cdisc_rules_engine/operations/variable_is_null.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,16 @@
33

44
class VariableIsNull(BaseOperation):
55
def _execute_operation(self):
6-
# Always get the content dataframe. Similar to variable_exists check
7-
dataframe = self.data_service.get_dataset(dataset_name=self.params.dataset_path)
8-
if self.params.target.startswith("define_variable"):
9-
# Handle checks against define metadata
10-
target_column = self.evaluation_dataset[self.params.target]
11-
result = [
12-
self._is_target_variable_null(dataframe, value)
13-
for value in target_column
14-
]
15-
return self.data_service.dataset_implementation().convert_to_series(result)
6+
if self.params.source == "submission":
7+
if self.params.level == "row":
8+
raise ValueError("level: row may only be used with source: evaluation")
9+
dataframe = self.data_service.get_dataset(
10+
dataset_name=self.params.dataset_path
11+
)
1612
else:
17-
target_variable = self.params.target
18-
return self._is_target_variable_null(dataframe, target_variable)
13+
dataframe = self.evaluation_dataset
14+
15+
return self._is_target_variable_null(dataframe, self.params.target)
1916

2017
def _is_target_variable_null(self, dataframe, target_variable: str) -> bool:
2118
if target_variable not in dataframe:

cdisc_rules_engine/utilities/rule_processor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ def perform_rule_operations(
393393
original_target=original_target,
394394
regex=operation.get("regex"),
395395
returntype=operation.get("returntype"),
396+
source=operation.get("source"),
396397
standard=standard,
397398
standard_substandard=standard_substandard,
398399
standard_version=standard_version,

resources/schema/rule/MetaVariables.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@
146146
},
147147
{ "const": "variable_format" },
148148
{ "const": "variable_has_empty_values" },
149+
{ "const": "variable_is_empty" },
149150
{ "const": "variable_label" },
150151
{ "const": "variable_name" },
151152
{

resources/schema/rule/MetaVariables.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,10 @@ Variable format
238238

239239
True/False value indicating whether a variable has any empty values
240240

241+
## variable_is_empty
242+
243+
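True/False value indicating whether a variable is completely empty, i.e. every one of its values is null or an empty string (also True when the variable is not present in the dataset)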
244+
241245
## variable_label
242246

243247
Variable long label

resources/schema/rule/Operations.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,9 @@
557557
"type": "string",
558558
"enum": ["code", "value", "pref_term"]
559559
},
560+
"source": {
561+
"type": "string"
562+
},
560563
"term_value": {
561564
"type": "string"
562565
},

resources/schema/rule/Operations.md

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,7 +1013,7 @@ Operations:
10131013
10141014
### record_count
10151015
1016-
If no filter or group is provided, returns the number of records in the dataset. If filter is provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter. If group is provided, returns the number of rows matching each unique set of the grouping variables. These can be static column name(s) or can be derived from other operations like get_dataset_filtered_variables.
1016+
If no filter or group is provided, returns the number of records in the dataset. If filter is provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter. A filter value may end with the wildcard `&`, in which case it matches every value that starts with the preceding prefix (see the 4th example below). If group is provided, returns the number of rows matching each unique set of the grouping variables. These can be static column name(s) or can be derived from other operations like get_dataset_filtered_variables.
10171017

10181018
If both filter and group are provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter that also match each unique set of the grouping variables.
10191019

@@ -1058,7 +1058,7 @@ Example: return the number of records where QNAM starts with "RACE" (matches RAC
10581058
- operation: record_count
10591059
id: $race_records_in_dataset
10601060
filter:
1061-
QNAM: "RACE%"
1061+
QNAM: "RACE&"
10621062
group:
10631063
- "USUBJID"
10641064
```
@@ -1291,7 +1291,7 @@ Match Datasets:
12911291
12921292
### variable_exists
12931293
1294-
Flag an error if MIDS is in the dataset currently being evaluated and the TM domain is not present in the study
1294+
This operation works only on the original submission datasets, regardless of rule type. Flags an error if a column exists in the submission dataset currently being evaluated.
12951295
12961296
Rule Type: Domain Presence Check
12971297
@@ -1312,13 +1312,18 @@ Operations:
13121312
### variable_is_null
13131313
13141314
Returns true if a variable is missing from the dataset or if all values within the variable are null or empty string. This operation first checks if the target variable exists in the dataset, and if it does exist, evaluates whether all its values are null or empty.
1315-
The operation can work with both direct variable names and define metadata references (variables starting with "define_variable").
1315+
The operation supports two sources via the `source` parameter:
1316+
1317+
- **`submission`**: checks against the raw submission dataset
1318+
- **`evaluation`** (default): checks against the evaluation dataset built based on the rule type
13161319

13171320
```yaml
1321+
# Dataset level check - is this variable entirely null/missing from the source data?
13181322
Operations:
13191323
- operator: variable_is_null
13201324
name: USUBJID
1321-
id: $aeterm_is_null
1325+
id: $usubjid_is_null
1326+
source: submission
13221327
```
13231328

13241329
### get_xhtml_errors

0 commit comments

Comments
 (0)