Skip to content

Commit 3bd0a27

Browse files
authored
Variable value (#1148)
* started dataset builder * new rule types * progress * updated rule type json * updating builders * update * added split dataset method * split dataset synthesis * merged logic * updated tests * update * update test * updated test
1 parent 08b282e commit 3bd0a27

15 files changed

Lines changed: 756 additions & 21 deletions

cdisc_rules_engine/dataset_builders/base_dataset_builder.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,18 @@ def build(self) -> DatasetInterface:
5656
"""
5757
pass
5858

59-
def build_split_datasets(self, dataset_name, **kwargs) -> DatasetInterface:
    """
    Builds the dataset for one part of a split dataset.

    Default implementation: points ``self.dataset_path`` at ``dataset_name``
    for the duration of a single ``self.build(**kwargs)`` call and returns
    whatever ``build`` returns. The previous ``dataset_path`` is put back
    afterwards, even when ``build`` raises.
    """
    # NOTE(review): kwargs are forwarded to build(); confirm that every
    # concrete build() that can be reached with kwargs accepts them.
    saved_path = self.dataset_path
    try:
        self.dataset_path = dataset_name
        return self.build(**kwargs)
    finally:
        # Always undo the temporary path swap.
        self.dataset_path = saved_path
6571

6672
def get_dataset(self, **kwargs):
6773
# If validating dataset content, ensure split datasets are handled.

cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,9 @@ def build(self):
1010
dataset_name - Name of the dataset
1111
dataset_label - Label for the dataset
1212
"""
13-
return self.build_split_datasets(self.dataset_path)
14-
15-
def build_split_datasets(self, dataset_name, **kwargs):
1613
size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule)
1714
return self.data_service.get_dataset_metadata(
18-
dataset_name=dataset_name,
15+
dataset_name=self.dataset_path,
1916
size_unit=size_unit,
2017
datasets=self.datasets,
2118
)

cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,19 +27,13 @@ def build(self):
2727
2828
...,
2929
"""
30-
return self.build_split_datasets(self.dataset_metadata.filename)
31-
32-
def build_split_datasets(self, dataset_name, **kwargs):
33-
"""
34-
Returns the contents of a file as a dataframe for evaluation.
35-
"""
3630
data_contents_df = self.data_service.get_dataset(
37-
dataset_name=dataset_name, datasets=self.datasets
31+
dataset_name=self.dataset_path, datasets=self.datasets
3832
)
3933
# Build dataset metadata dataframe
4034
size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule)
4135
dataset_metadata = self.data_service.get_dataset_metadata(
42-
dataset_name=dataset_name, size_unit=size_unit, datasets=self.datasets
36+
dataset_name=self.dataset_path, size_unit=size_unit, datasets=self.datasets
4337
).to_dict(orient="records")[0]
4438
# Build define xml dataframe
4539
define = self.get_define_xml_item_group_metadata_for_dataset(dataset_metadata)

cdisc_rules_engine/dataset_builders/dataset_builder_factory.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@
4444
from cdisc_rules_engine.dataset_builders.variables_metadata_with_define_and_library_dataset_builder import (
4545
VariablesMetadataWithDefineAndLibraryDatasetBuilder,
4646
)
47+
from cdisc_rules_engine.dataset_builders.dataset_metadata_values_builder import (
48+
ValueCheckDatasetMetadataDatasetBuilder,
49+
)
50+
from cdisc_rules_engine.dataset_builders.variables_metadata_values_dataset_builder import (
51+
ValueCheckVariableMetadataDatasetBuilder,
52+
)
4753
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
4854
from cdisc_rules_engine.enums.rule_types import RuleTypes
4955

@@ -65,6 +71,8 @@ class DatasetBuilderFactory(FactoryInterface):
6571
RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_LIBRARY.value: VariablesMetadataWithLibraryMetadataDatasetBuilder,
6672
RuleTypes.DEFINE_ITEM_METADATA_CHECK_AGAINST_LIBRARY.value: DefineVariablesWithLibraryMetadataDatasetBuilder,
6773
RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_DEFINE_XML_AND_LIBRARY.value: VariablesMetadataWithDefineAndLibraryDatasetBuilder,
74+
RuleTypes.VALUE_CHECK_WITH_DATASET_METADATA.value: ValueCheckDatasetMetadataDatasetBuilder,
75+
RuleTypes.VALUE_CHECK_WITH_VARIABLE_METADATA.value: ValueCheckVariableMetadataDatasetBuilder,
6876
}
6977

7078
@classmethod
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from cdisc_rules_engine.dataset_builders.values_dataset_builder import (
2+
ValuesDatasetBuilder,
3+
)
4+
5+
6+
class ValueCheckDatasetMetadataDatasetBuilder(ValuesDatasetBuilder):
    def build(self):
        """
        Builds the long-format values dataset and tags every row with the
        metadata of the dataset being validated.

        The long dataset comes from ``ValuesDatasetBuilder.build`` (each value
        in each row of the original dataset becomes a row). The first record of
        the dataset metadata is then copied onto every row, one column per
        metadata key.

        Columns available in the dataset include:
        - "row_number"
        - "variable_name"
        - "variable_value"
        - dataset_size - File size
        - dataset_location - Path to file
        - dataset_name - Name of the dataset
        - dataset_label - Label for the dataset
        """
        unit: str = self.rule_processor.get_size_unit_from_rule(self.rule)
        metadata_frame = self.data_service.get_dataset_metadata(
            dataset_name=self.dataset_path,
            size_unit=unit,
            datasets=self.datasets,
        )
        # Only the first metadata record is used.
        first_record = metadata_frame.to_dict(orient="records")[0]

        long_df = super().build()
        n_rows = len(long_df)
        # Repeat each metadata value once per row so it becomes a full column.
        for column, constant in first_record.items():
            long_df[column] = [constant] * n_rows
        return long_df

cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,3 @@ def build(self):
1616
return self.data_service.get_variables_metadata(
1717
dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True
1818
)
19-
20-
def build_split_datasets(self, dataset_name, **kwargs):
21-
return self.data_service.get_variables_metadata(
22-
dataset_name=dataset_name, datasets=self.datasets, drop_duplicates=True
23-
)
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from cdisc_rules_engine.dataset_builders.values_dataset_builder import (
2+
ValuesDatasetBuilder,
3+
)
4+
5+
6+
class ValueCheckVariableMetadataDatasetBuilder(ValuesDatasetBuilder):
    def build(self):
        """
        Builds the long-format values dataset with variable metadata joined on.

        The long dataset comes from ``ValuesDatasetBuilder.build`` (each value
        in each row of the original dataset becomes a row). Variable metadata
        is left-joined on "variable_name", and "variable_value_length" is then
        computed per row from the value and its data type.

        Columns available in the dataset include:
        - "row_number"
        - "variable_name"
        - "variable_value"
        - "variable_order_number"
        - "variable_label"
        - "variable_size"
        - "variable_data_type"
        - "variable_format"
        - "variable_value_length"
        """
        long_values = super().build()
        per_variable = self.data_service.get_variables_metadata(
            dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True
        )
        combined = long_values.merge(per_variable, how="left", on="variable_name")

        def _value_length(record):
            # Length depends on both the value and the variable's data type.
            return ValuesDatasetBuilder.calculate_variable_value_length(
                record["variable_value"], record["variable_data_type"]
            )

        combined["variable_value_length"] = combined.apply(_value_length, axis=1)
        return combined

cdisc_rules_engine/enums/rule_types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,5 @@ class RuleTypes(BaseEnum):
2929
DEFINE_ITEM_METADATA_CHECK_AGAINST_LIBRARY = (
3030
"Define Item Metadata Check against Library Metadata"
3131
)
32+
VALUE_CHECK_WITH_DATASET_METADATA = "Value Check with Dataset Metadata"
33+
VALUE_CHECK_WITH_VARIABLE_METADATA = "Value Check with Variable Metadata"

cdisc_rules_engine/models/dataset/dask_dataset.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,24 @@ def fillna(
360360
def to_dict(self, **kwargs) -> dict:
    """
    Converts each partition of the underlying data to a list of record
    dictionaries and returns the per-partition results as a list.

    ``kwargs`` are accepted but not used; ``orient="records"`` is always
    applied. Note that the value returned is a list, not a dict.
    """

    def _partition_records(partition):
        return partition.to_dict(orient="records")

    per_partition = self._data.map_partitions(_partition_records)
    return list(per_partition)
362362

363+
def items(self, **kwargs):
    """
    Returns the (key, value) pairs of the dataset dictionary.

    The underlying data is computed first, then converted with ``to_dict``;
    ``kwargs`` are passed through to that ``to_dict`` call.
    """
    as_mapping = self._data.compute().to_dict(**kwargs)
    return as_mapping.items()
366+
367+
def keys(self, **kwargs):
    """
    Returns an object containing the keys in the dataset dictionary.

    The underlying data is computed first, then converted with ``to_dict``;
    ``kwargs`` are passed through to that ``to_dict`` call.
    """
    as_mapping = self._data.compute().to_dict(**kwargs)
    return as_mapping.keys()
373+
374+
def values(self, **kwargs):
    """
    Returns an object containing the values in the dataset dictionary.

    The underlying data is computed first, then converted with ``to_dict``;
    ``kwargs`` are passed through to that ``to_dict`` call.
    """
    as_mapping = self._data.compute().to_dict(**kwargs)
    return as_mapping.values()
380+
363381
def isin(self, values):
364382
values_set = set(values)
365383

cdisc_rules_engine/models/dataset/dataset_interface.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,3 +251,22 @@ def to_dict(self, **kwargs) -> dict:
251251
"""
252252
Convert the dataset to a dictionary.
253253
"""
254+
255+
@abstractmethod
def items(self, **kwargs):
    """
    Return the (key, value) pairs of the dataset in its dictionary form.

    Implementations return a view object over (key, value) tuple pairs.
    """
261+
262+
@abstractmethod
def keys(self, **kwargs):
    """
    Return a view object over the keys of the dataset in its dictionary form.
    """
267+
268+
@abstractmethod
def values(self, **kwargs):
    """
    Return a view object over the values of the dataset in its dictionary form.
    """

0 commit comments

Comments
 (0)