Skip to content

Commit 21ac8e6

Browse files
1681: added @cached decorator to get_dataset (#1687)
* Added cached_dataset decorator to get_dataset.
* Added per-builder dataset caching on rule validation.
* Cache tests.
* Fix for tests: RulesEngine cache is cleared before each test.
* Added fields for caching to BaseDatasetBuilder.
* Removed comment.
* Removed unnecessary kwargs pop call.
* Test for ignored kwargs in split dataset build.
* Added cached decorator to JSONataDatasetBuilder.
* Removed kwargs from get_dataset method.
* Fixed test.
* Removed params and unused drop_duplicates argument from get metadata.
* Removed kwargs.
* Test.

---------

Co-authored-by: Samuel Johnson <96841389+SFJohnson24@users.noreply.github.com>
Co-authored-by: Samuel Johnson <sfjohnson24@gmail.com>
1 parent d5bcca9 commit 21ac8e6

23 files changed

Lines changed: 172 additions & 42 deletions

cdisc_rules_engine/dataset_builders/base_dataset_builder.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import (
66
DefineXMLReaderFactory,
77
)
8+
from cdisc_rules_engine.utilities.decorators import cached
89
from cdisc_rules_engine.utilities.sdtm_utilities import get_corresponding_datasets
910
from cdisc_rules_engine.utilities.sdtm_utilities import (
1011
tag_source,
@@ -34,6 +35,7 @@ def __init__(
3435
):
3536
self.data_service = data_service
3637
self.cache = cache_service
38+
self.cache_service = cache_service
3739
self.data_processor = data_processor
3840
self.rule_processor = rule_processor
3941
self.dataset_metadata = dataset_metadata
@@ -44,6 +46,14 @@ def __init__(
4446
self.standard_substandard = standard_substandard
4547
self.library_metadata = library_metadata
4648
self.dataset_implementation = self.data_service.dataset_implementation
49+
if isinstance(dataset_metadata, SDTMDatasetMetadata):
50+
self.domain = (
51+
f"SUPP{dataset_metadata.rdomain}"
52+
if dataset_metadata.rdomain
53+
else dataset_metadata.domain
54+
)
55+
self.dataset_name = dataset_metadata.name
56+
self.name = self.__class__.__name__
4757

4858
@abstractmethod
4959
def build(self) -> DatasetInterface:
@@ -67,7 +77,8 @@ def build_split_datasets(self, dataset_name: str, **kwargs) -> DatasetInterface:
6777
finally:
6878
self.dataset_metadata = original_dataset_metadata
6979

70-
def get_dataset(self, **kwargs):
80+
@cached("get_dataset")
81+
def get_dataset(self):
7182
# If validating dataset content, ensure split datasets are handled.
7283
if self.dataset_metadata.is_split:
7384
# Handle split datasets for content checks.
@@ -77,15 +88,14 @@ def get_dataset(self, **kwargs):
7788
datasets_metadata=get_corresponding_datasets(
7889
self.data_service.get_datasets(), self.dataset_metadata
7990
),
80-
**kwargs,
8191
)
8292
else:
8393
# single dataset. the most common case
8494
dataset: DatasetInterface = self.build()
8595
dataset = tag_source(dataset, self.dataset_metadata)
8696
return dataset
8797

88-
def get_dataset_contents(self, **kwargs):
98+
def get_dataset_contents(self):
8999
# If validating dataset content, ensure split datasets are handled.
90100
if self.dataset_metadata.is_split:
91101
# Handle split datasets for content checks.
@@ -95,7 +105,6 @@ def get_dataset_contents(self, **kwargs):
95105
datasets_metadata=get_corresponding_datasets(
96106
self.data_service.get_datasets(), self.dataset_metadata
97107
),
98-
**kwargs,
99108
)
100109
else:
101110
# single dataset. the most common case

cdisc_rules_engine/dataset_builders/contents_dataset_builder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ def build_split_datasets(self, dataset_name, **kwargs):
1717
"""
1818
return self.data_service.get_dataset(dataset_name=dataset_name)
1919

20-
def get_dataset(self, **kwargs):
21-
dataset = super().get_dataset(**kwargs)
20+
def get_dataset(self):
21+
dataset = super().get_dataset()
2222
length = sum(
2323
[
2424
dataset.record_count

cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,5 @@ def build(self):
1414
"""
1515

1616
return self.dataset_implementation.from_records(
17-
{ds.unsplit_name: ds.filename for ds in self.data_service.get_datasets()},
18-
index=[0],
17+
[{ds.unsplit_name: ds.filename for ds in self.data_service.get_datasets()}]
1918
)

cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def _get_cached_dataset(self) -> dict[str, list[str]]:
4646

4747
return errlist
4848

49-
def get_dataset(self, **kwargs) -> DatasetInterface:
49+
def get_dataset(self) -> DatasetInterface:
5050
dataset = self._get_cached_dataset()
5151
records = [
5252
{key: dataset[key][i] for key in dataset}
@@ -56,10 +56,10 @@ def get_dataset(self, **kwargs) -> DatasetInterface:
5656
row for row in records if row["dataset"] == self.dataset_metadata.name
5757
]
5858
if filtered:
59-
result = self.dataset_implementation.from_records(filtered, **kwargs)
59+
result = self.dataset_implementation.from_records(filtered)
6060
else:
6161
empty_row = {key: "" for key in self.dataset_template.keys()}
62-
result = self.dataset_implementation.from_records([empty_row], **kwargs)
62+
result = self.dataset_implementation.from_records([empty_row])
6363
return tag_source(result, self.dataset_metadata)
6464

6565
def list_errors(self, tree: exceptions.ErrorTree, errlist: dict[str, list]):

cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from json import load
22
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
3+
from cdisc_rules_engine.utilities.decorators import cached
34

45

56
def add_json_pointer_paths(node, path=""):
@@ -19,7 +20,8 @@ def add_json_pointer_paths(node, path=""):
1920

2021
class JSONataDatasetBuilder(BaseDatasetBuilder):
2122

22-
def get_dataset(self, **kwargs):
23+
@cached("get_dataset")
24+
def get_dataset(self):
2325
if not self.dataset_metadata.full_path:
2426
return None
2527
with self.data_service.read_data(self.dataset_metadata.full_path) as fp:

cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ def build(self):
1717
"""
1818
# Get basic variable metadata
1919
variables_metadata = self.data_service.get_variables_metadata(
20-
dataset_name=self.dataset_metadata.name,
21-
drop_duplicates=True,
20+
dataset_name=self.dataset_metadata.name
2221
)
2322

2423
# Check if the rule requires variable_max_size

cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@ def build(self):
2222
"""
2323
data_contents_long_df = super().build()
2424
variable_metadata = self.data_service.get_variables_metadata(
25-
dataset_name=self.dataset_metadata.name,
26-
drop_duplicates=True,
25+
dataset_name=self.dataset_metadata.name
2726
)
2827
merged_df = data_contents_long_df.merge(
2928
variable_metadata._data, how="left", on="variable_name"

cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,7 @@ def build(self):
4242
"""
4343
variable_metadata: List[dict] = self.get_define_xml_variables_metadata()
4444
content_metadata: DatasetInterface = self.data_service.get_variables_metadata(
45-
dataset_name=self.dataset_metadata.name,
46-
drop_duplicates=True,
45+
dataset_name=self.dataset_metadata.name
4746
)
4847
define_metadata: DatasetInterface = self.dataset_implementation.from_records(
4948
variable_metadata

cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,7 @@ def build(self):
3636
variable_metadata: List[dict] = self.get_define_xml_variables_metadata()
3737
# get dataset metadata and execute the rule
3838
content_metadata: DatasetInterface = self.data_service.get_variables_metadata(
39-
dataset_name=self.dataset_metadata.name,
40-
drop_duplicates=True,
39+
dataset_name=self.dataset_metadata.name
4140
)
4241
define_metadata: DatasetInterface = self.dataset_implementation.from_records(
4342
variable_metadata

cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ def build(self):
2626
# get dataset metadata and execute the rule
2727
content_variables_metadata: DatasetInterface = (
2828
self.data_service.get_variables_metadata(
29-
dataset_name=self.dataset_metadata.name,
30-
drop_duplicates=True,
29+
dataset_name=self.dataset_metadata.name
3130
)
3231
)
3332
dataset_contents = self.get_dataset_contents()

0 commit comments

Comments
 (0)