diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index ddff41a59..66d2d8fdf 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -809,6 +809,7 @@ def equals_string_part(self, other_value): column with a regex """ target = other_value.get("target") + type_insensitive = other_value.get("type_insensitive", False) comparator = other_value.get("comparator") regex = other_value.get("regex") value_is_literal: bool = other_value.get("value_is_literal", False) @@ -821,7 +822,11 @@ def equals_string_part(self, other_value): self.value[parsed_id] = parsed_data return self.value.apply( lambda row: self._check_equality( - row, target, parsed_id, value_is_literal=False + row, + target, + parsed_id, + value_is_literal=False, + type_insensitive=type_insensitive, ), axis=1, ) diff --git a/resources/schema/rule-merged/Operator.json b/resources/schema/rule-merged/Operator.json index 1c1f13a64..d7d57b73a 100644 --- a/resources/schema/rule-merged/Operator.json +++ b/resources/schema/rule-merged/Operator.json @@ -135,7 +135,10 @@ "properties": { "operator": { "const": "does_not_equal_string_part", - "markdownDescription": "\nComplement of `equals_string_part`\n" + "markdownDescription": "\nComplement of `equals_string_part`. Also has the optional parameter 'type_insensitive'.\n" + }, + "type_insensitive": { + "type": "boolean" } }, "required": ["operator", "value", "regex"], @@ -223,7 +226,10 @@ "properties": { "operator": { "const": "equals_string_part", - "markdownDescription": "\nChecks that the values in the target column equal the result of parsing the value in the comparison column with a regex\n\n> RDOMAIN equals characters 5 and 6 of SUPP dataset name\n\n```yaml\n- name: RDOMAIN\n operator: equals_string_part\n value: dataset_name\n regex: \".{4}(..).*\"\n```\n" + "markdownDescription": "\nChecks that the values in the target column equal the result of parsing the value in the comparison column with a regex\nHas optional parameter:\n\n- 'type_insensitive' when true, both values are converted to strings before comparison to handle type mismatches between string and numeric data. NOTE: all trailing zeroes will be removed in both strings and floats.\n\n> RDOMAIN equals characters 5 and 6 of SUPP dataset name\n\n```yaml\n- name: RDOMAIN\n operator: equals_string_part\n type_insensitive: true\n value: dataset_name\n regex: \".{4}(..).*\"\n```\n" + }, + "type_insensitive": { + "type": "boolean" } }, "required": ["operator", "value", "regex"], diff --git a/resources/schema/rule/Operator.json b/resources/schema/rule/Operator.json index 61ed71f2f..021e79f89 100644 --- a/resources/schema/rule/Operator.json +++ b/resources/schema/rule/Operator.json @@ -73,7 +73,10 @@ "type": "object" }, { - "properties": { "operator": { "const": "does_not_equal_string_part" } }, + "properties": { + "operator": { "const": "does_not_equal_string_part" }, + "type_insensitive": { "type": "boolean" } + }, "required": ["operator", "value", "regex"], "type": "object" }, @@ -120,7 +123,10 @@ "type": "object" }, { - "properties": { "operator": { "const": "equals_string_part" } }, + "properties": { + "operator": { "const": "equals_string_part" }, + "type_insensitive": { "type": "boolean" } + }, "required": ["operator", "value", "regex"], "type": "object" }, diff --git a/resources/schema/rule/Operator.md b/resources/schema/rule/Operator.md index 4685cac0f..0a46ea122 100644 --- a/resources/schema/rule/Operator.md +++ b/resources/schema/rule/Operator.md @@ -156,17 +156,21 @@ Text-based operations including regex pattern matching, substring operations, pr ### does_not_equal_string_part -Complement of `equals_string_part` +Complement of `equals_string_part`. Also has the optional parameter 'type_insensitive'. ### equals_string_part Checks that the values in the target column equal the result of parsing the value in the comparison column with a regex +Has optional parameter: + +- 'type_insensitive' when true, both values are converted to strings before comparison to handle type mismatches between string and numeric data. NOTE: all trailing zeroes will be removed in both strings and floats. > RDOMAIN equals characters 5 and 6 of SUPP dataset name ```yaml - name: RDOMAIN operator: equals_string_part + type_insensitive: true value: dataset_name regex: ".{4}(..).*" ``` diff --git a/tests/unit/test_check_operators/test_string_comparison.py b/tests/unit/test_check_operators/test_string_comparison.py index 1ec3277d3..522d6ad5f 100644 --- a/tests/unit/test_check_operators/test_string_comparison.py +++ b/tests/unit/test_check_operators/test_string_comparison.py @@ -22,6 +22,20 @@ DaskDataset, [True, True, False], ), + ( + {"VAR2": ["<40"], "target": [40]}, + "VAR2", + ".(.*)", + PandasDataset, + [False], + ), + ( + {"VAR2": ["<40"], "target": [40]}, + "VAR2", + ".(.*)", + DaskDataset, + [False], + ), ], ) def test_equals_string_part(data, comparator, regex, dataset_type, expected_result): @@ -33,6 +47,111 @@ def test_equals_string_part(data, comparator, regex, dataset_type, expected_resu assert result.equals(df.convert_to_series(expected_result)) +@pytest.mark.parametrize( + "data,comparator,operator,regex,dataset_type,expected_result", + [ + ( + {"VAR2": [">=40", "<=50"], "target": [40, 50]}, + "VAR2", + "equals_string_part", + ".{2}(.*)", + PandasDataset, + [True, True], + ), + ( + {"VAR2": [">=40", "<=50"], "target": [40, 50]}, + "VAR2", + "equals_string_part", + ".{2}(.*)", + DaskDataset, + [True, True], + ), + ( + {"VAR2": [">=42 ", "<=55 "], "target": [40, 50]}, + "VAR2", + "does_not_equal_string_part", + ".{2}(.*)", + PandasDataset, + [True, True], + ), + ( + {"VAR2": [">=42 ", "<=55 "], "target": [40, 50]}, + "VAR2", + "does_not_equal_string_part", + ".{2}(.*)", + DaskDataset, + [True, True], + ), + ( + {"VAR2": [">40", "<50"], "target": [40, 50]}, + "VAR2", + "equals_string_part", + ".(.*)", + DaskDataset, + [True, True], + ), + ( + {"VAR2": [">45", "<52"], "target": [40, 50]}, + "VAR2", + "does_not_equal_string_part", + ".(.*)", + PandasDataset, + [True, True], + ), + ( + {"VAR2": [">45", "<52"], "target": [40, 50]}, + "VAR2", + "does_not_equal_string_part", + ".(.*)", + DaskDataset, + [True, True], + ), + ( + {"VAR2": [">40", "<50"], "target": [40.0, 50.0]}, + "VAR2", + "equals_string_part", + ".(.*)", + PandasDataset, + [True, True], + ), + ( + {"VAR2": [">45", "<52"], "target": [40.0, 50.0]}, + "VAR2", + "does_not_equal_string_part", + ".(.*)", + DaskDataset, + [True, True], + ), + ], +) +def test_equals_string_part_type_insensitive( + data, comparator, operator, regex, dataset_type, expected_result +): + df = dataset_type.from_dict(data) + dataframe_type = DataframeType({"value": df}) + + if operator == "equals_string_part": + result = dataframe_type.equals_string_part( + { + "target": "target", + "comparator": comparator, + "regex": regex, + "type_insensitive": True, + } + ) + else: + result = dataframe_type.does_not_equal_string_part( + { + "target": "target", + "comparator": comparator, + "regex": regex, + "type_insensitive": True, + } + ) + + assert result.equals(df.convert_to_series(expected_result)) + + @pytest.mark.parametrize( "data,comparator,regex,dataset_type,value_is_literal,expected_result", [