Merge pull request #46 from lambda-feedback/test-tr124-restructuring

KarlLundengaard · web-flow · commit bb643237c4c6 · 2025-07-23T13:13:02.000Z
Test tr124 restructuring
diff --git a/app/context/physical_quantity.py b/app/context/physical_quantity.py
@@ -279,12 +279,12 @@ def quantity_match(unused_inputs):
             #       numerical tolerances can be applied appropriately
             if parsing_params.get('rtol', 0) > 0 or parsing_params.get('atol', 0) > 0:
                 if (lhs_string == 'answer' and rhs_string == 'response') or (lhs_string == 'response' and rhs_string == 'answer'):
-                    ans = parameters["reserved_expressions"]["answer"]["standard"]["value"]
-                    res = parameters["reserved_expressions"]["response"]["standard"]["value"]
+                    ans = parameters["reserved_expressions"]["answer"]["standard"]["value"].simplify()
+                    res = parameters["reserved_expressions"]["response"]["standard"]["value"].simplify()
                 if (ans is not None and ans.is_constant()) and (res is not None and res.is_constant()):
-                    if parsing_params.get('rtol', 0) > 0:
+                    if parsing_params.get('rtol', 0) > 0 and (ans != 0):
                         value_match = bool(abs(float((ans-res)/ans)) < parsing_params['rtol'])
-                    elif parsing_params.get('atol', 0) > 0:
+                    elif parsing_params.get('atol', 0) > 0 or (ans == 0):
                         value_match = bool(abs(float(ans-res)) < parsing_params['atol'])
 
         substitutions = [(key, expr["standard"]["unit"]) for (key, expr) in reserved_expressions]
@@ -541,20 +541,6 @@ def expression_preprocess(name, expr, parameters):
             expr = expr[0:match_content.span()[0]]+match_content.group().replace("*", " ")+expr[match_content.span()[1]:]
             match_content = re.search(search_string, expr)
 
-    prefixes = set(x[0] for x in set_of_SI_prefixes)
-    fundamental_units = set(x[0] for x in set_of_SI_base_unit_dimensions)
-    units_string = parameters["units_string"]
-    valid_units = set()
-    for key in units_sets_dictionary.keys():
-        if key in units_string:
-            for unit in units_sets_dictionary[key]:
-                valid_units = valid_units.union(set((unit[0], unit[1])+unit[3]+unit[4]))
-    dimensions = set(x[2] for x in set_of_SI_base_unit_dimensions)
-    unsplittable_symbols = list(prefixes | fundamental_units | valid_units | dimensions)
-    preprocess_parameters = deepcopy(parameters)
-    # TODO: find better way to prevent preprocessing from mangling reserved keywords for physical quantity criteria
-    preprocess_parameters.update({"reserved_keywords": preprocess_parameters.get("reserved_keywords", [])+unsplittable_symbols+['matches']})
-    expr = substitute_input_symbols(expr.strip(), preprocess_parameters)[0]
     success = True
     return success, expr, None
 
@@ -572,7 +558,9 @@ def feedback_string_generator(tags, graph, parameters_dict):
 def parsing_parameters_generator(params, unsplittable_symbols=tuple(), symbol_assumptions=tuple()):
     parsing_parameters = create_sympy_parsing_params(params)
     parsing_parameters.update({
-        "strictness": params.get("strictness", "natural")
+        "strictness": params.get("strictness", "natural"),
+        "rtol": float(params.get("rtol", 0)),
+        "atol": float(params.get("atol", 0)),
     })
     return parsing_parameters
 
diff --git a/app/context/symbolic.py b/app/context/symbolic.py
@@ -101,46 +101,60 @@ def do_comparison(comparison_symbol, expression):
 
 def check_equality(criterion, parameters_dict, local_substitutions=[]):
     lhs_expr, rhs_expr = create_expressions_for_comparison(criterion, parameters_dict, local_substitutions)
-    result = do_comparison(criterion.content, lhs_expr-rhs_expr)
-
-    # TODO: Make numerical comparison its own context
-    if result is False:
-        error_below_rtol = None
-        error_below_atol = None
-        if parameters_dict.get("numerical", False) or float(parameters_dict.get("rtol", 0)) > 0 or float(parameters_dict.get("atol", 0)) > 0:
-            # REMARK: 'pi' should be a reserved symbol but it is sometimes not treated as one, possibly because of input symbols.
-            # The two lines below this comments fixes the issue but a more robust solution should be found for cases where there
-            # are other reserved symbols.
-            def replace_pi(expr):
-                pi_symbol = pi
-                for s in expr.free_symbols:
-                    if str(s) == 'pi':
-                        pi_symbol = s
-                return expr.subs(pi_symbol, float(pi))
-            # NOTE: This code assumes that the left hand side is the response and the right hand side is the answer
-            # Separates LHS and RHS, parses and evaluates them
-            res = N(replace_pi(lhs_expr))
-            ans = N(replace_pi(rhs_expr))
-            if float(parameters_dict.get("atol", 0)) > 0:
-                try:
-                    absolute_error = abs(float(ans-res))
-                    error_below_atol = bool(absolute_error < float(parameters_dict["atol"]))
-                except TypeError:
-                    error_below_atol = None
-            else:
-                error_below_atol = True
-            if float(parameters_dict.get("rtol", 0)) > 0:
-                try:
-                    relative_error = abs(float((ans-res)/ans))
-                    error_below_rtol = bool(relative_error < float(parameters_dict["rtol"]))
-                except TypeError:
-                    error_below_rtol = None
-            else:
-                error_below_rtol = True
-            if error_below_atol is None or error_below_rtol is None:
-                result = False
-            elif error_below_atol is True and error_below_rtol is True:
-                result = True
+    if isinstance(lhs_expr, Equality) and not isinstance(rhs_expr, Equality):
+        result = False
+    elif not isinstance(lhs_expr, Equality) and isinstance(rhs_expr, Equality):
+        result = False
+    else:
+        result = do_comparison(criterion.content, lhs_expr-rhs_expr)
+        # There are some types of expression, e.g. those containing hyperbolic trigonometric functions, that can behave
+        # unpredictably when simplification is applied. For that reason we check several different combinations of
+        # simplifications here in order to reduce the likelihood of false negatives.
+        if result is False:
+            result = do_comparison(criterion.content, lhs_expr-rhs_expr.simplify())
+        if result is False:
+            result = do_comparison(criterion.content, lhs_expr.simplify()-rhs_expr)
+        if result is False:
+            result = do_comparison(criterion.content, lhs_expr.simplify()-rhs_expr.simplify())
+
+        # TODO: Make numerical comparison its own context
+        if result is False:
+            error_below_rtol = None
+            error_below_atol = None
+            if parameters_dict.get("numerical", False) or float(parameters_dict.get("rtol", 0)) > 0 or float(parameters_dict.get("atol", 0)) > 0:
+                # REMARK: 'pi' should be a reserved symbol but it is sometimes not treated as one, possibly because of input symbols.
+                # The helper below this comment works around the issue, but a more robust solution should be found for cases where
+                # there are other reserved symbols.
+                def replace_pi(expr):
+                    pi_symbol = pi
+                    for s in expr.free_symbols:
+                        if str(s) == 'pi':
+                            pi_symbol = s
+                    return expr.subs(pi_symbol, float(pi))
+                # NOTE: This code assumes that the left hand side is the response and the right hand side is the answer
+                # Separates LHS and RHS, parses and evaluates them
+                res = N(replace_pi(lhs_expr))
+                ans = N(replace_pi(rhs_expr))
+                if float(parameters_dict.get("atol", 0)) > 0:
+                    try:
+                        absolute_error = abs(float(ans-res))
+                        error_below_atol = bool(absolute_error < float(parameters_dict["atol"]))
+                    except TypeError:
+                        error_below_atol = None
+                else:
+                    error_below_atol = True
+                if float(parameters_dict.get("rtol", 0)) > 0:
+                    try:
+                        relative_error = abs(float((ans-res)/ans))
+                        error_below_rtol = bool(relative_error < float(parameters_dict["rtol"]))
+                    except TypeError:
+                        error_below_rtol = None
+                else:
+                    error_below_rtol = True
+                if error_below_atol is None or error_below_rtol is None:
+                    result = False
+                elif error_below_atol is True and error_below_rtol is True:
+                    result = True
 
     return result
 
@@ -252,7 +266,12 @@ def set_equivalence(unused_input):
             result = None
             for j, answer in enumerate(answer_list):
                 current_pair = [("response", response), ("answer", answer)]
-                result = check_equality(criterion, parameters_dict, local_substitutions=current_pair)
+                if isinstance(response, Equality) and not isinstance(answer, Equality):
+                    result = False
+                elif not isinstance(response, Equality) and isinstance(answer, Equality):
+                    result = False
+                else:
+                    result = check_equality(criterion, parameters_dict, local_substitutions=current_pair)
                 if result is True:
                     matches["responses"][i] = True
                     matches["answers"][j] = True
@@ -397,6 +416,14 @@ def same_symbols(unused_input):
             details="Checks if "+str(lhs)+" is equivalent to "+str(rhs)+".",
             evaluate=equality_equivalence
         )
+        graph.attach(
+            label,
+            label+"_UNKNOWN",
+            summary="Cannot determine if "+str(lhs)+" is equivalent to "+str(rhs),
+            details="Cannot determine if "+str(lhs)+" is equivalent to "+str(rhs)+".",
+            feedback_string_generator=symbolic_feedback_string_generators["INTERNAL"]("EQUALITY_EQUIVALENCE_UNKNOWN")
+        )
+        graph.attach(label+"_UNKNOWN", END.label)
         graph.attach(
             label,
             label+"_TRUE",
@@ -474,6 +501,14 @@ def same_symbols(unused_input):
             feedback_string_generator=symbolic_feedback_string_generators["response=answer"]("FALSE")
         )
         graph.attach(label+"_FALSE", END.label)
+        graph.attach(
+            label,
+            label+"_UNKNOWN",
+            summary="Cannot determine if "+str(lhs)+"="+str(rhs),
+            details="Cannot determine if "+str(lhs)+" is equal to "+str(rhs)+".",
+            feedback_string_generator=symbolic_feedback_string_generators["response=answer"]("UNKNOWN")
+        )
+        graph.attach(label+"_UNKNOWN", END.label)
     return graph
 
 
diff --git a/app/evaluation_tests.py b/app/evaluation_tests.py
@@ -23,13 +23,13 @@ class TestEvaluationFunction():
     """
 
     # Import tests that makes sure that mathematical expression comparison works as expected
-    #from .tests.symbolic_evaluation_tests import TestEvaluationFunction as TestSymbolicComparison
+    from .tests.symbolic_evaluation_tests import TestEvaluationFunction as TestSymbolicComparison
 
     # Import tests that makes sure that physical quantities are handled as expected
-    #from .tests.physical_quantity_evaluation_tests import TestEvaluationFunction as TestQuantities
+    from .tests.physical_quantity_evaluation_tests import TestEvaluationFunction as TestQuantities
 
     # Import tests that corresponds to examples in documentation and examples module
-    from .tests.example_tests import TestEvaluationFunction as TestExamples
+    #from .tests.example_tests import TestEvaluationFunction as TestExamples
 
     def test_eval_function_can_handle_latex_input(self):
         response = r"\sin x + x^{7}"
diff --git a/app/feedback/symbolic.py b/app/feedback/symbolic.py
@@ -25,6 +25,7 @@
     "EQUALITY_NOT_EXPRESSION": "The response was an equality but was expected to be an expression.",
     "EQUALITIES_EQUIVALENT": None,
     "EQUALITIES_NOT_EQUIVALENT": "The response is not the expected equality.",
+    "EQUALITY_EQUIVALENCE_UNKNOWN": "Cannot determine if the given equality is equivalent to the expected equality.",
     "WITHIN_TOLERANCE": None,  # "The difference between the response the answer is within specified error tolerance.",
     "NOT_NUMERICAL": None,  # "The expression cannot be evaluated numerically.",
 }[tag]
diff --git a/app/preview_implementations/symbolic_preview.py b/app/preview_implementations/symbolic_preview.py
@@ -108,7 +108,7 @@ def preview_function(response: str, params: Params) -> Result:
                 sympy_out = []
                 for expression in expression_list:
                     latex_out.append(sympy_to_latex(expression, symbols, settings={"mul_symbol": r" \cdot "}))
-                    sympy_out.append(str(expression))
+                    sympy_out.append(response)
 
             if len(sympy_out) == 1:
                 sympy_out = sympy_out[0]
diff --git a/app/tests/physical_quantity_evaluation_tests.py b/app/tests/physical_quantity_evaluation_tests.py
@@ -325,6 +325,66 @@ def test_legacy_strictness(self):
         result = evaluation_function(res, ans, params, include_test_data=True)
         assert result["is_correct"] is True
 
+    def test_physical_quantity_with_rtol(self):
+        ans = "7500 m/s"
+        res = "7504.1 m/s"
+        params = {
+            'rtol': 0.05,
+            'strict_syntax': False,
+            'physical_quantity': True,
+            'elementary_functions': True,
+        }
+        result = evaluation_function(res, ans, params, include_test_data=True)
+        assert result["is_correct"] is True
+
+    def test_physical_quantity_with_atol(self):
+        ans = "7500 m/s"
+        res = "7504.1 m/s"
+        params = {
+            'atol': 5,
+            'strict_syntax': False,
+            'physical_quantity': True,
+            'elementary_functions': True,
+        }
+        result = evaluation_function(res, ans, params, include_test_data=True)
+        assert result["is_correct"] is True
+
+#    def test_rad_vs_Hz(self):
+#        ans = "28.53 rad/s"
+#        res = "4.5405 H"
+#        params = {
+#            'rtol': 0.03,
+#            'strict_syntax': False,
+#            'physical_quantity': True,
+#            'elementary_functions': True,
+#        }
+#        result = evaluation_function(res, ans, params, include_test_data=True)
+#        assert result["is_correct"] is True
+
+    def test_tolerance_given_as_string(self):
+        ans = "4.52 kg"
+        res = "13.74 kg"
+        params = {
+            'rtol': '0.015',
+            'strict_syntax': False,
+            'physical_quantity': True,
+            'elementary_functions': True,
+        }
+        result = evaluation_function(res, ans, params, include_test_data=True)
+        assert result["is_correct"] is False
+
+    def test_answer_zero_value(self):
+        ans = "0 m"
+        res = "1 m"
+        params = {
+            'rtol': 0,
+            'atol': 0,
+            'strict_syntax': False,
+            'physical_quantity': True,
+            'elementary_functions': True,
+        }
+        result = evaluation_function(res, ans, params, include_test_data=True)
+        assert result["is_correct"] is False
 
 if __name__ == "__main__":
     pytest.main(['-xk not slow', "--no-header", os.path.abspath(__file__)])
diff --git a/app/tests/symbolic_evaluation_tests.py b/app/tests/symbolic_evaluation_tests.py
@@ -1881,5 +1881,84 @@ def test_alternatives_to_input_symbols_takes_priority_over_elementary_function_a
         result = evaluation_function(response, answer, params)
         assert result["is_correct"] is False
 
+    def test_elementary_function_symbol_with_implicit_multiplication_on_both_sides(self):
+        response = "momega^3 l^2/2(exp(2omegat)-exp(-2omegat))"
+        answer = "m*omega^3*l^2*sinh(2*omega*t)"
+        params = {
+            'strict_syntax': False,
+            'elementary_functions': True,
+        }
+        result = evaluation_function(response, answer, params)
+        assert result["is_correct"] is True
+
+    def test_response_for_which_correctness_cannot_be_determined(self):
+        response = "2 pi e^{-a |omega|}" # The expression in {...} is interpreted as elements in a set instead of a math expression
+        answer = "2 pi e^(-a|omega|)"
+        params = {
+            'atol': 0,
+            'rtol': 0,
+            'strict_syntax': False,
+            'physical_quantity': False,
+            'elementary_functions': True,
+        }
+        result = evaluation_function(response, answer, params)
+        assert result["is_correct"] is False
+
+    def test_unexpected_equalities_in_response_that_generates_set(self):
+        response = "z= plus_minus 1 + 2*i" # plus_minus generates a set of two equalities
+        answer = "2i plus_minus 1" # plus_minus generates a set of two expressions
+        params = {
+            'atol': 0,
+            'rtol': 0,
+            'strict_syntax': False,
+            'physical_quantity': False,
+            'elementary_functions': True,
+        }
+        result = evaluation_function(response, answer, params)
+        assert result["is_correct"] is False
+
+    def test_infinity_alias(self):
+        response = "2.694"
+        answer = "infinity"
+        params = {
+            'strict_syntax': False,
+            'elementary_functions': True,
+        }
+        result = evaluation_function(response, answer, params)
+        assert result["is_correct"] is False
+
+    def test_equality_expression_mismatch_with_custom_criteria(self):
+        response = "x-0.883x^2=0.251X^3"
+        answer = "0.4842x-0.1163x^3"
+        params = {
+            'atol': 0,
+            'rtol': 0.03,
+            'criteria': 'response=answer where x=0, diff(response,x)=diff(answer,x) where x=0, diff(response,x,2)=diff(answer,x,2) where x=0, diff(response,x,3)=diff(answer,x,3) where x=0',
+            'strict_syntax': False,
+            'physical_quantity': False,
+            'elementary_functions': True,
+        }
+        result = evaluation_function(response, answer, params)
+        assert result["is_correct"] is False
+
+    def test_input_symbols_takes_priority_when_containing_elementary_function_names_as_substring(self):
+        response = "Vmax - Vmaxe^-(t/tau)"
+        answer = "Vmax*(1-exp(-t/tau))"
+        params = {
+            "atol": 0,
+            "rtol": 0,
+            "strict_syntax": False,
+            "elementary_functions": True,
+            "physical_quantity": False,
+            "symbols": {
+                "Vmax": {"aliases": ["V_max"], "latex": r"$V_{max}$"},
+                "RS": {"aliases": ["R_S", "rs"], "latex": r"$R_S$"},
+                "RF": {"aliases": ["R_F", "rf"], "latex": r"$R_f$"},
+                "tau": {"aliases": [], "latex": r"$\tau$"},
+            },
+        }
+        result = evaluation_function(response, answer, params)
+        assert result["is_correct"] is True
+
 if __name__ == "__main__":
     pytest.main(['-xk not slow', "--tb=line", '--durations=10', os.path.abspath(__file__)])
diff --git a/app/utility/expression_utilities.py b/app/utility/expression_utilities.py