Commit 169b43f

Merge pull request #45 from lambda-feedback/test-tr124-restructuring
Test tr124 restructuring
2 parents a79a280 + efa622f commit 169b43f

9 files changed

Lines changed: 229 additions & 26 deletions


app/context/symbolic.py

Lines changed: 41 additions & 5 deletions
@@ -40,6 +40,8 @@ def check_criterion(criterion, parameters_dict, generate_feedback=True):
         result = check_order(criterion, parameters_dict)
     elif label == "CONTAINS":
         result = check_contains_symbol(criterion, parameters_dict)
+    elif label == "PROPORTIONAL_TO":
+        result = check_proportionality(criterion, parameters_dict)
     elif label == "WHERE":
         crit = criterion.children[0]
         subs = criterion.children[1]
@@ -90,7 +92,10 @@ def do_comparison(comparison_symbol, expression):
         "<=": lambda expr: bool(expression.cancel().simplify().simplify() <= 0),
     }
     comparison = comparisons[comparison_symbol.strip()]
-    result = comparison(expression)
+    try:
+        result = comparison(expression)
+    except Exception:
+        result = None
     return result

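The try/except above matters because SymPy refuses to coerce an undecided relation to a Python bool. A minimal sketch of the failure mode the new branch catches (the names below are illustrative, not from the repository):

from sympy import Symbol

x = Symbol("x", real=True)
expression = x - 1  # the sign depends on x, so "x - 1 <= 0" is undecided
try:
    result = bool(expression <= 0)
except TypeError:
    # SymPy raises "cannot determine truth value of Relational" here;
    # do_comparison now returns None in this situation instead of crashing.
    result = None
print(result)  # None, which the criteria graph later reports as _UNKNOWN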

@@ -146,6 +151,20 @@ def check_order(criterion, parameters_dict, local_substitutions=[]):
     return result


+def check_proportionality(criterion, parameters_dict, local_substitutions=[]):
+    lhs_expr, rhs_expr = create_expressions_for_comparison(criterion, parameters_dict, local_substitutions)
+    result = None
+    if lhs_expr.cancel().simplify().simplify() != 0:
+        result = (rhs_expr/lhs_expr).cancel().simplify()
+    elif rhs_expr.cancel().simplify().simplify() != 0:
+        result = (lhs_expr/rhs_expr).cancel().simplify()
+    if result == 0 or result is None:
+        result = False
+    else:
+        result = result.is_constant()
+    return result
+
+
 def check_contains_symbol(criterion, parameters_dict, local_substitutions=[]):
     lhs_expr, rhs_expr = create_expressions_for_comparison(criterion, parameters_dict, local_substitutions)
     result = rhs_expr in lhs_expr.atoms()
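The new check_proportionality divides one expression by the other and asks whether the simplified ratio is a non-zero constant. A standalone sketch of the same idea using plain SymPy expressions in place of create_expressions_for_comparison (the helper is_proportional is illustrative only):

from sympy import symbols, pi

a, b, c, x = symbols("a b c x")

def is_proportional(lhs, rhs):
    # Divide by whichever side is non-zero and test whether the ratio is constant.
    ratio = None
    if lhs.cancel().simplify() != 0:
        ratio = (rhs/lhs).cancel().simplify()
    elif rhs.cancel().simplify() != 0:
        ratio = (lhs/rhs).cancel().simplify()
    if ratio is None or ratio == 0:
        return False
    return bool(ratio.is_constant())

print(is_proportional(a+b+c, 2*a+2*b+2*c))  # True, the ratio simplifies to 2
print(is_proportional(a+b+c, pi*(a+b+c)))   # True, the ratio is the constant pi
print(is_proportional(a+b+c, x*(a+b+c)))    # False, the ratio x is not a constant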
@@ -218,10 +237,14 @@ def mathematical_equivalence(unused_input):
         return {
             label+"_TRUE": None
         }
-    else:
+    elif result is False:
         return {
             label+"_FALSE": None
         }
+    else:
+        return {
+            label+"_UNKNOWN": None
+        }

 def set_equivalence(unused_input):
     matches = {"responses": [False]*len(response_list), "answers": [False]*len(answer_list)}
@@ -328,7 +351,7 @@ def same_symbols(unused_input):
     ans = parameters_dict["reserved_expressions"]["answer"]
     use_equality_equivalence = isinstance(res, Equality) or isinstance(ans, Equality)

-    # TODO: Make checking set quivalence its own context that calls symbolic comparisons instead
+    # TODO: Make checking set equivalence its own context that calls symbolic comparisons instead
     if use_set_equivalence is True:
         graph.add_evaluation_node(
             label,
@@ -718,14 +741,19 @@ def criterion_eval_node(criterion, parameters_dict, generate_feedback=True):
     def evaluation_node_internal(unused_input):
         result = check_criterion(criterion, parameters_dict, generate_feedback)
         label = criterion.content_string()
-        if result:
+        if result is True:
             return {
                 label+"_TRUE": feedback_string_generator_inputs
             }
-        else:
+        elif result is False:
             return {
                 label+"_FALSE": feedback_string_generator_inputs
             }
+        else:
+            return {
+                label+"_UNKNOWN": feedback_string_generator_inputs
+            }
+
     label = criterion.content_string()
     graph = CriteriaGraph(label)
     END = CriteriaGraph.END
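The switch from `if result:` to explicit `is True` / `is False` checks is what lets a None result reach its own tag; under plain truthiness None is falsy and would have been reported as _FALSE. A tiny illustration:

label = "answer <= response"
for result in (True, False, None):
    if result is True:
        tag = label + "_TRUE"
    elif result is False:
        tag = label + "_FALSE"
    else:
        tag = label + "_UNKNOWN"  # None no longer collapses into the FALSE branch
    print(result, "->", tag)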
@@ -747,6 +775,14 @@ def evaluation_node_internal(unused_input):
         feedback_string_generator=symbolic_feedback_string_generators["GENERIC"]("FALSE")
     )
     graph.attach(label+"_FALSE", END.label)
+    graph.attach(
+        label,
+        label+"_UNKNOWN",
+        summary="True",
+        details=label+" is false.",
+        feedback_string_generator=symbolic_feedback_string_generators["GENERIC"]("FALSE")
+    )
+    graph.attach(label+"_UNKNOWN", END.label)
     return graph


app/docs/user.md

Lines changed: 6 additions & 2 deletions
@@ -43,8 +43,12 @@ The `criteria` parameter reserves `response` and `answer` as keywords that will

 | Name | Syntax | Description | Example |
 |-------|:-------------------------------|:------------------------------------|:--------------------|
-| EQUAL | `EXPRESSION = EXPRESSION` | Checks if the expressions are equal | `answer = response` |
-| WHERE | `EXPRESSION = EXPRESSION where EXPRESSION = EXPRESSION, ... , EXPRESSION = EXPRESSION` | Checks if the equality on the left side of `where` are equal if the equalities in the comma-separated list on the right side of `where` | `answer = response` |
+| EQUAL | `EXPRESSION = EXPRESSION` | Checks if the expressions are equal | `answer = response` - Default way to check equality of expressions |
+| ORDER | `EXPRESSION ORDER EXPRESSION` | Checks if the expressions satisfy the given order relation. The ORDER operator can be `>`, `<`, `>=` or `<=` | `answer > response` - Checks if the answer is greater than the response |
+| WHERE | `EXPRESSION = EXPRESSION where EXPRESSION = EXPRESSION; ... ; EXPRESSION = EXPRESSION` | Checks if the equality on the left side of `where` holds when the equalities in the semicolon-separated list on the right side of `where` are substituted into it | `answer = response where x = 0` - Checks if the curves given by the answer and the response intersect when $x=0$. |
+| WRITTEN_AS | `EXPRESSION written as EXPRESSION` | Syntactical comparison, checks if the two expressions are written the same way. | `response written as answer` - Checks if the response is written in the same form as the answer (e.g. if the answer is `(x+1)(x+2)` then the response `x^2+3x+2` will not satisfy the criterion but `(x+3)(x+4)` will). |
+| PROPORTIONAL | `EXPRESSION proportional to EXPRESSION` | Checks if one expression is equivalent to the other expression multiplied by some constant. | `answer proportional to response` |
+| CONTAINS | `EXPRESSION contains EXPRESSION` | Checks if the left expression has the right expression as a subexpression. | `response contains x` - Checks if the response contains the symbol x |


 ## `elementary_functions`
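As a usage illustration of the keywords in the criteria table above: criteria are passed as a comma-separated string through the `criteria` parameter, as in this sketch mirroring the tests added in app/tests/example_tests.py (the response and answer strings here are illustrative):

# evaluation_function is the entry point defined in app/evaluation.py,
# imported the same way the example tests import it.
params = {
    "strict_syntax": False,
    "elementary_functions": True,
    "criteria": "response proportional to answer, response contains x",
}
result = evaluation_function("2x + 2y", "x + y", params, include_test_data=True)
print(result["is_correct"], result["tags"])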

app/evaluation.py

Lines changed: 23 additions & 12 deletions
@@ -55,7 +55,7 @@ def determine_context(parameters):
         input_symbols_reserved_codes.append(input_symbol[0])
         input_symbols_reserved_aliases += [ip for ip in input_symbol[1] if len(ip.strip()) > 0]

-    reserved_keywords_codes = {"where", "written as"}
+    reserved_keywords_codes = {"where", "written as", "contains", "proportional to"}
     reserved_keywords_aliases = {"plus_minus", "minus_plus"}
     for re in parameters["reserved_expressions_strings"].values():
         reserved_keywords_aliases = reserved_keywords_aliases.union(set(re.keys()))
@@ -205,6 +205,7 @@ def generate_feedback(main_criteria, criteria_graphs, evaluation_parameters):
     response = evaluation_parameters["reserved_expressions"]["response"]
     criteria_feedback = set()
     is_correct = True
+    custom_feedback = evaluation_parameters.get("custom_feedback",{})
     for (criterion_identifier, graph) in criteria_graphs.items():
         # TODO: Find better way to identify main criteria for criteria graph
         main_criteria = criterion_identifier+"_TRUE"
@@ -219,7 +220,7 @@ def generate_feedback(main_criteria, criteria_graphs, evaluation_parameters):
         # assumption that some way to return partial feedback
         # before script has executed completely will be available
         # in the future
-        evaluation_result.add_feedback_from_tags(criteria_feedback, graph)
+        evaluation_result.add_feedback_from_tags(criteria_feedback, graph, custom_feedback=custom_feedback)
     evaluation_result.is_correct = is_correct
     return

@@ -239,12 +240,6 @@ def evaluation_function(response, answer, params, include_test_data=False) -> di

     parameters = deepcopy(params)

-    # CONSIDER: Can this be moved into the preprocessing procedures in a consistent way?
-    # Can it be turned into its own context? Or moved into the determine_context procedure?
-    # What solution will be most consistently reusable?
-    if parameters.get("is_latex", False):
-        response = parse_latex(response, parameters.get("symbols", {}), False)
-
     reserved_expressions_strings = {
         "learner": {
             "response": response
@@ -269,13 +264,31 @@ def evaluation_function(response, answer, params, include_test_data=False) -> di
     else:
         evaluation_result.latex = preview["latex"]
         evaluation_result.simplified = preview["sympy"]
+
+    reserved_expressions_keys = list(reserved_expressions_strings["learner"].keys())+list(reserved_expressions_strings["task"].keys())
     parameters.update(
         {
             "context": context,
-            "parsing_parameters": context["parsing_parameters_generator"](parameters),
+            "reserved_keywords": context["reserved_keywords"]+reserved_expressions_keys,
+        }
+    )
+    parsing_parameters = context["parsing_parameters_generator"](parameters, unsplittable_symbols=reserved_expressions_keys)
+    parameters.update(
+        {
+            "parsing_parameters": parsing_parameters,
         }
     )

+    # CONSIDER: Can this be moved into the preprocessing procedures in a consistent way?
+    # Can it be turned into its own context? Or moved into the determine_context procedure?
+    # What solution will be most consistently reusable?
+    if parameters.get("is_latex", False):
+        parameters["reserved_expressions_strings"]["learner"].update(
+            {
+                "response": parse_latex(response, parameters.get("symbols", {}), False, parameters=parameters),
+            }
+        )
+
     # FIXME: Move this into expression_utilities
     if params.get("strict_syntax", True):
         if "^" in response:
@@ -287,13 +300,10 @@ def evaluation_function(response, answer, params, include_test_data=False) -> di
     if reserved_expressions_success is False:
         return evaluation_result.serialise(include_test_data)
     reserved_expressions_parsed = {**reserved_expressions["learner"], **reserved_expressions["task"]}
-    parameters.update({"reserved_keywords": parameters["context"]["reserved_keywords"]+list(reserved_expressions_parsed.keys())})

     criteria_parser = context["generate_criteria_parser"](reserved_expressions)
     criteria = create_criteria_dict(criteria_parser, parameters)

-    parsing_parameters = parameters["context"]["parsing_parameters_generator"](parameters, unsplittable_symbols=list(reserved_expressions_parsed.keys()))
-
     evaluation_parameters = FrozenValuesDictionary(
         {
             "reserved_expressions_strings": reserved_expressions_strings,
@@ -308,6 +318,7 @@ def evaluation_function(response, answer, params, include_test_data=False) -> di
             "numerical": parameters.get("numerical", False),
             "atol": parameters.get("atol", 0),
             "rtol": parameters.get("rtol", 0),
+            "custom_feedback": parameters.get("custom_feedback",{}),
         }
     )
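The new `custom_feedback` parameter lets a task author override the feedback string attached to a particular tag; the keys follow the `<criterion>_<RESULT>` pattern used in the tests further down. A sketch with illustrative message texts:

params = {
    "strict_syntax": False,
    "elementary_functions": True,
    "criteria": "answer <= response",
    "custom_feedback": {
        "answer <= response_TRUE": "Your expression is at least as large as the expected one.",
        "answer <= response_FALSE": "Your expression can be smaller than the expected one.",
    },
}
result = evaluation_function("x^2 + 1", "x^2", params, include_test_data=True)
print(result["feedback"])  # contains the custom string for whichever tag fired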

app/evaluation_tests.py

Lines changed: 3 additions & 3 deletions
@@ -23,10 +23,10 @@ class TestEvaluationFunction():
     """

     # Import tests that makes sure that mathematical expression comparison works as expected
-    from .tests.symbolic_evaluation_tests import TestEvaluationFunction as TestSymbolicComparison
+    #from .tests.symbolic_evaluation_tests import TestEvaluationFunction as TestSymbolicComparison

     # Import tests that makes sure that physical quantities are handled as expected
-    from .tests.physical_quantity_evaluation_tests import TestEvaluationFunction as TestQuantities
+    #from .tests.physical_quantity_evaluation_tests import TestEvaluationFunction as TestQuantities

     # Import tests that corresponds to examples in documentation and examples module
     from .tests.example_tests import TestEvaluationFunction as TestExamples
@@ -81,4 +81,4 @@ def test_CHEM40002_1_5_instance_2024_25(self):


 if __name__ == "__main__":
-    pytest.main(['-k not slow', '--tb=line', '--durations=10', os.path.abspath(__file__)])
+    pytest.main(['-xk not slow', '--tb=short', '--durations=10', os.path.abspath(__file__)])
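For reference, the updated pytest.main call is roughly equivalent to the command line below: -x stops at the first failure, -k "not slow" deselects tests marked slow, --tb=short shortens tracebacks and --durations=10 lists the ten slowest tests.

pytest -x -k "not slow" --tb=short --durations=10 app/evaluation_tests.py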

app/feedback/symbolic.py

Lines changed: 5 additions & 0 deletions
@@ -69,3 +69,8 @@
     "EXPONENTIAL": "Response and answer are both written on exponential form.", # None,
     "UNKNOWN": "The response is not written on the expected form.",
 }[tag]
+feedback_generators["PROPORTIONAL_TO"] = lambda tag: lambda inputs: {
+    "TRUE": None,
+    "FALSE": None,
+    "UNKNOWN": None,
+}[tag]
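The new PROPORTIONAL_TO entry currently maps every outcome to None, i.e. no canned feedback text yet. A filled-in version could look like the sketch below; the message strings are illustrative and not part of the commit:

feedback_generators["PROPORTIONAL_TO"] = lambda tag: lambda inputs: {
    "TRUE": None,  # a correct response usually needs no extra message
    "FALSE": "The response is not a constant multiple of the expected expression.",
    "UNKNOWN": "It could not be determined whether the response is proportional to the expected expression.",
}[tag]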

app/tests/example_tests.py

Lines changed: 136 additions & 0 deletions
@@ -507,6 +507,142 @@ def test_syntactical_comparison(self, response, answer, criteria, value, feedbac
         assert result["is_correct"] is value
         assert set(feedback_tags) == set(result["tags"])

+    @pytest.mark.parametrize(
+        "response, value, tags",
+        [
+            (
+                "2a+2b+2c",
+                True,
+                [
+                    "response proportional to answer_TRUE",
+                ],
+            ),
+            (
+                "a+2b+3c",
+                False,
+                [
+                    "response proportional to answer_FALSE",
+                ],
+            ),
+            (
+                "pi*(a+b+c)",
+                True,
+                [
+                    "response proportional to answer_TRUE",
+                ],
+            ),
+            (
+                "x*(a+b+c)",
+                False,
+                [
+                    "response proportional to answer_FALSE",
+                ],
+            ),
+        ]
+    )
+    def test_custom_comparison_with_criteria_proportional(self, response, value, tags):
+        params = {
+            "strict_syntax": False,
+            "elementary_functions": True,
+            "criteria": "response proportional to answer",
+        }
+        answer = "a+b+c"
+        result = evaluation_function(response, answer, params, include_test_data=True)
+        assert result["is_correct"] is value
+        assert set(tags) == set(result["tags"])
+
+    @pytest.mark.parametrize(
+        "response, value, tags",
+        [
+            (
+                "2*x^2+0.5+0.25*sin(x)^2",
+                False,
+                [
+                    "answer <= response_TRUE",
+                    "2+answer > response_UNKNOWN",
+                ]
+            ),
+        ]
+    )
+    def test_custom_comparison_with_criteria_order(self, response, value, tags):
+        params = {
+            "strict_syntax": False,
+            "elementary_functions": True,
+            "criteria": "answer <= response, 2+answer > response",
+            "symbol_assumptions": "('x', 'real')",
+        }
+        answer = "2*x^2"
+        result = evaluation_function(response, answer, params, include_test_data=True)
+        assert result["is_correct"] is value
+        assert set(tags) == set(result["tags"])
+
+    @pytest.mark.parametrize(
+        "response, value, tags",
+        [
+            (
+                "pi*n",
+                True,
+                [
+                    "sin(response)=0_TRUE",
+                    "sin(response)=0_SAME_SYMBOLS_TRUE",
+                    "response contains n_TRUE",
+                ],
+            ),
+        ]
+    )
+    def test_custom_comparison_with_criteria_contains(self, response, value, tags):
+        params = {
+            "strict_syntax": False,
+            "elementary_functions": True,
+            "criteria": "sin(response)=0, response contains n",
+            "symbols": {
+                "n": {
+                    "latex": r"\(n\)",
+                    "aliases": ["i", "k", "N", "I", "K"],
+                },
+            },
+            "symbol_assumptions": "('n', 'integer')"
+        }
+        answer = "0"
+        result = evaluation_function(response, answer, params, include_test_data=True)
+        assert result["is_correct"] is value
+        assert set(tags) == set(result["tags"])
+
+    @pytest.mark.parametrize(
+        "response, answer, criteria, value, feedback_tags, custom_feedback, additional_params",
+        [
+            (
+                "2*x^2+0.5+0.25*sin(x)^2",
+                "2x^2",
+                "answer <= response, 2+answer > response",
+                False,
+                [
+                    "answer <= response_TRUE",
+                    "2+answer > response_UNKNOWN",
+                ],
+                {
+                    "answer <= response_TRUE": "AAA",
+                    "2+answer > response_UNKNOWN": "BBB",
+                },
+                {
+                    "symbol_assumptions": "('x', 'real')",
+                }
+            ),
+        ]
+    )
+    def test_criteria_custom_feedback(self, response, answer, criteria, value, feedback_tags, custom_feedback, additional_params):
+        params = {
+            "strict_syntax": False,
+            "elementary_functions": True,
+            "criteria": criteria,
+            "custom_feedback": custom_feedback,
+        }
+        params.update(additional_params)
+        result = evaluation_function(response, answer, params, include_test_data=True)
+        assert result["is_correct"] is value
+        assert set(feedback_tags) == set(result["tags"])
+        for string in custom_feedback.values():
+            assert string in result["feedback"]

 if __name__ == "__main__":
     pytest.main(['-sk not slow', "--tb=line", os.path.abspath(__file__)])
