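Update: modify 24 file(s) — lint cleanup: drop unused local assignments, replace `== True`/`== False` and `== <type>` comparisons in tests with truthiness/`is` checks, mark availability-probe imports with `# noqa: F401`, add a `TYPE_CHECKING`-only import, and export `SectionGenerator`/`VisualizationEngine` from the HTML reporting package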

vawsgit · vawsgit · commit cd739a245e0b · 2026-01-07T16:36:43.000-06:00
diff --git a/examples/scripts/aggregate_metrics_demo.py b/examples/scripts/aggregate_metrics_demo.py
@@ -212,7 +212,7 @@ def demonstrate_deprecation_warning():
         warnings.simplefilter("always")
 
         # This will trigger a deprecation warning
-        field = ComparableField(aggregate=True, comparator=ExactComparator())
+        ComparableField(aggregate=True, comparator=ExactComparator())
 
         if w:
             print(f"   Warning: {w[0].message}")
diff --git a/examples/scripts/bulk_evaluation_demo.py b/examples/scripts/bulk_evaluation_demo.py
@@ -249,7 +249,7 @@ def demo_evaluation_with_output():
         evaluator.update(gt_doc, pred_doc, doc_id)
 
     # Get results and save metrics
-    result = evaluator.compute()
+    evaluator.compute()
     evaluator.save_metrics(metrics_file)
 
     print("\n💾 Output Files Created:")
@@ -341,7 +341,7 @@ def main():
     print("=" * 60)
 
     # Demo 1: Basic bulk evaluation
-    evaluator = demo_basic_bulk_evaluation()
+    demo_basic_bulk_evaluation()
 
     # Demo 2: Batch processing
     demo_batch_processing()
diff --git a/src/stickler/comparators/__init__.py b/src/stickler/comparators/__init__.py
@@ -16,15 +16,19 @@
 
 # Import BERTComparator if evaluate is available
 try:
-    from stickler.comparators.bert import BERTComparator
+    from stickler.comparators.bert import BERTComparator  # noqa: F401
 
     BERT_AVAILABLE = True
 except ImportError:
     BERT_AVAILABLE = False
 
 # Import FuzzyComparator and Fuzz alias only if rapidfuzz is available
 try:
-    from stickler.comparators.fuzzy import RAPIDFUZZ_AVAILABLE, Fuzz, FuzzyComparator
+    from stickler.comparators.fuzzy import (  # noqa: F401
+        RAPIDFUZZ_AVAILABLE,
+        Fuzz,
+        FuzzyComparator,
+    )
 except ImportError:
     RAPIDFUZZ_AVAILABLE = False
 
diff --git a/src/stickler/reporting/html/__init__.py b/src/stickler/reporting/html/__init__.py
@@ -9,5 +9,7 @@
 
 __all__ = [
     "EvaluationHTMLReporter",
-    "ReportConfig"
+    "ReportConfig",
+    "SectionGenerator",
+    "VisualizationEngine"
 ]
diff --git a/src/stickler/structured_object_evaluator/models/configuration_helper.py b/src/stickler/structured_object_evaluator/models/configuration_helper.py
@@ -5,9 +5,14 @@
 """
 
 import inspect
-from typing import Any, Dict, Union, get_args, get_origin
+from typing import TYPE_CHECKING, Any, Dict, Union, get_args, get_origin
 
 from stickler.comparators.levenshtein import LevenshteinComparator
+
+if TYPE_CHECKING:
+    from stickler.structured_object_evaluator.models.comparison_info import (
+        ComparableFieldConfig,
+    )
 from stickler.comparators.structured import StructuredModelComparator
 
 
diff --git a/tests/common/comparators/test_llm.py b/tests/common/comparators/test_llm.py
@@ -167,7 +167,7 @@ def test_compare_with_custom_prompt(self, mock_bedrock):
         mock_bedrock.return_value = mock_client
 
         custom_prompt = "Custom prompt {value1} vs {value2}"
-        comparator = LLMComparator(
+        LLMComparator(
             model_name="test-model", prompt_template=custom_prompt
         )
 
@@ -380,7 +380,7 @@ def test_get_comparison_details_error_handling(self):
         
         assert "error" in details
         assert "comparison_result" in details
-        assert details["comparison_result"] == False
+        assert not details["comparison_result"]
 
     def test_string_representation(self):
         """Test string representations for serialization."""
@@ -477,20 +477,20 @@ def test_get_comparison_details_comprehensive_error_handling(self):
         self.mock_agent.side_effect = NoCredentialsError()
         details = self.comparator.get_comparison_details("value1", "value2")
         assert "error" in details
-        assert details["comparison_result"] == False
+        assert not details["comparison_result"]
         
         # Test ClientError
         error_response = {'Error': {'Code': 'ThrottlingException', 'Message': 'Rate exceeded'}}
         self.mock_agent.side_effect = ClientError(error_response, 'InvokeModel')
         details = self.comparator.get_comparison_details("value1", "value2")
         assert "error" in details
-        assert details["comparison_result"] == False
+        assert not details["comparison_result"]
         
         # Test generic exception
         self.mock_agent.side_effect = Exception("Generic error")
         details = self.comparator.get_comparison_details("value1", "value2")
         assert "error" in details
-        assert details["comparison_result"] == False
+        assert not details["comparison_result"]
 
     def test_model_initialization_error(self):
         """Test error handling during model initialization."""
diff --git a/tests/structured_object_evaluator/test_aggregate_contact_issue.py b/tests/structured_object_evaluator/test_aggregate_contact_issue.py
@@ -80,7 +80,6 @@ def test_contact_object_level_metrics_not_rollup():
     expected_contact_fa = 0  # No false alarms (both sides have 1 contact)
     expected_contact_fp = 1  # fp = fa + fd = 0 + 1 = 1
     expected_contact_tp = 0  # No true positives
-    expected_contact_tn = 0  # No true negatives
     expected_contact_fn = 0  # No false negatives
 
     # CURRENT BUG: These will fail because it's rolling up nested field metrics (fd=1, fa=1, fp=2)
diff --git a/tests/structured_object_evaluator/test_comparable_field_fix.py b/tests/structured_object_evaluator/test_comparable_field_fix.py
@@ -242,7 +242,7 @@ class TestModel(StructuredModel):
         assert isinstance(config.comparator, CustomTestComparator)
         assert config.threshold == 0.75
         assert config.weight == 2.5
-        assert config.clip_under_threshold == False
+        assert not config.clip_under_threshold
 
     def test_multiple_custom_comparators(self):
         """Test that multiple different custom comparators can coexist."""
diff --git a/tests/structured_object_evaluator/test_comparators.py b/tests/structured_object_evaluator/test_comparators.py
@@ -15,7 +15,7 @@
     FUZZY_AVAILABLE = False
 
 try:
-    from stickler.comparators.semantic import SemanticComparator
+    from stickler.comparators.semantic import SemanticComparator  # noqa: F401
 
     SEMANTIC_AVAILABLE = True
 except ImportError:
diff --git a/tests/structured_object_evaluator/test_confusion_matrix_metrics.py b/tests/structured_object_evaluator/test_confusion_matrix_metrics.py
@@ -356,7 +356,7 @@ def test_nested_structured_models():
 
     # Create test nested models
     details1 = SimpleModel(name="Details 1", count=1, description="First details")
-    details2 = SimpleModel(name="Details 2", count=2, description=None)
+    SimpleModel(name="Details 2", count=2, description=None)
 
     # Similar but not exact nested model
     details1_similar = SimpleModel(name="Details 1", count=1, description="First")
diff --git a/tests/structured_object_evaluator/test_field_comparison_collection.py b/tests/structured_object_evaluator/test_field_comparison_collection.py
@@ -405,7 +405,7 @@ def test_edge_case_empty_lists(self):
         
         # Should have comparisons for the items in model2 (false alarms)
         tag_comparisons = [fc for fc in field_comparisons if fc["expected_key"].startswith("tags[")]
-        score_comparisons = [fc for fc in field_comparisons if fc["expected_key"].startswith("scores[")]
+        [fc for fc in field_comparisons if fc["expected_key"].startswith("scores[")]
         
         # Should have false alarm entries
         if tag_comparisons:
@@ -430,7 +430,7 @@ def test_edge_case_mismatched_list_lengths(self):
         
         # Should have comparisons for all items
         tag_comparisons = [fc for fc in field_comparisons if fc["expected_key"].startswith("tags[")]
-        score_comparisons = [fc for fc in field_comparisons if fc["expected_key"].startswith("scores[")]
+        [fc for fc in field_comparisons if fc["expected_key"].startswith("scores[")]
         
         # Should have matches and non-matches
         tag_matches = [fc for fc in tag_comparisons if fc["match"]]
diff --git a/tests/structured_object_evaluator/test_json_handling.py b/tests/structured_object_evaluator/test_json_handling.py
@@ -108,7 +108,7 @@ def test_missing_fields_handling():
     assert pred.vendor_name is None
 
     # But all_fields_matched should be False due to missing fields
-    assert comparison["all_fields_matched"] == False
+    assert not comparison["all_fields_matched"]
 
     # Overall score should be calculated based on fields present in both models
     # with weights taken into account
@@ -163,7 +163,7 @@ def test_extra_fields_handling():
 
     # Overall score should be perfect
     assert comparison["overall_score"] == 1.0
-    assert comparison["all_fields_matched"] == True
+    assert comparison["all_fields_matched"]
 
 
 def test_compare_json_utility():
diff --git a/tests/structured_object_evaluator/test_json_schema_field_converter.py b/tests/structured_object_evaluator/test_json_schema_field_converter.py
@@ -39,10 +39,10 @@ def test_convert_all_primitive_types(self):
         assert "active" in field_definitions
 
         # Check types
-        assert field_definitions["name"][0] == str
-        assert field_definitions["age"][0] == int
-        assert field_definitions["price"][0] == float
-        assert field_definitions["active"][0] == bool
+        assert field_definitions["name"][0] is str
+        assert field_definitions["age"][0] is int
+        assert field_definitions["price"][0] is float
+        assert field_definitions["active"][0] is bool
 
         # Check required vs optional (via is_required)
         name_field = field_definitions["name"][1]
@@ -195,12 +195,12 @@ def test_convert_with_arrays_of_primitives(self):
 
         # Verify they are List types
         assert hasattr(tags_type, "__origin__")
-        assert tags_type.__origin__ == list
-        assert tags_type.__args__[0] == str
+        assert tags_type.__origin__ is list
+        assert tags_type.__args__[0] is str
 
         assert hasattr(scores_type, "__origin__")
-        assert scores_type.__origin__ == list
-        assert scores_type.__args__[0] == float
+        assert scores_type.__origin__ is list
+        assert scores_type.__args__[0] is float
 
     def test_convert_empty_properties(self):
         """Test converting empty properties dictionary."""
@@ -474,7 +474,7 @@ def test_resolve_ref_in_array_items(self):
         
         # Verify it's a List type
         assert hasattr(items_type, "__origin__")
-        assert items_type.__origin__ == list
+        assert items_type.__origin__ is list
         
         # Verify element is a StructuredModel subclass
         from stickler.structured_object_evaluator.models.structured_model import (
@@ -630,7 +630,7 @@ def test_array_of_objects(self):
         
         # Verify it's a List type
         assert hasattr(employees_type, "__origin__")
-        assert employees_type.__origin__ == list
+        assert employees_type.__origin__ is list
         
         # Verify element is a StructuredModel subclass
         from stickler.structured_object_evaluator.models.structured_model import (
@@ -693,10 +693,10 @@ def test_array_of_primitives_all_types(self):
         )
 
         # Check all array types
-        assert field_definitions["strings"][0].__args__[0] == str
-        assert field_definitions["integers"][0].__args__[0] == int
-        assert field_definitions["numbers"][0].__args__[0] == float
-        assert field_definitions["booleans"][0].__args__[0] == bool
+        assert field_definitions["strings"][0].__args__[0] is str
+        assert field_definitions["integers"][0].__args__[0] is int
+        assert field_definitions["numbers"][0].__args__[0] is float
+        assert field_definitions["booleans"][0].__args__[0] is bool
 
 
 class TestErrorHandling:
diff --git a/tests/structured_object_evaluator/test_model_schema.py b/tests/structured_object_evaluator/test_model_schema.py
@@ -140,7 +140,7 @@ def test_schema_serialization():
     # Test with nested model as well
     nested_schema = NestedTestModel.model_json_schema()
     nested_json = json.dumps(nested_schema)
-    parsed_nested = json.loads(nested_json)
+    json.loads(nested_json)
 
 
 def test_schema_validation_compatibility():
diff --git a/tests/structured_object_evaluator/test_nested_confusion_matrix_aggregation.py b/tests/structured_object_evaluator/test_nested_confusion_matrix_aggregation.py
@@ -170,7 +170,7 @@ def test_field_level_confusion_matrix_aggregation(self):
 
         # 5. Calculate the expected counts for each field type across all line items
         # We can use the line_items aggregate metrics to verify aggregation is working correctly
-        line_item_metrics = cm["fields"]["line_items"]["aggregate"]
+        cm["fields"]["line_items"]["aggregate"]
 
         # 6. Verify each field has correct counts based on our test data
         # Actual aggregated counts based on implementation behavior:
diff --git a/tests/structured_object_evaluator/test_nested_object_notebook_case.py b/tests/structured_object_evaluator/test_nested_object_notebook_case.py
@@ -372,6 +372,6 @@ def test_products_list_evaluation(self, ground_truth):
     test_case = TestNestedObjectNotebookCase()
     gt = create_ground_truth_order()
 
-    test_case.test_missing_item_case_detailed(gt, evaluator)
-    test_case.test_all_cases_comparison(gt, evaluator)
+    test_case.test_missing_item_case_detailed(gt)
+    test_case.test_all_cases_comparison(gt)
     test_case.test_products_list_evaluation(gt)
diff --git a/tests/structured_object_evaluator/test_performance_benchmark.py b/tests/structured_object_evaluator/test_performance_benchmark.py
@@ -70,7 +70,7 @@ def test_performance_simple_comparison():
     iterations = 50
     start = time.time()
     for _ in range(iterations):
-        result = gt.compare_with(pred, include_confusion_matrix=True)
+        gt.compare_with(pred, include_confusion_matrix=True)
     elapsed = time.time() - start
     
     avg_time = elapsed / iterations
@@ -131,7 +131,7 @@ def test_performance_nested_comparison():
     iterations = 50
     start = time.time()
     for _ in range(iterations):
-        result = gt.compare_with(pred, include_confusion_matrix=True, document_non_matches=True)
+        gt.compare_with(pred, include_confusion_matrix=True, document_non_matches=True)
     elapsed = time.time() - start
     
     avg_time = elapsed / iterations
@@ -177,7 +177,7 @@ def test_performance_large_list_comparison():
     iterations = 20
     start = time.time()
     for _ in range(iterations):
-        result = gt.compare_with(pred, include_confusion_matrix=True)
+        gt.compare_with(pred, include_confusion_matrix=True)
     elapsed = time.time() - start
     
     avg_time = elapsed / iterations
diff --git a/tests/structured_object_evaluator/test_scoring_consistency_fix.py b/tests/structured_object_evaluator/test_scoring_consistency_fix.py
@@ -118,7 +118,7 @@ def test_list_comparison_before_fix_demonstration(self):
 
         # Get individual threshold-applied scores
         self.test_individual_invoice_scores_with_thresholds()
-        individual_scores, individual_avg = (
+        _individual_scores, individual_avg = (
             self.individual_scores,
             self.expected_individual_avg,
         )
@@ -172,7 +172,7 @@ def test_expected_behavior_after_fix(self):
 
         # Get the individual scores (what users expect)
         self.test_individual_invoice_scores_with_thresholds()
-        individual_scores, individual_avg = (
+        _individual_scores, individual_avg = (
             self.individual_scores,
             self.expected_individual_avg,
         )
diff --git a/tests/structured_object_evaluator/test_structured_model_comparators.py b/tests/structured_object_evaluator/test_structured_model_comparators.py
@@ -124,7 +124,7 @@ def test_case_sensitivity():
     # Get scores for each field
     results = gt.compare_with(pred, evaluator_format=True)
 
-    strict_score = results["fields"]["strict_field"]["anls_score"]
+    results["fields"]["strict_field"]["anls_score"]
     standard_score = results["fields"]["standard_field"]["anls_score"]
 
     # Standard LevenshteinComparator normalizes to lowercase, so score should be 1.0
@@ -239,7 +239,7 @@ class FuzzyVariantsModel(StructuredModel):
     )
 
     # Get scores
-    results = gt.compare_with(pred, evaluator_format=True)
+    gt.compare_with(pred, evaluator_format=True)
 
     # We only care about the relative performance for this test
     # Test directly against FuzzyComparator methods to validate differences
@@ -304,7 +304,7 @@ class ThresholdModel(StructuredModel):
     raw_similarity = LevenshteinComparator().compare("Hello World", "Hello Wrld")
 
     # Get the scores from the evaluator
-    strict_score = results["fields"]["strict"]["anls_score"]
+    results["fields"]["strict"]["anls_score"]
     moderate_score = results["fields"]["moderate"]["anls_score"]
     lenient_score = results["fields"]["lenient"]["anls_score"]
 
diff --git a/tests/structured_object_evaluator/test_structured_model_schema.py b/tests/structured_object_evaluator/test_structured_model_schema.py
@@ -137,7 +137,7 @@ def test_schema_serialization():
     # Test with nested model as well
     nested_schema = NestedTestModel.model_json_schema()
     nested_json = json.dumps(nested_schema)
-    parsed_nested = json.loads(nested_json)
+    json.loads(nested_json)
 
 
 def test_schema_validation_compatibility():
diff --git a/tests/structured_object_evaluator/test_structured_object_metrics.py b/tests/structured_object_evaluator/test_structured_object_metrics.py
@@ -296,7 +296,7 @@ def get_base_metrics(cm_result, field_name):
         ), "Expected line_items in confusion matrix fields"
 
         # Get line_items confusion matrix metrics
-        line_items_cm = get_base_metrics(cm, "line_items")
+        get_base_metrics(cm, "line_items")
 
         # Verify that we have hierarchical structure for fields within line items
         expected_field_entries = ["description", "quantity", "unit_price", "total"]
diff --git a/tests/structured_object_evaluator/test_trees.py b/tests/structured_object_evaluator/test_trees.py
@@ -31,7 +31,7 @@ def test_make_tree_leaf_types(self):
         # Test boolean
         tree = ANLSTree.make_tree(True, is_gt=True)
         assert isinstance(tree, ANLSLeaf)
-        assert tree.obj == True
+        assert tree.obj
 
         # Test None
         tree = ANLSTree.make_tree(None, is_gt=True)
diff --git a/tests/structured_object_evaluator/test_universal_aggregate_field_comprehensive.py b/tests/structured_object_evaluator/test_universal_aggregate_field_comprehensive.py
@@ -163,7 +163,7 @@ def test_deprecation_warning_for_legacy_aggregate_parameter(self):
             warnings.simplefilter("always")
 
             # This should trigger a deprecation warning
-            field = ComparableField(aggregate=True)
+            ComparableField(aggregate=True)
 
             # Verify warning was triggered
             assert len(w) == 1
diff --git a/tests/test_compare_with_call_count.py b/tests/test_compare_with_call_count.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -9,5 +9,7 @@` |
| `9` | `9` | |
| `10` | `10` | `__all__ = [` |
| `11` | `11` | `"EvaluationHTMLReporter",` |
| `12` | | `- "ReportConfig"` |
| | `12` | `+ "ReportConfig",` |
| | `13` | `+ "SectionGenerator",` |
| | `14` | `+ "VisualizationEngine"` |
| `13` | `15` | `]` |