
Commit a10ae3a

Merge branch 'dev' of https://github.com/awslabs/stickler into feature/vincilb/json-export

2 parents 8e79d30 + f4f46e7

47 files changed

Lines changed: 1352 additions & 853 deletions


docs/docs/Guides/StructuredModel_compare_with_README.md

Lines changed: 2 additions & 1 deletion
@@ -366,7 +366,8 @@ result = model1.compare_with(
     add_confidence_metrics=True,  # Add AUROC confidence calibration metric
     evaluator_format=False,  # Format for evaluation tools
     recall_with_fd=False,  # Include FD in recall calculation
-    add_derived_metrics=True  # Add precision/recall/F1 metrics
+    add_derived_metrics=True,  # Add precision/recall/F1 metrics
+    document_field_comparisons=False  # Document all field-level comparisons
 )
 ```
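
For context, a minimal usage sketch of the updated call. The Order schema and its plain pydantic-style fields are hypothetical illustrations; only the compare_with() keyword arguments and the result["overall"] shape come from this commit:

from stickler import StructuredModel

# Hypothetical schema for illustration; real schemas may also use
# stickler-specific field options such as ComparableField.
class Order(StructuredModel):
    order_id: str
    total: float

gt = Order(order_id="A-100", total=99.5)
pred = Order(order_id="A-100", total=95.0)

result = gt.compare_with(
    pred,
    add_derived_metrics=True,          # add precision/recall/F1 metrics
    document_field_comparisons=False,  # new flag introduced in this commit
)
print(result["overall"]["precision"])  # overall scores dict, as in the demo scripts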

docs/docs/SDK-Docs/evaluator.md

Lines changed: 4 additions & 0 deletions
@@ -8,3 +8,7 @@
 ::: stickler.structured_object_evaluator.bulk_structured_model_evaluator.BulkStructuredModelEvaluator
     options:
       heading_level: 2
+
+::: stickler.structured_object_evaluator.bulk_structured_model_evaluator.aggregate_from_comparisons
+    options:
+      heading_level: 2

examples/scripts/non_match_analysis_demo.py

Lines changed: 9 additions & 3 deletions
@@ -110,7 +110,9 @@ def demonstrate_basic_evaluation(gt_order, pred_order):
     print("\n🔍 Basic Evaluation (No Non-Match Documentation)")
     print("=" * 60)
 
-    result = gt_order.compare_with(pred_order, evaluator_format=True, document_non_matches=False)
+    result = gt_order.compare_with(
+        pred_order, evaluator_format=True, document_non_matches=False
+    )
 
     print("Overall Scores:")
     print(f" Precision: {result['overall']['precision']:.3f}")
@@ -133,7 +135,9 @@ def demonstrate_enhanced_non_matches(gt_order, pred_order):
     print("\n🔍 Enhanced Non-Match Analysis")
     print("=" * 50)
 
-    result = gt_order.compare_with(pred_order, evaluator_format=True, document_non_matches=True)
+    result = gt_order.compare_with(
+        pred_order, evaluator_format=True, document_non_matches=True
+    )
 
     # Show non-matches
     non_matches = result.get("non_matches", [])
@@ -299,7 +303,9 @@ def main():
     demonstrate_compare_with_method(gt_order, pred_order)
 
     # Analyze non-matches for practical debugging
-    result = gt_order.compare_with(pred_order, evaluator_format=True, document_non_matches=True)
+    result = gt_order.compare_with(
+        pred_order, evaluator_format=True, document_non_matches=True
+    )
     non_matches = result.get("non_matches", [])
     analyze_non_matches_for_debugging(non_matches)

examples/scripts/quick_start.py

Lines changed: 3 additions & 1 deletion
@@ -198,7 +198,9 @@ def demo_evaluator_detailed_analysis():
 
     print("Evaluating similar but not identical orders...")
 
-    result = gt_order.compare_with(pred_order, include_confusion_matrix=True, evaluator_format=True)
+    result = gt_order.compare_with(
+        pred_order, include_confusion_matrix=True, evaluator_format=True
+    )
 
     print("\n📊 Overall Metrics:")
     print(f" Precision: {result['overall']['precision']:.3f}")
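
A short follow-on sketch of what include_confusion_matrix=True adds to the result. The top-level "confusion_matrix" key is required by update_from_comparison_result() in the evaluator diff below; the "overall" entry and its counter names ("fn", and by assumption "tp"/"fp") are inferred from that diff's _confusion_matrix["overall"]["fn"] accesses:

# Continues from the wrapped compare_with() call above.
cm = result["confusion_matrix"]  # present when include_confusion_matrix=True
print(cm.get("overall"))         # e.g. counters such as "fn"; exact keys may vary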

src/stickler/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,7 @@
     NonMatchField,
     NonMatchType,
     StructuredModel,
+    aggregate_from_comparisons,
     anls_score,
     compare_json,
     compare_structured_models,
@@ -25,4 +26,5 @@
     "compare_structured_models",
     "anls_score",
     "compare_json",
+    "aggregate_from_comparisons",
 ]

src/stickler/structured_object_evaluator/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@
 comparison metrics and displaying the results in a user-friendly format.
 """
 
+from .bulk_structured_model_evaluator import aggregate_from_comparisons
 from .models.comparable_field import ComparableField
 from .models.non_match_field import NonMatchField, NonMatchType
 from .models.structured_model import StructuredModel
@@ -19,6 +20,7 @@
     "compare_structured_models",
     "anls_score",
     "compare_json",
+    "aggregate_from_comparisons",
     "ScoreNode",
     "construct_nested_dict",
     "merge_and_calculate_mean",

src/stickler/structured_object_evaluator/bulk_structured_model_evaluator.py

Lines changed: 93 additions & 30 deletions
@@ -41,7 +41,7 @@ class BulkStructuredModelEvaluator:
 
     def __init__(
         self,
-        target_schema: Type[StructuredModel],
+        target_schema: Optional[Type[StructuredModel]] = None,
         verbose: bool = False,
         document_non_matches: bool = True,
         elide_errors: bool = False,
@@ -51,7 +51,9 @@ def __init__(
         Initialize the stateful bulk evaluator.
 
         Args:
-            target_schema: StructuredModel class for validation and processing
+            target_schema: Optional StructuredModel class for validation and processing.
+                Required for update() and evaluate_dataframe(). Not required when using
+                update_from_comparison_result() with pre-computed results.
             verbose: Whether to print detailed progress information
             document_non_matches: Whether to document detailed non-match information
             elide_errors: If True, skip documents with errors; if False, accumulate error metrics
@@ -66,10 +68,10 @@ def __init__(
         # Initialize state
         self.reset()
 
+        self._schema_name = target_schema.__name__ if target_schema else "unknown"
+
         if self.verbose:
-            print(
-                f"Initialized BulkStructuredModelEvaluator for {target_schema.__name__}"
-            )
+            print(f"Initialized BulkStructuredModelEvaluator for {self._schema_name}")
         if self.individual_results_jsonl:
             print(
                 f"Individual results will be appended to: {self.individual_results_jsonl}"
@@ -111,9 +113,8 @@ def update(
         """
         Process a single document pair and accumulate the results in internal state.
 
-        This is the core method for stateful evaluation, inspired by PyTorch Lightning's
-        training_step pattern. Each call processes one document pair and updates
-        the internal confusion matrix counters.
+        Runs compare_with() on the model pair, optionally writes the raw result
+        to JSONL, then delegates accumulation to update_from_comparison_result().
 
         Args:
             gt_model: Ground truth StructuredModel instance
@@ -124,29 +125,70 @@ def update(
         doc_id = f"doc_{self._processed_count}"
 
         try:
-            # Use compare_with method directly on the StructuredModel
-            # Pass document_non_matches to achieve parity with compare_with method
             comparison_result = gt_model.compare_with(
                 pred_model,
                 include_confusion_matrix=True,
                 document_non_matches=self.document_non_matches,
             )
 
-            # Collect non-matches if enabled
+            # JSONL append of raw comparison result before accumulation
+            if self.individual_results_jsonl:
+                record = {"doc_id": doc_id, "comparison_result": comparison_result}
+                with open(self.individual_results_jsonl, "a", encoding="utf-8") as f:
+                    f.write(json.dumps(record) + "\n")
+
+            self.update_from_comparison_result(comparison_result, doc_id)
+
+        except Exception as e:
+            error_record = {
+                "doc_id": doc_id,
+                "error": str(e),
+                "error_type": type(e).__name__,
+            }
+
+            if not self.elide_errors:
+                self._errors.append(error_record)
+                self._confusion_matrix["overall"]["fn"] += 1
+
+            if self.verbose:
+                print(f"Error processing document {doc_id}: {str(e)}")
+
+    def update_from_comparison_result(
+        self,
+        comparison_result: Dict[str, Any],
+        doc_id: Optional[str] = None,
+    ) -> None:
+        """
+        Accumulate a pre-computed compare_with() result into internal state.
+
+        Unlike update(), this method does not require StructuredModel instances
+        or re-run comparisons. It accepts the raw dictionary output of
+        StructuredModel.compare_with(include_confusion_matrix=True) and
+        accumulates its confusion matrix.
+
+        Args:
+            comparison_result: Dictionary returned by StructuredModel.compare_with()
+                with include_confusion_matrix=True. Must contain a "confusion_matrix" key.
+            doc_id: Optional document identifier for error tracking
+        """
+        if doc_id is None:
+            doc_id = f"doc_{self._processed_count}"
+
+        try:
+            if "confusion_matrix" not in comparison_result:
+                raise ValueError(
+                    "comparison_result must contain a 'confusion_matrix' key. "
+                    "Ensure compare_with() was called with include_confusion_matrix=True."
+                )
+
+            # Collect non-matches if enabled and present
             if self.document_non_matches and "non_matches" in comparison_result:
-                # Add doc_id to each non-match for bulk tracking
                 for non_match in comparison_result["non_matches"]:
                     non_match_with_doc = non_match.copy()
                     non_match_with_doc["doc_id"] = doc_id
                     self._non_matches.append(non_match_with_doc)
 
-            # Simple JSONL append of raw comparison result (before any processing)
-            if self.individual_results_jsonl:
-                record = {"doc_id": doc_id, "comparison_result": comparison_result}
-                with open(self.individual_results_jsonl, "a", encoding="utf-8") as f:
-                    f.write(json.dumps(record) + "\n")
-
-            # Accumulate the results into our state (this flattens for aggregation)
+            # Accumulate the confusion matrix
            self._accumulate_confusion_matrix(comparison_result["confusion_matrix"])
 
             self._processed_count += 1
@@ -164,9 +206,6 @@ def update(
 
             if not self.elide_errors:
                 self._errors.append(error_record)
-
-                # For errors, add a "failed" classification to overall metrics
-                # This represents complete failure to process the document
                 self._confusion_matrix["overall"]["fn"] += 1
 
             if self.verbose:
@@ -454,7 +493,7 @@ def save_metrics(self, filepath: str) -> None:
                 "error_rate": len(process_eval.errors) / self._processed_count
                 if self._processed_count > 0
                 else 0,
-                "target_schema": self.target_schema.__name__,
+                "target_schema": self._schema_name,
             },
             "errors": process_eval.errors,
             "metadata": {
@@ -491,7 +530,7 @@ def pretty_print_metrics(self) -> None:
 
         # Header
         print("\n" + "=" * 80)
-        print(f"BULK EVALUATION RESULTS - {self.target_schema.__name__}")
+        print(f"BULK EVALUATION RESULTS - {self._schema_name}")
         print("=" * 80)
 
         # Overall metrics
@@ -575,7 +614,7 @@ def pretty_print_metrics(self) -> None:
         # Configuration info
         print("\nCONFIGURATION:")
         print("-" * 40)
-        print(f"Target Schema: {self.target_schema.__name__}")
+        print(f"Target Schema: {self._schema_name}")
         print(f"Document Non-matches: {'Yes' if self.document_non_matches else 'No'}")
         print(f"Elide Errors: {'Yes' if self.elide_errors else 'No'}")
         if self.individual_results_jsonl:
@@ -606,7 +645,7 @@ def get_state(self) -> Dict[str, Any]:
             "processed_count": self._processed_count,
             "start_time": self._start_time,
             # Configuration
-            "target_schema": self.target_schema.__name__,
+            "target_schema": self._schema_name,
             "elide_errors": self.elide_errors,
         }
 
@@ -621,9 +660,9 @@ def load_state(self, state: Dict[str, Any]) -> None:
            state: State dictionary from get_state()
         """
         # Validate state compatibility
-        if state.get("target_schema") != self.target_schema.__name__:
+        if state.get("target_schema") != self._schema_name:
             raise ValueError(
-                f"State schema {state.get('target_schema')} doesn't match evaluator schema {self.target_schema.__name__}"
+                f"State schema {state.get('target_schema')} doesn't match evaluator schema {self._schema_name}"
             )
 
         # Restore confusion matrix state
@@ -658,9 +697,9 @@ def merge_state(self, other_state: Dict[str, Any]) -> None:
            other_state: State dictionary from another evaluator instance
         """
         # Validate compatibility
-        if other_state.get("target_schema") != self.target_schema.__name__:
+        if other_state.get("target_schema") != self._schema_name:
             raise ValueError(
-                f"Cannot merge incompatible schemas: {other_state.get('target_schema')} vs {self.target_schema.__name__}"
+                f"Cannot merge incompatible schemas: {other_state.get('target_schema')} vs {self._schema_name}"
             )
 
         # Merge overall metrics
@@ -722,3 +761,27 @@ def evaluate_dataframe(self, df) -> ProcessEvaluation:
                 continue
 
         return self.compute()
+
+
+def aggregate_from_comparisons(
+    comparison_results: List[Dict[str, Any]],
+) -> ProcessEvaluation:
+    """
+    Aggregate a list of pre-computed compare_with() results into field-level metrics.
+
+    This is a convenience function for aggregating stored comparison results
+    without needing the original StructuredModel instances. It accepts the raw
+    dictionary outputs of StructuredModel.compare_with(include_confusion_matrix=True).
+
+    Args:
+        comparison_results: List of dictionaries, each returned by
+            StructuredModel.compare_with(include_confusion_matrix=True).
+
+    Returns:
+        ProcessEvaluation with aggregated metrics including overall and
+        per-field precision, recall, F1, and accuracy.
+    """
+    evaluator = BulkStructuredModelEvaluator()
+    for result in comparison_results:
+        evaluator.update_from_comparison_result(result)
+    return evaluator.compute()
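
Taken together, the JSONL records that update() writes and the new aggregate_from_comparisons() function allow metrics to be recomputed offline, without the original StructuredModel instances. A minimal sketch, assuming a results.jsonl previously written by an evaluator constructed with individual_results_jsonl set (the record shape mirrors the update() code above; the file name is illustrative):

import json

from stickler import aggregate_from_comparisons

# Each line is {"doc_id": ..., "comparison_result": ...}, as written by update().
with open("results.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f]

# The stored results must have been produced with include_confusion_matrix=True;
# otherwise update_from_comparison_result() raises ValueError.
evaluation = aggregate_from_comparisons([r["comparison_result"] for r in records])
# evaluation is a ProcessEvaluation with overall and per-field
# precision/recall/F1/accuracy, per the docstring above.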

src/stickler/structured_object_evaluator/models/comparison_dispatcher.py

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
 
 from .null_helper import NullHelper
 from .result_helper import ResultHelper
+from .null_helper import NullHelper
 
 if TYPE_CHECKING:
     from .structured_model import StructuredModel
