Merge branch 'fix/agentic-extraction' into develop

rstrahan · rstrahan · commit d8fc24f0f8a1 · 2026-04-04T14:38:50.000-04:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,16 @@ SPDX-License-Identifier: MIT-0
 
 ## [Unreleased]
 
+### Fixed
+
+- **Fixed** agentic extraction crash (`TypeError: unsupported format string passed to NoneType.__format__`) when table parsing stats contain `None` values for `avg_confidence` or `parse_success_rate`.
+
+- **Fixed** agentic extraction `map_table_to_schema` producing phantom empty rows from non-matching tables (e.g. `account_summary` rows prepended to `transaction_details`), which shifted list item ordering by several positions.
+
+### Changed
+
+- **Default extraction model updated** to `us.anthropic.claude-sonnet-4-6` (was `us.anthropic.claude-sonnet-4-20250514-v1:0`) in system defaults.
+
 ## [0.5.5]
 
 ### Added
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.5.5
+0.5.6-wip1
diff --git a/lib/idp_common_pkg/idp_common/config/system_defaults/base-extraction.yaml b/lib/idp_common_pkg/idp_common/config/system_defaults/base-extraction.yaml
@@ -20,7 +20,7 @@ extraction:
     target_width: ""
     target_height: ""
   # Claude Sonnet 4 for best agentic performance (was: Nova Lite)
-  model: us.anthropic.claude-sonnet-4-20250514-v1:0
+  model: us.anthropic.claude-sonnet-4-6
   temperature: "0.0"
   top_p: "0.0"
   max_tokens: "64000"                       # Claude 4 maximum
diff --git a/lib/idp_common_pkg/idp_common/extraction/service.py b/lib/idp_common_pkg/idp_common/extraction/service.py
@@ -1287,8 +1287,8 @@ def _generate_processing_report(self, metadata: dict[str, Any]) -> str:
                     "✓ Table Parsing Tool Results:",
                     f"  - Tables parsed: {stats.get('tables_parsed', 0)}",
                     f"  - Total rows extracted: {stats.get('rows_parsed', 0)}",
-                    f"  - Parse success rate: {stats.get('parse_success_rate', 0):.1%}",
-                    f"  - Avg OCR confidence: {stats.get('avg_confidence', 0):.1f}%",
+                    f"  - Parse success rate: {stats.get('parse_success_rate') or 0:.1%}",
+                    f"  - Avg OCR confidence: {stats.get('avg_confidence') or 0:.1f}%",
                     "",
                 ]
             )
diff --git a/lib/idp_common_pkg/idp_common/extraction/tools/table_parser.py b/lib/idp_common_pkg/idp_common/extraction/tools/table_parser.py
@@ -1026,6 +1026,18 @@ def _detect_and_split_merged_row(
             if len(split_rows) > 1:
                 merged_splits += 1
                 warnings.append(f"Auto-split merged row into {len(split_rows)} entries")
+
+            # Filter out rows where all mapped column values are empty.
+            # This removes phantom rows from non-matching tables (e.g. an
+            # account_summary table whose columns don't match the transaction
+            # column_mapping — every mapped value comes back as "").
+            mapped_col_fields = set(col_map.values())
+            split_rows = [
+                r
+                for r in split_rows
+                if any(str(r.get(f, "")).strip() for f in mapped_col_fields)
+            ]
+
             mapped_rows.extend(split_rows)
 
         # Accumulate mapped rows in agent state for finalize_table_extraction

Original file line number	Diff line number	Diff line change
`@@ -1287,8 +1287,8 @@ def _generate_processing_report(self, metadata: dict[str, Any]) -> str:`
`1287`	`1287`	`"✓ Table Parsing Tool Results:",`
`1288`	`1288`	`f" - Tables parsed: {stats.get('tables_parsed', 0)}",`
`1289`	`1289`	`f" - Total rows extracted: {stats.get('rows_parsed', 0)}",`
`1290`		`- f" - Parse success rate: {stats.get('parse_success_rate', 0):.1%}",`
`1291`		`- f" - Avg OCR confidence: {stats.get('avg_confidence', 0):.1f}%",`
	`1290`	`+ f" - Parse success rate: {stats.get('parse_success_rate') or 0:.1%}",`
	`1291`	`+ f" - Avg OCR confidence: {stats.get('avg_confidence') or 0:.1f}%",`
`1292`	`1292`	`"",`
`1293`	`1293`	`]`
`1294`	`1294`	`)`