Skip to content

Commit d8fc24f

Browse files
committed
Merge branch 'fix/agentic-extraction' into develop
2 parents 8f8ed16 + 3a8252b commit d8fc24f

5 files changed

Lines changed: 26 additions & 4 deletions

File tree

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@ SPDX-License-Identifier: MIT-0
55

66
## [Unreleased]
77

8+
### Fixed
9+
10+
- **Fixed** agentic extraction crash (`TypeError: unsupported format string passed to NoneType.__format__`) when table parsing stats contain `None` values for `avg_confidence` or `parse_success_rate`.
11+
12+
- **Fixed** agentic extraction `map_table_to_schema` producing phantom empty rows from non-matching tables (e.g. account_summary rows prepended to transaction_details), causing list item ordering to be shifted by several positions.
13+
14+
### Changed
15+
16+
- **Default extraction model updated** to `us.anthropic.claude-sonnet-4-6` (was `us.anthropic.claude-sonnet-4-20250514-v1:0`) in system defaults.
17+
818
## [0.5.5]
919

1020
### Added

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.5.5
1+
0.5.6-wip1

lib/idp_common_pkg/idp_common/config/system_defaults/base-extraction.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ extraction:
2020
target_width: ""
2121
target_height: ""
2222
# Claude Sonnet 4.6 for best agentic performance (previously Claude Sonnet 4; originally Nova Lite)
23-
model: us.anthropic.claude-sonnet-4-20250514-v1:0
23+
model: us.anthropic.claude-sonnet-4-6
2424
temperature: "0.0"
2525
top_p: "0.0"
2626
max_tokens: "64000" # Claude 4 maximum

lib/idp_common_pkg/idp_common/extraction/service.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1287,8 +1287,8 @@ def _generate_processing_report(self, metadata: dict[str, Any]) -> str:
12871287
"✓ Table Parsing Tool Results:",
12881288
f" - Tables parsed: {stats.get('tables_parsed', 0)}",
12891289
f" - Total rows extracted: {stats.get('rows_parsed', 0)}",
1290-
f" - Parse success rate: {stats.get('parse_success_rate', 0):.1%}",
1291-
f" - Avg OCR confidence: {stats.get('avg_confidence', 0):.1f}%",
1290+
f" - Parse success rate: {stats.get('parse_success_rate') or 0:.1%}",
1291+
f" - Avg OCR confidence: {stats.get('avg_confidence') or 0:.1f}%",
12921292
"",
12931293
]
12941294
)

lib/idp_common_pkg/idp_common/extraction/tools/table_parser.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1026,6 +1026,18 @@ def _detect_and_split_merged_row(
10261026
if len(split_rows) > 1:
10271027
merged_splits += 1
10281028
warnings.append(f"Auto-split merged row into {len(split_rows)} entries")
1029+
1030+
# Filter out rows where all mapped column values are empty.
1031+
# This removes phantom rows from non-matching tables (e.g. an
1032+
# account_summary table whose columns don't match the transaction
1033+
# column_mapping — every mapped value comes back as "").
1034+
mapped_col_fields = set(col_map.values())
1035+
split_rows = [
1036+
r
1037+
for r in split_rows
1038+
if any(str(r.get(f, "")).strip() for f in mapped_col_fields)
1039+
]
1040+
10291041
mapped_rows.extend(split_rows)
10301042

10311043
# Accumulate mapped rows in agent state for finalize_table_extraction

0 commit comments

Comments
 (0)