Skip to content

Commit 6c20862

Browse files
committed
Merge remote-tracking branch 'origin/develop' into feature/error-analyzer-2
2 parents e0dd022 + e07ea08 commit 6c20862

58 files changed

Lines changed: 7894 additions & 939 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ notebooks/examples/data
2626
.dsr/
2727
*tmp-dev-assets*
2828
scratch/
29+
.mcp.json
2930

3031
# Node.js / npm
3132
node_modules/

.mcp.json

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
{
2+
"mcpServers": {
3+
"aws-serverless": {
4+
"command": "uvx",
5+
"args": [
6+
"awslabs.aws-serverless-mcp-server@latest",
7+
"--allow-write",
8+
"--allow-sensitive-data-access"
9+
],
10+
"env": {
11+
"AWS_REGION": "us-west-2"
12+
}
13+
},
14+
"cloudwatch": {
15+
"command": "uvx",
16+
"args": [
17+
"awslabs.cloudwatch-mcp-server@latest"
18+
],
19+
"env": {
20+
"AWS_REGION": "us-west-2",
21+
"FASTMCP_LOG_LEVEL": "ERROR"
22+
}
23+
},
24+
"aws-diagram": {
25+
"command": "uvx",
26+
"args": [
27+
"awslabs.aws-diagram-mcp-server"
28+
],
29+
"env": {
30+
"FASTMCP_LOG_LEVEL": "ERROR"
31+
}
32+
},
33+
"aws-knowledge": {
34+
"command": "uvx",
35+
"args": [
36+
"fastmcp",
37+
"run",
38+
"https://knowledge-mcp.global.api.aws"
39+
]
40+
},
41+
"aws-iac": {
42+
"command": "uvx",
43+
"args": [
44+
"awslabs.aws-iac-mcp-server@latest"
45+
],
46+
"env": {
47+
"FASTMCP_LOG_LEVEL": "ERROR"
48+
}
49+
},
50+
"browsermcp": {
51+
"command": "npx",
52+
"args": [
53+
"-y",
54+
"@browsermcp/mcp@latest"
55+
]
56+
}
57+
}
58+
}

CHANGELOG.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,21 @@ SPDX-License-Identifier: MIT-0
77

88
### Added
99

10+
- **Multi-Document Discovery** — New capability to automatically discover document classes from a collection of documents. Instead of manually defining document schemas one at a time, users point to a folder of mixed documents and the system automatically identifies document types, clusters similar documents, generates JSON Schemas with field definitions for each type, and saves them to a configuration version — ready for immediate use in the processing pipeline.
11+
- **Two Input Modes**: S3 path (select bucket + prefix) or zip upload (presigned URL upload flow)
12+
- **Dedicated UI Tab**: New "Multi-Document" tab on the Discovery page with job submission form (config version selector, bucket selector, S3 prefix input, zip upload), jobs table with search/filter/sort/pagination, and detailed job results page
13+
- **Job Details Page**: Shows pipeline progress indicators, discovered classes with expandable JSON schemas, "View in Configuration →" deep-links to the Document Schema tab for the correct config version, and a Quality Review Report rendered as formatted markdown (GFM)
14+
- **Configuration Integration**: Discovered classes are saved directly to the selected config version's `classes` array in DynamoDB, immediately available for document processing without manual schema creation
15+
16+
### Fixed
17+
18+
- **"View Config" from Discovery shows wrong config version** — Fixed race condition where clicking "View in Configuration →" from a discovery job showed classes from the 'default' config version instead of the job's version. Root cause: `selectedVersion` was initialized as `null`, causing `useConfiguration('default')` to fire before URL params were read. Fix: initialize `selectedVersion` synchronously from URL hash params at mount time.
19+
20+
- **Publish pipeline missing multi-doc discovery Docker build trigger** — Added `<MULTI_DOC_DISCOVERY_BUILD_HASH_TOKEN>` to template token replacements and `package_multi_doc_discovery_source()` to create/upload the source zip for CodeBuild, ensuring Docker images are rebuilt when handler code changes.
1021
- **Wildcard pattern support for delete-documents** — `idp-cli delete-documents` and `client.batch.delete_documents()` now accept a `--pattern` / `pattern` parameter for fnmatch-style wildcard matching (e.g. `"batch-123/*.pdf"`, `"*invoice*"`). Combines with `--status-filter` to delete e.g. all failed invoices across batches.
1122

1223
- **Chandra OCR Lambda Hook Sample** — New `GENAIIDP-chandra-ocr-hook` sample in `samples/lambda-hook-inference/` that integrates [Datalab Chandra OCR 2](https://github.com/datalab-to/chandra) with the LambdaHook feature for high-quality OCR. Supports 90+ languages, math, tables, forms, and handwriting. Uses the Datalab hosted async API (`/api/v1/convert`) with configurable output format (markdown/json/html) and conversion mode (fast/balanced/accurate). Includes standalone SAM template, local test script, and deployment instructions. See `docs/lambda-hook-inference.md` — Chandra OCR Integration section.
1324

14-
### Fixed
15-
1625
- **`delete-documents` fails with DynamoDB errors** — Fixed two bugs in `get_documents_by_batch()`: (1) passing empty `ExpressionAttributeNames={}` when no status filter caused `ValidationException`, and (2) using low-level DynamoDB client type descriptors (`{"S": "..."}`) with the high-level Table resource caused `begins_with` operand type mismatch. Rewrote to use the high-level `Table.scan()` API with `boto3.dynamodb.conditions.Attr`.
1726

1827
## [0.5.4]

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.5.5.dev2
1+
0.5.5.dev3

lib/idp_common_pkg/idp_common/bedrock/client.py

Lines changed: 170 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -818,22 +818,32 @@ def get_guardrail_config(self) -> Optional[Dict[str, str]]:
818818

819819
def generate_embedding(
820820
self,
821-
text: str,
821+
text: Optional[str] = None,
822822
model_id: str = "amazon.titan-embed-text-v1",
823823
max_retries: Optional[int] = None,
824+
image_bytes: Optional[bytes] = None,
825+
input_type: Optional[str] = "search_document",
824826
) -> List[float]:
825827
"""
826-
Generate an embedding vector for the given text using Amazon Bedrock.
828+
Generate an embedding vector for text and/or image using Amazon Bedrock.
829+
830+
Supports multiple embedding models:
831+
- Amazon Titan Embed Text (text only)
832+
- Amazon Titan Multimodal Embedding (text + image)
833+
- Cohere Embed v3/v4 (text + image, multimodal)
827834
828835
Args:
829-
text: The text to generate embeddings for
836+
text: The text to generate embeddings for (optional if image_bytes provided)
830837
model_id: The embedding model ID to use (default: amazon.titan-embed-text-v1)
831838
max_retries: Optional override for the instance's max_retries setting
839+
image_bytes: Optional image bytes for multimodal embedding models
840+
input_type: Input type for Cohere models (search_document, search_query,
841+
classification, clustering). Defaults to search_document.
832842
833843
Returns:
834844
List of floats representing the embedding vector
835845
"""
836-
if not text or not isinstance(text, str):
846+
if not text and image_bytes is None:
837847
# Return an empty vector for empty input
838848
return []
839849

@@ -845,25 +855,163 @@ def generate_embedding(
845855
# Track total embedding requests
846856
self._put_metric("BedrockEmbeddingRequestsTotal", 1)
847857

848-
# Normalize whitespace and prepare the input text
849-
normalized_text = " ".join(text.split())
858+
# Normalize whitespace if text provided
859+
normalized_text = " ".join(text.split()) if text else None
850860

851861
# Prepare the request body based on the model
852-
if "amazon.titan-embed" in model_id:
853-
request_body = json.dumps({"inputText": normalized_text})
854-
else:
855-
# Default format for other models
856-
request_body = json.dumps({"text": normalized_text})
862+
request_body = self._build_embedding_request_body(
863+
model_id=model_id,
864+
text=normalized_text,
865+
image_bytes=image_bytes,
866+
input_type=input_type,
867+
)
857868

858869
# Call the recursive embedding function
859870
return self._generate_embedding_with_retry(
860871
model_id=model_id,
861872
request_body=request_body,
862-
normalized_text=normalized_text,
873+
normalized_text=normalized_text or "(image-only)",
863874
retry_count=0,
864875
max_retries=effective_max_retries,
865876
)
866877

878+
def generate_embeddings_batch(
879+
self,
880+
items: List[Dict[str, Any]],
881+
model_id: str = "amazon.titan-embed-text-v1",
882+
max_retries: Optional[int] = None,
883+
max_concurrent: int = 5,
884+
input_type: Optional[str] = "search_document",
885+
progress_callback: Optional[Any] = None,
886+
) -> List[Optional[List[float]]]:
887+
"""
888+
Generate embeddings for a batch of items with concurrency control.
889+
890+
Each item in the batch can contain text, image_bytes, or both.
891+
892+
Args:
893+
items: List of dicts with optional 'text' and 'image_bytes' keys
894+
model_id: The embedding model ID to use
895+
max_retries: Optional override for retry count
896+
max_concurrent: Maximum concurrent embedding requests
897+
input_type: Input type for Cohere models
898+
progress_callback: Optional callable(completed, total) for progress updates
899+
900+
Returns:
901+
List of embedding vectors (None for failed items)
902+
"""
903+
import concurrent.futures
904+
905+
total = len(items)
906+
results: List[Optional[List[float]]] = [None] * total
907+
completed = 0
908+
909+
def _embed_single(index: int, item: Dict[str, Any]) -> tuple:
910+
"""Embed a single item and return (index, embedding)."""
911+
try:
912+
embedding = self.generate_embedding(
913+
text=item.get("text"),
914+
model_id=model_id,
915+
max_retries=max_retries,
916+
image_bytes=item.get("image_bytes"),
917+
input_type=input_type,
918+
)
919+
return (index, embedding)
920+
except Exception as e:
921+
logger.warning(f"Failed to generate embedding for item {index}: {e}")
922+
return (index, None)
923+
924+
with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent) as executor:
925+
futures = {
926+
executor.submit(_embed_single, i, item): i
927+
for i, item in enumerate(items)
928+
}
929+
for future in concurrent.futures.as_completed(futures):
930+
idx, embedding = future.result()
931+
results[idx] = embedding
932+
completed += 1
933+
if progress_callback:
934+
try:
935+
progress_callback(completed, total)
936+
except Exception:
937+
pass
938+
939+
return results
940+
941+
def _build_embedding_request_body(
942+
self,
943+
model_id: str,
944+
text: Optional[str] = None,
945+
image_bytes: Optional[bytes] = None,
946+
input_type: Optional[str] = "search_document",
947+
) -> str:
948+
"""
949+
Build the JSON request body for an embedding model.
950+
951+
Supports:
952+
- Amazon Titan Embed Text v1/v2: text-only via inputText
953+
- Amazon Titan Multimodal Embedding: text + image via inputText/inputImage
954+
- Cohere Embed v3/v4: text + image via texts/images arrays
955+
956+
Args:
957+
model_id: The embedding model ID
958+
text: Optional text input
959+
image_bytes: Optional image bytes
960+
input_type: Input type for Cohere models
961+
962+
Returns:
963+
JSON string for the request body
964+
"""
965+
import base64
966+
967+
model_lower = model_id.lower()
968+
969+
if "cohere" in model_lower:
970+
# Cohere Embed v3/v4 format
971+
body: Dict[str, Any] = {
972+
"input_type": input_type or "search_document",
973+
}
974+
# Detect v4 models (embed-v4) vs v3 (embed-english-v3, embed-multilingual-v3)
975+
is_v4 = "embed-v4" in model_lower
976+
if is_v4:
977+
# Cohere v4 requires explicit embedding type and supports output_dimension
978+
body["embedding_types"] = ["float"]
979+
body["output_dimension"] = 1024
980+
if text:
981+
body["texts"] = [text]
982+
if image_bytes is not None:
983+
img_b64 = base64.b64encode(image_bytes).decode("utf-8")
984+
if is_v4:
985+
# Cohere v4 requires data URI format for images
986+
body["images"] = [f"data:image/png;base64,{img_b64}"]
987+
else:
988+
# Cohere v3 uses raw base64
989+
body["images"] = [img_b64]
990+
return json.dumps(body)
991+
992+
elif "titan-embed-image" in model_lower or (
993+
"titan-embed" in model_lower and image_bytes is not None
994+
):
995+
# Amazon Titan Multimodal Embedding G1 format
996+
body = {}
997+
if text:
998+
body["inputText"] = text
999+
if image_bytes is not None:
1000+
img_b64 = base64.b64encode(image_bytes).decode("utf-8")
1001+
body["inputImage"] = img_b64
1002+
return json.dumps(body)
1003+
1004+
elif "titan-embed" in model_lower:
1005+
# Amazon Titan Embed Text v1/v2 (text-only)
1006+
return json.dumps({"inputText": text or ""})
1007+
1008+
else:
1009+
# Default format
1010+
body = {}
1011+
if text:
1012+
body["text"] = text
1013+
return json.dumps(body)
1014+
8671015
def _generate_embedding_with_retry(
8681016
self,
8691017
model_id: str,
@@ -912,8 +1060,17 @@ def _generate_embedding_with_retry(
9121060
# Handle different response formats based on the model
9131061
if "amazon.titan-embed" in model_id:
9141062
embedding = response_body.get("embedding", [])
1063+
elif "cohere" in model_id.lower() and "embed-v4" in model_id.lower():
1064+
# Cohere Embed v4 returns {"embeddings": {"float": [[...]]}}
1065+
embeddings_obj = response_body.get("embeddings", {})
1066+
if isinstance(embeddings_obj, dict):
1067+
float_embeddings = embeddings_obj.get("float", [])
1068+
embedding = float_embeddings[0] if float_embeddings else []
1069+
else:
1070+
# Fallback for unexpected format
1071+
embedding = embeddings_obj[0] if embeddings_obj else []
9151072
else:
916-
# Default extraction format
1073+
# Default extraction format (Cohere v3 and others)
9171074
embedding = response_body.get("embedding", [])
9181075

9191076
# Track successful requests and latency

0 commit comments

Comments
 (0)