Commit f76e939

Merge branch 'fix/discovery-robustness' into 'develop'

fix: improve discovery robustness (markdown stripping, retry, config-version handling)

See merge request genaiic-reusable-assets/engagement-artifacts/genaiic-idp-accelerator!572

2 parents 7b35ec6 + f787841

4 files changed: 146 additions & 56 deletions


lib/idp_cli_pkg/idp_cli/cli.py

Lines changed: 2 additions & 2 deletions
@@ -4545,10 +4545,10 @@ def discover(
             )
         console.print("[green]✓ Batch discovery complete[/green]")

-        if stack_name and succeeded > 0:
+        if stack_name and config_version and succeeded > 0:
             console.print(
                 f"[green]✓ Schema(s) saved to configuration"
-                f"{' (version: ' + config_version + ')' if config_version else ''}[/green]"
+                f" (version: {config_version})[/green]"
             )

     if failed > 0:
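The guard change above can be sketched in isolation. This is a minimal, hypothetical helper (`discovery_summary` is not part of the codebase) showing why the extra `config_version` check matters: the "saved to configuration" line should only appear when a version was actually written, which per this merge request only happens when a version is explicitly specified.

```python
def discovery_summary(stack_name, config_version, succeeded, failed):
    """Hypothetical sketch of the fixed CLI summary logic: the version line
    is printed only when a config version was supplied."""
    lines = ["✓ Batch discovery complete"]
    # Schemas are only saved when an explicit version is given, so the
    # message is gated on config_version as well as success count.
    if stack_name and config_version and succeeded > 0:
        lines.append(f"✓ Schema(s) saved to configuration (version: {config_version})")
    if failed > 0:
        lines.append(f"✗ {failed} document(s) failed")
    return lines
```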

lib/idp_common_pkg/idp_common/config/system_defaults/base-discovery.yaml

Lines changed: 17 additions & 19 deletions
@@ -58,22 +58,21 @@ discovery:
       extraction and ensure consistency with expected document structure and
       field definitions.
     max_tokens: "10000"
-    top_p: "0.0"
-    temperature: "0.0"
+    top_p: "0.1"
+    temperature: "1.0"
     user_prompt: >-
-      This image contains unstructured data. Analyze the data line by line using the provided ground truth as reference.
+      This image contains unstructured data. Analyze the data line by line using the provided ground truth as reference.
       <GROUND_TRUTH_REFERENCE>
       {ground_truth_json}
       </GROUND_TRUTH_REFERENCE>
       Ground truth reference JSON has the fields we are interested in extracting from the document/image. Use the ground truth to optimize field extraction. Match field names, data types, and groupings from the reference.
       Image may contain multiple pages, process all pages.
       Extract all field names including those without values.
       Do not change the group name and field name from ground truth in the extracted data json.
-      Add field_description field for every field which will contain instruction to LLM to extract the field data from the image/document. Add data_type field for every field.
-      Add two fields document_class and document_description.
-      For document_class generate a short name based on the document content like W4, I-9, Paystub.
-      For document_description generate a description about the document in less than 50 words.
-      If the group repeats and follows table format, update the attributeType as "list".
+      Add field_description field for every field which will contain instruction to LLM to extract the field data from the image/document. Add data_type field for every field.
+      Make sure to fill out the top-level "$id" and "x-aws-idp-document-type" with the extracted document class, and the top-level "description" with a brief description of the document class.
+      Nesting Groups: Do not nest the groups i.e. groups within groups. All groups should be directly associated under main "properties".
+      If the group repeats and follows table format, update the attributeType as "list".
       Do not extract the values.
       Format the extracted data using the below JSON format:
       Format the extracted groups and fields using the below JSON format:
@@ -85,21 +84,20 @@ discovery:
       and organizational structure. Focus on creating comprehensive blueprints
       for document processing without extracting actual values.
     max_tokens: "10000"
-    top_p: "0.0"
-    temperature: "0.0"
+    top_p: "0.1"
+    temperature: "1.0"
     user_prompt: >-
       This image contains forms data. Analyze the form line by line.
-      Image may contains multiple pages, process all the pages.
-      Form may contain multiple name value pair in one line.
-      Extract all the names in the form including the name value pair which doesn't have value.
-      Organize them into groups, extract field_name, data_type and field description
-      Field_name should be less than 60 characters, should not have space use '-' instead of space.
-      field_description is a brief description of the field and the location of the field like box number or line number in the form and section of the form.
+      Image may contain multiple pages, process all the pages.
+      Form may contain multiple name value pair in one line.
+      Extract all the names in the form including the name value pair which doesn't have value.
+      Organize them into groups, extract field_name, data_type and field description.
+      Field names should be less than 30 characters, use camelCase or snake_case, name should not start with number and name should not have special characters.
+      Field descriptions should include location hints (box number, line number, section).
       Field_name should be unique within the group.
-      Add two fields document_class and document_description.
-      For document_class generate a short name based on the document content like W4, I-9, Paystub.
-      For document_description generate a description about the document in less than 50 words.
+      Make sure to fill out the top-level "$id" and "x-aws-idp-document-type" with the extracted document class, and the top-level "description" with a brief description of the document class.
       Group the fields based on the section they are grouped in the form. Group should have attributeType as "group".
+      Nesting Groups: Do not nest the groups i.e. groups within groups. All groups should be directly associated under main "properties".
       If the group repeats and follows table format, update the attributeType as "list".
       Do not extract the values.
       Return the extracted data in JSON format.
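Note that the YAML stores the sampling parameters as strings ("10000", "1.0", "0.1"), so consuming code has to cast them before invoking the model. A minimal sketch, assuming a hypothetical `parse_inference_params` helper and the key names shown above:

```python
def parse_inference_params(mode_cfg: dict) -> dict:
    """Hypothetical helper: the discovery YAML stores numeric inference
    parameters as strings, so cast them before passing to the model client.
    Defaults here are illustrative, not taken from the codebase."""
    return {
        "max_tokens": int(mode_cfg.get("max_tokens", "4096")),
        "temperature": float(mode_cfg.get("temperature", "1.0")),
        "top_p": float(mode_cfg.get("top_p", "0.1")),
    }
```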

lib/idp_common_pkg/idp_common/discovery/classes_discovery.py

Lines changed: 11 additions & 2 deletions
@@ -3,6 +3,7 @@
 import json
 import logging
 import os
+import re
 from typing import Any, Dict, Optional, cast

 import jsonschema
@@ -246,6 +247,14 @@ def _merge_and_save_class(self, new_class: Dict[str, Any]) -> None:
             "Config", existing_custom, version=self.version
         )

+    @staticmethod
+    def _extract_json(text: str) -> str:
+        """Strip markdown code fences from LLM response before JSON parsing."""
+        match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", text, re.DOTALL)
+        if match:
+            return match.group(1)
+        return text
+
     def _validate_json_schema(self, schema: Dict[str, Any]) -> tuple[bool, str]:
         """
         Validate that the response is a valid JSON Schema.
@@ -367,7 +376,7 @@ def _extract_data_from_document(
         )

         # Parse JSON response
-        schema = json.loads(content_text)
+        schema = json.loads(self._extract_json(content_text))

         # Validate the schema
         is_valid, error_msg = self._validate_json_schema(schema)
@@ -493,7 +502,7 @@ def _extract_data_from_document_with_ground_truth(
         )

         # Parse JSON response
-        schema = json.loads(content_text)
+        schema = json.loads(self._extract_json(content_text))

         # Validate the schema
         is_valid, error_msg = self._validate_json_schema(schema)

lib/idp_sdk/idp_sdk/operations/discovery.py

Lines changed: 116 additions & 33 deletions
@@ -6,8 +6,9 @@
 import json
 import logging
 import os
+import re
 from pathlib import Path
-from typing import List, Optional
+from typing import Any, Dict, List, Optional

 from idp_sdk.exceptions import IDPConfigurationError, IDPResourceNotFoundError
 from idp_sdk.models.discovery import DiscoveryBatchResult, DiscoveryResult
@@ -116,29 +117,47 @@ def _run_with_stack(

         from idp_common.discovery.classes_discovery import ClassesDiscovery

-        # ClassesDiscovery reads config from DynamoDB but we pass file_bytes
-        # so it never reads from S3. input_prefix is used only for extension.
-        discovery = ClassesDiscovery(
-            input_bucket="local",
-            input_prefix=doc_path.name,
-            region=self._client._region,
-            version=config_version,
-        )
+        # Try loading the requested config version; fall back to active
+        # config if the version doesn't exist yet (user wants to create it).
+        try:
+            discovery = ClassesDiscovery(
+                input_bucket="local",
+                input_prefix=doc_path.name,
+                region=self._client._region,
+                version=config_version,
+            )
+        except Exception:
+            if config_version is None:
+                raise
+            logger.warning(
+                f"Config version '{config_version}' not found, "
+                f"reading from active config and saving to '{config_version}'"
+            )
+            discovery = ClassesDiscovery(
+                input_bucket="local",
+                input_prefix=doc_path.name,
+                region=self._client._region,
+                version=None,
+            )
+            discovery.version = config_version
+
+        # Only save to config if a version was explicitly specified
+        save = config_version is not None

         if gt_data:
             result = discovery.discovery_classes_with_document_and_ground_truth(
                 input_bucket="local",
                 input_prefix=doc_path.name,
                 file_bytes=file_bytes,
                 ground_truth_data=gt_data,
-                save_to_config=True,
+                save_to_config=save,
             )
         else:
             result = discovery.discovery_classes_with_document(
                 input_bucket="local",
                 input_prefix=doc_path.name,
                 file_bytes=file_bytes,
-                save_to_config=True,
+                save_to_config=save,
             )

         schema = result.get("schema")
@@ -167,6 +186,7 @@ def _run_local(
         doc_path: Path,
         file_bytes: bytes,
         gt_data: Optional[dict],
+        max_retries: int = 3,
     ) -> DiscoveryResult:
         """Local mode: uses system defaults, no stack needed, no config save."""
         try:
@@ -202,8 +222,6 @@ def _run_local(
             else:
                 user_prompt = mode_cfg.get("user_prompt") or _prompt_without_gt()

-            full_prompt = f"{user_prompt}\nFormat the extracted data using the below JSON format:\n{sample_format}"
-
             # Create content with file bytes
             file_extension = doc_path.suffix.lower().lstrip(".")
             if file_extension == "pdf":
@@ -215,35 +233,74 @@ def _run_local(
                             "source": {"bytes": file_bytes},
                         }
                     },
-                    {"text": full_prompt},
                 ]
             else:
-                image_content = image.prepare_bedrock_image_attachment(file_bytes)
-                content = [image_content, {"text": full_prompt}]
+                content = [image.prepare_bedrock_image_attachment(file_bytes)]

-            # Call Bedrock
+            # Call Bedrock with retry/validation loop
             region = self._client._region or os.environ.get("AWS_REGION", "us-west-2")
-            client = bedrock.BedrockClient(region=region)
-            response = client.invoke_model(
-                model_id=model_id,
-                system_prompt=system_prompt,
-                content=content,
-                temperature=temperature,
-                top_p=top_p,
-                max_tokens=max_tokens,
-                context="ClassesDiscoveryLocal",
-            )
+            bedrock_client = bedrock.BedrockClient(region=region)
+
+            validation_feedback = ""
+            for attempt in range(max_retries):
+                try:
+                    retry_prompt = ""
+                    if attempt > 0 and validation_feedback:
+                        retry_prompt = (
+                            f"\n\nPREVIOUS ATTEMPT FAILED: {validation_feedback}\n"
+                            f"Please fix the issue and generate a valid JSON Schema.\n\n"
+                        )
+
+                    full_prompt = (
+                        f"{retry_prompt}{user_prompt}\n"
+                        f"Format the extracted data using the below JSON format:\n{sample_format}"
+                    )

-            content_text = bedrock.extract_text_from_response(response)
-            schema = json.loads(content_text)
+                    response = bedrock_client.invoke_model(
+                        model_id=model_id,
+                        system_prompt=system_prompt,
+                        content=content + [{"text": full_prompt}],
+                        temperature=temperature,
+                        top_p=top_p,
+                        max_tokens=max_tokens,
+                        context="ClassesDiscoveryLocal",
+                    )

-            doc_class = schema.get("$id") or schema.get("x-aws-idp-document-type")
+                    content_text = bedrock.extract_text_from_response(response)
+                    schema = json.loads(_extract_json(content_text))
+
+                    is_valid, error_msg = _validate_json_schema(schema)
+                    if is_valid:
+                        logger.info(
+                            f"Successfully generated valid JSON Schema on attempt {attempt + 1}"
+                        )
+                        doc_class = schema.get("$id") or schema.get(
+                            "x-aws-idp-document-type"
+                        )
+                        return DiscoveryResult(
+                            status="SUCCESS",
+                            document_class=doc_class,
+                            json_schema=schema,
+                            document_path=str(doc_path),
+                        )
+                    else:
+                        validation_feedback = error_msg
+                        logger.warning(
+                            f"Invalid schema on attempt {attempt + 1}: {error_msg}"
+                        )
+
+                except json.JSONDecodeError as e:
+                    validation_feedback = f"Invalid JSON format: {str(e)}"
+                    logger.warning(f"JSON parse error on attempt {attempt + 1}: {e}")
+                except Exception as e:
+                    logger.error(f"Error on attempt {attempt + 1}: {e}")
+                    if attempt == max_retries - 1:
+                        raise

             return DiscoveryResult(
-                status="SUCCESS",
-                document_class=doc_class,
-                json_schema=schema,
+                status="FAILED",
                 document_path=str(doc_path),
+                error=f"Failed to generate valid schema after {max_retries} attempts",
             )

         except Exception as e:
@@ -319,6 +376,32 @@ def _get_config_table(self, stack_name: str) -> str:
         raise IDPResourceNotFoundError("ConfigurationTable not found in stack.")


+# --- Helpers for local mode ---
+
+
+def _extract_json(text: str) -> str:
+    """Strip markdown code fences from LLM response before JSON parsing."""
+    match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", text, re.DOTALL)
+    if match:
+        return match.group(1)
+    return text
+
+
+def _validate_json_schema(schema: Dict[str, Any]) -> tuple:
+    """Validate that the response is a valid JSON Schema."""
+    required_fields = ["$schema", "$id", "type", "properties"]
+    for field in required_fields:
+        if field not in schema:
+            return False, f"Missing required field: {field}"
+    if "x-aws-idp-document-type" not in schema:
+        return False, "Missing x-aws-idp-document-type field"
+    if schema.get("type") != "object":
+        return False, "Root type must be 'object'"
+    if not isinstance(schema.get("properties"), dict):
+        return False, "Properties must be an object"
+    return True, ""
+
+
 # --- Standalone prompt helpers for local mode ---