Add reserved metadata key validation; sort by page only for expanded classification results

chienyuanchang · chienyuanchang · commit 29804b340df8 · 2026-04-28T11:07:05.000-07:00
diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_helpers.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_helpers.py
@@ -28,6 +28,18 @@
     from azure.core.exceptions import ODataV4Format
 
 
+_RESERVED_METADATA_KEYS = frozenset(  # front matter keys rejected when passed as user metadata
+    {
+        "contentType",
+        "timeRange",
+        "category",
+        "pages",
+        "fields",
+        "rai_warnings",
+    }
+)
+
+
 # ---------------------------------------------------------------------------
 # Public API
 # ---------------------------------------------------------------------------
@@ -40,7 +52,7 @@ def to_llm_input(
     include_markdown: bool = True,
     metadata: Optional[Dict[str, Any]] = None,
 ) -> str:
-    """Convert a CU analysis result into LLM-friendly text.
+    """Convert a Content Understanding analysis result into LLM-friendly text.
 
     Produces a formatted text string from the analysis result,
     suitable for injecting into an LLM prompt, storing in a vector
@@ -49,11 +61,11 @@ def to_llm_input(
     For single-content results (documents, images), the output is a
     flat text block. For multi-segment results (video, audio), each
     segment is rendered with its time range. For document
-    classification results (parent + categorized children), the
-    helper automatically skips the parent and renders each child with
-    its category label.
+    classification results (parent with nested segments), the
+    helper automatically expands the parent into per-segment blocks
+    with category labels and markdown slices.
 
-    :param result: The ``AnalysisResult`` from a CU analyze operation.
+    :param result: The ``AnalysisResult`` from a Content Understanding analyze operation.
     :type result: ~azure.ai.contentunderstanding.models.AnalysisResult
     :keyword include_fields: Whether to include structured fields in the
         output. Defaults to True. Set to False for markdown-only
@@ -68,12 +80,14 @@ def to_llm_input(
         ``"source"`` (filename), ``"department"``,
         ``"batch_id"``, etc. Metadata keys are placed after
         ``contentType`` and before auto-detected keys
-        (``timeRange``, ``category``, ``pages``).
+        (``timeRange``, ``category``, ``pages``). Metadata keys must not reuse a reserved key:
+        ``contentType``, ``timeRange``, ``category``, ``pages``, ``fields``, or ``rai_warnings``.
     :paramtype metadata: dict[str, Any] or None
     :returns: A formatted text string with YAML front matter followed
         by markdown content.
     :rtype: str
     :raises TypeError: If *result* is not an ``AnalysisResult``.
+    :raises ValueError: If *metadata* contains a reserved front matter key.
 
     Example::
 
@@ -96,6 +110,8 @@ def to_llm_input(
     if not isinstance(result, _AnalysisResult):
         raise TypeError(f"Expected AnalysisResult, got {type(result).__name__}")
 
+    _validate_metadata(metadata)
+
     if not result.contents:
         return ""
 
@@ -123,6 +139,25 @@ def to_llm_input(
     return "\n\n*****\n\n".join(blocks)
 
 
+def _validate_metadata(metadata: Optional[Dict[str, Any]]) -> None:
+    """Reject caller metadata that reuses a helper-generated front matter key.
+
+    :param metadata: Optional user-supplied metadata.
+    :type metadata: dict[str, Any] or None
+    :raises ValueError: If any key is in ``_RESERVED_METADATA_KEYS``.
+    """
+    # ``None`` and an empty mapping have no keys to check.
+    conflicts = sorted(_RESERVED_METADATA_KEYS.intersection(metadata or ()))
+    if not conflicts:
+        return
+
+    # Sorted so the message is deterministic regardless of dict/set ordering.
+    raise ValueError(
+        f"metadata contains reserved front matter key(s): {', '.join(conflicts)}. "
+        "Use custom keys such as 'source', 'documentId', or 'department' instead."
+    )
+
+
 # ---------------------------------------------------------------------------
 # Field resolution (internal)
 # ---------------------------------------------------------------------------
@@ -217,8 +252,10 @@ def _get_renderable_contents(
             routed_paths.add(c.path)
 
     result: List["AnalysisContent"] = []
+    expanded_classification = False
     for c in contents:
         if isinstance(c, DocumentContent) and c.segments and not c.category:
+            expanded_classification = True
             parent_path = c.path or ""
             # Expand parent into per-segment synthetic DocumentContent items,
             # but skip segments that have a routed top-level content.
@@ -242,15 +279,16 @@ def _get_renderable_contents(
         else:
             result.append(c)
 
-    # Sort by page number so output follows document order.
-    # This matters when routed segments (with fields) appear as
-    # separate top-level contents after expanded parent segments.
-    def _sort_key(c: "AnalysisContent") -> int:
-        if isinstance(c, DocumentContent) and c.start_page_number is not None:
-            return c.start_page_number
-        return 0
+    if expanded_classification:
+        # Sort classification blocks by page number so routed segments (with fields)
+        # appear in document order. Non-classification results preserve service order.
+        def _sort_key(c: "AnalysisContent") -> int:
+            if isinstance(c, DocumentContent) and c.start_page_number is not None:
+                return c.start_page_number
+            return 0
+
+        result.sort(key=_sort_key)
 
-    result.sort(key=_sort_key)
     return result
 
 
diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/test_to_llm_input.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/test_to_llm_input.py
@@ -677,6 +677,34 @@ def test_metadata_position_after_content_type(self):
         fields_idx = next(i for i, l in enumerate(lines) if l.strip() == "fields:")
         assert ct_idx < src_idx < fields_idx
 
+    @pytest.mark.parametrize(
+        "reserved_key",
+        ["contentType", "timeRange", "category", "pages", "fields", "rai_warnings"],
+    )
+    def test_reserved_metadata_key_raises(self, reserved_key):
+        """Each helper-generated front matter key is rejected as user metadata."""
+        result = _make_result([_make_invoice_doc()])
+        with pytest.raises(ValueError, match="reserved front matter key"):
+            to_llm_input(result, metadata={reserved_key: "custom"})
+
+    def test_non_classification_documents_preserve_input_order(self):
+        """Plain documents are rendered in the order given, not sorted by page."""
+        # Deliberately list the page-2 document first; a page sort would flip them.
+        second_page_doc = DocumentContent(
+            kind="document",
+            markdown="Second input document.",
+            start_page_number=2,
+            end_page_number=2,
+        )
+        first_page_doc = DocumentContent(
+            kind="document",
+            markdown="First input document.",
+            start_page_number=1,
+            end_page_number=1,
+        )
+        rendered = to_llm_input(_make_result([second_page_doc, first_page_doc]))
+        assert rendered.index("Second input document.") < rendered.index("First input document.")
+
 
 # ===========================================================================
 # 9. YAML front matter structure