2828 from azure .core .exceptions import ODataV4Format
2929
3030
31+ _RESERVED_METADATA_KEYS = frozenset (
32+ {
33+ "contentType" ,
34+ "timeRange" ,
35+ "category" ,
36+ "pages" ,
37+ "fields" ,
38+ "rai_warnings" ,
39+ }
40+ )
41+
42+
3143# ---------------------------------------------------------------------------
3244# Public API
3345# ---------------------------------------------------------------------------
@@ -40,7 +52,7 @@ def to_llm_input(
4052 include_markdown : bool = True ,
4153 metadata : Optional [Dict [str , Any ]] = None ,
4254) -> str :
43- """Convert a CU analysis result into LLM-friendly text.
55+ """Convert a Content Understanding analysis result into LLM-friendly text.
4456
4557 Produces a formatted text string from the analysis result,
4658 suitable for injecting into an LLM prompt, storing in a vector
@@ -49,11 +61,11 @@ def to_llm_input(
4961 For single-content results (documents, images), the output is a
5062 flat text block. For multi-segment results (video, audio), each
5163 segment is rendered with its time range. For document
52- classification results (parent + categorized children ), the
53- helper automatically skips the parent and renders each child with
54- its category label .
64+ classification results (parent with nested segments ), the
65+ helper automatically expands the parent into per-segment blocks
66+ with category labels and markdown slices .
5567
56- :param result: The ``AnalysisResult`` from a CU analyze operation.
68+ :param result: The ``AnalysisResult`` from a Content Understanding analyze operation.
5769 :type result: ~azure.ai.contentunderstanding.models.AnalysisResult
5870 :keyword include_fields: Whether to include structured fields in the
5971 output. Defaults to True. Set to False for markdown-only
@@ -68,12 +80,14 @@ def to_llm_input(
6880 ``"source"`` (filename), ``"department"``,
6981 ``"batch_id"``, etc. Metadata keys are placed after
7082 ``contentType`` and before auto-detected keys
71- (``timeRange``, ``category``, ``pages``).
83+ (``timeRange``, ``category``, ``pages``). Metadata keys must not
84+ conflict with helper-generated front matter keys.
7285 :paramtype metadata: dict[str, Any] or None
7386 :returns: A formatted text string with YAML front matter followed
7487 by markdown content.
7588 :rtype: str
7689 :raises TypeError: If *result* is not an ``AnalysisResult``.
90+ :raises ValueError: If *metadata* contains a reserved front matter key.
7791
7892 Example::
7993
@@ -96,6 +110,8 @@ def to_llm_input(
96110 if not isinstance (result , _AnalysisResult ):
97111 raise TypeError (f"Expected AnalysisResult, got { type (result ).__name__ } " )
98112
113+ _validate_metadata (metadata )
114+
99115 if not result .contents :
100116 return ""
101117
@@ -123,6 +139,25 @@ def to_llm_input(
123139 return "\n \n *****\n \n " .join (blocks )
124140
125141
142+ def _validate_metadata (metadata : Optional [Dict [str , Any ]]) -> None :
143+ """Validate user-supplied front matter metadata.
144+
145+ :param metadata: Optional user-supplied metadata.
146+ :type metadata: dict[str, Any] or None
147+ :raises ValueError: If metadata contains helper-generated front matter keys.
148+ """
149+ if not metadata :
150+ return
151+
152+ reserved = sorted (set (metadata ).intersection (_RESERVED_METADATA_KEYS ))
153+ if reserved :
154+ keys = ", " .join (reserved )
155+ raise ValueError (
156+ f"metadata contains reserved front matter key(s): { keys } . "
157+ "Use custom keys such as 'source', 'documentId', or 'department' instead."
158+ )
159+
160+
126161# ---------------------------------------------------------------------------
127162# Field resolution (internal)
128163# ---------------------------------------------------------------------------
@@ -217,8 +252,10 @@ def _get_renderable_contents(
217252 routed_paths .add (c .path )
218253
219254 result : List ["AnalysisContent" ] = []
255+ expanded_classification = False
220256 for c in contents :
221257 if isinstance (c , DocumentContent ) and c .segments and not c .category :
258+ expanded_classification = True
222259 parent_path = c .path or ""
223260 # Expand parent into per-segment synthetic DocumentContent items,
224261 # but skip segments that have a routed top-level content.
@@ -242,15 +279,16 @@ def _get_renderable_contents(
242279 else :
243280 result .append (c )
244281
245- # Sort by page number so output follows document order.
246- # This matters when routed segments (with fields) appear as
247- # separate top-level contents after expanded parent segments.
248- def _sort_key (c : "AnalysisContent" ) -> int :
249- if isinstance (c , DocumentContent ) and c .start_page_number is not None :
250- return c .start_page_number
251- return 0
282+ if expanded_classification :
283+ # Sort classification blocks by page number so routed segments (with fields)
284+ # appear in document order. Non-classification results preserve service order.
285+ def _sort_key (c : "AnalysisContent" ) -> int :
286+ if isinstance (c , DocumentContent ) and c .start_page_number is not None :
287+ return c .start_page_number
288+ return 0
289+
290+ result .sort (key = _sort_key )
252291
253- result .sort (key = _sort_key )
254292 return result
255293
256294
0 commit comments