Skip to content

Commit 29804b3

Browse files
add reserved metadata validation
1 parent 85f22f4 commit 29804b3

2 files changed

Lines changed: 80 additions & 14 deletions

File tree

sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_helpers.py

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,18 @@
2828
from azure.core.exceptions import ODataV4Format
2929

3030

31+
_RESERVED_METADATA_KEYS = frozenset(
32+
{
33+
"contentType",
34+
"timeRange",
35+
"category",
36+
"pages",
37+
"fields",
38+
"rai_warnings",
39+
}
40+
)
41+
42+
3143
# ---------------------------------------------------------------------------
3244
# Public API
3345
# ---------------------------------------------------------------------------
@@ -40,7 +52,7 @@ def to_llm_input(
4052
include_markdown: bool = True,
4153
metadata: Optional[Dict[str, Any]] = None,
4254
) -> str:
43-
"""Convert a CU analysis result into LLM-friendly text.
55+
"""Convert a Content Understanding analysis result into LLM-friendly text.
4456
4557
Produces a formatted text string from the analysis result,
4658
suitable for injecting into an LLM prompt, storing in a vector
@@ -49,11 +61,11 @@ def to_llm_input(
4961
For single-content results (documents, images), the output is a
5062
flat text block. For multi-segment results (video, audio), each
5163
segment is rendered with its time range. For document
52-
classification results (parent + categorized children), the
53-
helper automatically skips the parent and renders each child with
54-
its category label.
64+
classification results (parent with nested segments), the
65+
helper automatically expands the parent into per-segment blocks
66+
with category labels and markdown slices.
5567
56-
:param result: The ``AnalysisResult`` from a CU analyze operation.
68+
:param result: The ``AnalysisResult`` from a Content Understanding analyze operation.
5769
:type result: ~azure.ai.contentunderstanding.models.AnalysisResult
5870
:keyword include_fields: Whether to include structured fields in the
5971
output. Defaults to True. Set to False for markdown-only
@@ -68,12 +80,14 @@ def to_llm_input(
6880
``"source"`` (filename), ``"department"``,
6981
``"batch_id"``, etc. Metadata keys are placed after
7082
``contentType`` and before auto-detected keys
71-
(``timeRange``, ``category``, ``pages``).
83+
(``timeRange``, ``category``, ``pages``). Metadata keys must not
84+
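conflict with the reserved helper-generated front matter keys: ``contentType``, ``timeRange``, ``category``, ``pages``, ``fields``, and ``rai_warnings``.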
7285
:paramtype metadata: dict[str, Any] or None
7386
:returns: A formatted text string with YAML front matter followed
7487
by markdown content.
7588
:rtype: str
7689
:raises TypeError: If *result* is not an ``AnalysisResult``.
90+
:raises ValueError: If *metadata* contains a reserved front matter key.
7791
7892
Example::
7993
@@ -96,6 +110,8 @@ def to_llm_input(
96110
if not isinstance(result, _AnalysisResult):
97111
raise TypeError(f"Expected AnalysisResult, got {type(result).__name__}")
98112

113+
_validate_metadata(metadata)
114+
99115
if not result.contents:
100116
return ""
101117

@@ -123,6 +139,25 @@ def to_llm_input(
123139
return "\n\n*****\n\n".join(blocks)
124140

125141

142+
def _validate_metadata(metadata: Optional[Dict[str, Any]]) -> None:
143+
"""Validate user-supplied front matter metadata.
144+
145+
:param metadata: Optional user-supplied metadata.
146+
:type metadata: dict[str, Any] or None
147+
:raises ValueError: If metadata contains helper-generated front matter keys.
148+
"""
149+
if not metadata:
150+
return
151+
152+
reserved = sorted(set(metadata).intersection(_RESERVED_METADATA_KEYS))
153+
if reserved:
154+
keys = ", ".join(reserved)
155+
raise ValueError(
156+
f"metadata contains reserved front matter key(s): {keys}. "
157+
"Use custom keys such as 'source', 'documentId', or 'department' instead."
158+
)
159+
160+
126161
# ---------------------------------------------------------------------------
127162
# Field resolution (internal)
128163
# ---------------------------------------------------------------------------
@@ -217,8 +252,10 @@ def _get_renderable_contents(
217252
routed_paths.add(c.path)
218253

219254
result: List["AnalysisContent"] = []
255+
expanded_classification = False
220256
for c in contents:
221257
if isinstance(c, DocumentContent) and c.segments and not c.category:
258+
expanded_classification = True
222259
parent_path = c.path or ""
223260
# Expand parent into per-segment synthetic DocumentContent items,
224261
# but skip segments that have a routed top-level content.
@@ -242,15 +279,16 @@ def _get_renderable_contents(
242279
else:
243280
result.append(c)
244281

245-
# Sort by page number so output follows document order.
246-
# This matters when routed segments (with fields) appear as
247-
# separate top-level contents after expanded parent segments.
248-
def _sort_key(c: "AnalysisContent") -> int:
249-
if isinstance(c, DocumentContent) and c.start_page_number is not None:
250-
return c.start_page_number
251-
return 0
282+
if expanded_classification:
283+
# Sort classification blocks by page number so routed segments (with fields)
284+
# appear in document order. Non-classification results preserve service order.
285+
def _sort_key(c: "AnalysisContent") -> int:
286+
if isinstance(c, DocumentContent) and c.start_page_number is not None:
287+
return c.start_page_number
288+
return 0
289+
290+
result.sort(key=_sort_key)
252291

253-
result.sort(key=_sort_key)
254292
return result
255293

256294

sdk/contentunderstanding/azure-ai-contentunderstanding/tests/test_to_llm_input.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,34 @@ def test_metadata_position_after_content_type(self):
677677
fields_idx = next(i for i, l in enumerate(lines) if l.strip() == "fields:")
678678
assert ct_idx < src_idx < fields_idx
679679

680+
@pytest.mark.parametrize(
681+
"reserved_key",
682+
["contentType", "timeRange", "category", "pages", "fields", "rai_warnings"],
683+
)
684+
def test_reserved_metadata_key_raises(self, reserved_key):
685+
doc = _make_invoice_doc()
686+
687+
with pytest.raises(ValueError, match="reserved front matter key"):
688+
to_llm_input(_make_result([doc]), metadata={reserved_key: "custom"})
689+
690+
def test_non_classification_documents_preserve_input_order(self):
691+
doc_page_2 = DocumentContent(
692+
kind="document",
693+
markdown="Second input document.",
694+
start_page_number=2,
695+
end_page_number=2,
696+
)
697+
doc_page_1 = DocumentContent(
698+
kind="document",
699+
markdown="First input document.",
700+
start_page_number=1,
701+
end_page_number=1,
702+
)
703+
704+
output = to_llm_input(_make_result([doc_page_2, doc_page_1]))
705+
706+
assert output.index("Second input document.") < output.index("First input document.")
707+
680708

681709
# ===========================================================================
682710
# 9. YAML front matter structure

0 commit comments

Comments
 (0)