Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 115 additions & 67 deletions docs/CustomizeSchemaData.md

Large diffs are not rendered by default.

11 changes: 10 additions & 1 deletion infra/scripts/post_deployment.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,15 @@ if (-not $ApiReady) {

Write-Host " Registering new schema '$ClassName'..."

# Only JSON Schema descriptors are accepted. The legacy .py format
# was removed as part of the schemavault RCE remediation.
$extension = [System.IO.Path]::GetExtension($SchemaFile).ToLowerInvariant()
if ($extension -ne '.json') {
Write-Host " Unsupported schema extension '$extension' for '$SchemaFile'. Only .json is accepted. Skipping..."
continue
Comment thread
Prajwal-Microsoft marked this conversation as resolved.
}
$contentType = 'application/json'

# Build multipart form data
$dataPayload = @{ ClassName = $ClassName; Description = $Description } | ConvertTo-Json -Compress
$fileBytes = [System.IO.File]::ReadAllBytes($SchemaFile)
Expand All @@ -137,7 +146,7 @@ if (-not $ApiReady) {
$dataPayload,
"--$boundary",
"Content-Disposition: form-data; name=`"file`"; filename=`"$fileName`"",
"Content-Type: text/x-python$LF",
"Content-Type: $contentType$LF",
[System.Text.Encoding]::UTF8.GetString($fileBytes),
"--$boundary--$LF"
) -join $LF
Expand Down
11 changes: 10 additions & 1 deletion infra/scripts/post_deployment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,19 @@ else
echo " Registering new schema '$CLASS_NAME'..."
DATA_PAYLOAD="{\"ClassName\": \"$CLASS_NAME\", \"Description\": \"$DESCRIPTION\"}"

# Only JSON Schema descriptors are accepted. The legacy .py format
# was removed as part of the schemavault RCE remediation.
EXT=$(echo "${FILE_NAME##*.}" | tr '[:upper:]' '[:lower:]')
if [ "$EXT" != "json" ]; then
echo " Unsupported schema extension '.$EXT' for '$FILE_NAME'. Only .json is accepted. Skipping..."
continue
Comment thread
Prajwal-Microsoft marked this conversation as resolved.
fi
Comment thread
Prajwal-Microsoft marked this conversation as resolved.
CONTENT_TYPE="application/json"

RESPONSE=$(curl -s -w "\n%{http_code}" \
-X POST "$SCHEMAVAULT_URL" \
-F "data=$DATA_PAYLOAD" \
-F "file=@$SCHEMA_FILE;type=text/x-python" \
-F "file=@$SCHEMA_FILE;type=$CONTENT_TYPE" \
--connect-timeout 60)

HTTP_CODE=$(echo "$RESPONSE" | tail -1)
Expand Down
10 changes: 7 additions & 3 deletions src/ContentProcessor/src/libs/pipeline/entities/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class file (in blob storage) that defines the structured output
"""

import datetime
from typing import Optional
from typing import Literal, Optional

from pydantic import BaseModel, Field

Expand All @@ -21,17 +21,21 @@ class Schema(BaseModel):

Attributes:
Id: Unique schema identifier.
ClassName: Python class name in the remote module.
ClassName: Class name to materialise from the schema artifact.
Description: Human-readable description.
FileName: Blob filename containing the schema class.
FileName: Blob filename containing the schema artifact.
ContentType: Target content type this schema handles.
Format: Storage format of the schema artifact. Always
``"json"`` — declarative JSON Schema descriptors are the
only supported format.
"""

Id: str
ClassName: str
Description: str
FileName: str
ContentType: str
Format: Literal["json"] = Field(default="json")
Created_On: Optional[datetime.datetime] = Field(default=None)
Updated_On: Optional[datetime.datetime] = Field(default=None)

Expand Down
18 changes: 14 additions & 4 deletions src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from libs.pipeline.entities.pipeline_step_result import StepResult
from libs.pipeline.entities.schema import Schema
from libs.pipeline.queue_handler_base import HandlerBase
from libs.utils.remote_module_loader import load_schema_from_blob
from libs.utils.remote_schema_loader import load_schema_from_blob_json

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -151,12 +151,22 @@ async def execute(self, context: MessageContext) -> StepResult:
schema_id=context.data_pipeline.pipeline_status.schema_id,
)

# Load the schema class for structured output
schema_class = load_schema_from_blob(
# Load the schema class for structured output. Only JSON schemas
# are supported; the worker materialises the descriptor as an
# in-memory Pydantic model without ever executing uploaded code.
schema_format = getattr(selected_schema, "Format", "json") or "json"
if schema_format != "json":
raise ValueError(
f"Schema {selected_schema.Id} has unsupported Format "
f"'{schema_format}'. Re-register the schema as a JSON "
"Schema (.json) document; legacy Python (.py) schemas "
"are no longer supported."
Comment thread
Prajwal-Microsoft marked this conversation as resolved.
)
Comment thread
Prajwal-Microsoft marked this conversation as resolved.
schema_class = load_schema_from_blob_json(
account_url=self.application_context.configuration.app_storage_blob_url,
container_name=f"{self.application_context.configuration.app_cps_configuration}/Schemas/{context.data_pipeline.pipeline_status.schema_id}",
blob_name=selected_schema.FileName,
Comment thread
Prajwal-Microsoft marked this conversation as resolved.
module_name=selected_schema.ClassName,
model_name=selected_schema.ClassName,
)

# Invoke Model with Agent Framework SDK
Expand Down
4 changes: 2 additions & 2 deletions src/ContentProcessor/src/libs/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
base64_util: Base-64 encoding detection.
credential_util: Convenience re-export of credential and token-provider
helpers (mirrors azure_credential_utils).
remote_module_loader: Dynamically load Python modules from Azure Blob
Storage.
remote_schema_loader: Materialise Pydantic models from JSON Schema
descriptors stored in Azure Blob Storage (no code execution).
stopwatch: Lightweight elapsed-time measurement context manager.
utils: General-purpose JSON encoding, dict flattening, and value
comparison helpers.
Expand Down
65 changes: 0 additions & 65 deletions src/ContentProcessor/src/libs/utils/remote_module_loader.py

This file was deleted.

Loading
Loading