diff --git a/.github/workflows/validate-bicep-params.yml b/.github/workflows/validate-bicep-params.yml index 4ae614ee..ffe3e73c 100644 --- a/.github/workflows/validate-bicep-params.yml +++ b/.github/workflows/validate-bicep-params.yml @@ -35,9 +35,9 @@ jobs: continue-on-error: true env: ACCELERATOR_NAME: ${{ env.accelerator_name }} + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | set +e - RUN_URL="https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" python infra/scripts/validate_bicep_params.py --dir infra --strict --no-color \ --json-output infra_results.json \ --html-output email_body.html \ diff --git a/infra/scripts/validate_bicep_params.py b/infra/scripts/validate_bicep_params.py index 6da7d91e..dc6fcba9 100644 --- a/infra/scripts/validate_bicep_params.py +++ b/infra/scripts/validate_bicep_params.py @@ -29,6 +29,7 @@ from __future__ import annotations import argparse +import html import json import re import sys @@ -346,13 +347,14 @@ def print_report(results: list[ValidationResult], *, use_color: bool = True) -> # --------------------------------------------------------------------------- def _html_escape(text: str) -> str: - """Escape HTML special characters.""" - return ( - text.replace("&", "&amp;") - .replace("<", "&lt;") - .replace(">", "&gt;") - .replace('"', "&quot;") - ) + """Escape HTML special characters (including quotes) for safe use in + both element content and attribute values. + + Thin wrapper around :func:`html.escape` so we can keep a single, stable + call-site in this module while delegating the actual escaping rules to + the stdlib. 
+ """ + return html.escape(text, quote=True) def generate_html_report( diff --git a/src/ContentProcessor/src/libs/azure_helper/content_understanding.py b/src/ContentProcessor/src/libs/azure_helper/content_understanding.py index 26c3a861..7ed038a4 100644 --- a/src/ContentProcessor/src/libs/azure_helper/content_understanding.py +++ b/src/ContentProcessor/src/libs/azure_helper/content_understanding.py @@ -12,6 +12,7 @@ import logging import time from pathlib import Path +from typing import Optional import requests from requests.models import Response @@ -294,20 +295,32 @@ def begin_analyze(self, analyzer_id: str, file_location: str): def get_image_from_analyze_operation( self, analyze_response: Response, image_id: str - ): - """Retrieves a generated file (e.g., a rendered page image) from a - completed analyze operation by its file id / path. - - In Content Understanding GA the file-retrieval URL changed from - ``{operationLocation}/images/{imageId}`` to - ``{operationLocation}/files/{fileId}`` (where ``operationLocation`` now - ends in ``/analyzerResults/{operationId}``). + ) -> Optional[bytes]: + """Retrieve a rendered page image (JPEG) generated by a completed + analyze operation, by its file id / path. + + Although the GA file-retrieval endpoint is generic + (``{operationLocation}/files/{fileId}``, replacing the legacy + ``{operationLocation}/images/{imageId}``), this helper is intentionally + image-specific: it asserts that the returned ``Content-Type`` is + ``image/jpeg`` and is only intended for use with JPEG page images + produced by the analyzer. Use a different helper if you need to fetch + non-image generated files. Args: - analyze_response (Response): The response object from the analyze operation. - image_id (str): The id (or path) of the file to retrieve. + analyze_response (Response): The response object from the analyze + operation (used only to read its ``operation-location`` header). + image_id (str): The id (or path) of the image file to retrieve. 
+ Returns: - bytes: The file content as a byte string. + Optional[bytes]: The JPEG image bytes on success, or ``None`` if + the HTTP request fails (the underlying :class:`RequestException` + is logged but not re-raised). + + Raises: + ValueError: If the analyze response does not contain an + ``operation-location`` header. + AssertionError: If the retrieved file is not ``image/jpeg``. """ operation_location = analyze_response.headers.get("operation-location", "") if not operation_location: @@ -326,7 +339,7 @@ def get_image_from_analyze_operation( return response.content except requests.exceptions.RequestException as e: - print(f"HTTP request failed: {e}") + self._logger.error("HTTP request failed while retrieving image: %s", e) return None def poll_result(