fix: support ingesting non-English file names and URLs (#2025)

Akhileswara-Microsoft · web-flow · commit 77caef87cade · 2026-02-19T12:44:56.000+05:30
diff --git a/code/backend/batch/add_url_embeddings.py b/code/backend/batch/add_url_embeddings.py
@@ -59,11 +59,15 @@ def process_url_contents_directly(url: str, env_helper: EnvHelper):
 
 def download_url_and_upload_to_blob(url: str):
     try:
-        response = requests.get(url)
+        # Add User-Agent header to avoid being blocked by websites (e.g., Wikipedia)
+        headers = {
+            'User-Agent': 'cwyd-admin-user'
+        }
+        response = requests.get(url, headers=headers)
         parsed_data = BeautifulSoup(response.content, "html.parser")
         with io.BytesIO(parsed_data.get_text().encode("utf-8")) as stream:
             blob_client = AzureBlobStorageClient()
-            blob_client.upload_file(stream, url, metadata={"title": url})
+            blob_client.upload_file(stream, url)
         return func.HttpResponse(f"URL {url} added to knowledge base", status_code=200)
 
     except Exception:
diff --git a/code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py b/code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py
@@ -46,6 +46,10 @@ def create_or_update_indexer(self, indexer_name: str, skillset_name: str):
                     source_field_name="metadata_storage_path",
                     target_field_name="source",
                 ),
+                FieldMapping(
+                    source_field_name="metadata_storage_name",
+                    target_field_name="title",
+                ),
                 FieldMapping(
                     source_field_name="/document/normalized_images/*/text",
                     target_field_name="text",
diff --git a/code/backend/pages/01_Ingest_Data.py b/code/backend/pages/01_Ingest_Data.py
@@ -62,21 +62,31 @@ def add_urls():
 
 
 def sanitize_metadata_value(value):
-    # Remove invalid characters
-    return re.sub(r"[^a-zA-Z0-9-_ .]", "?", value)
+    if not value:
+        return value
+    sanitized = value
+    # Remove characters that are problematic in HTTP headers/URLs
+    # Specifically remove: < > : " | ? * \ (common filesystem/URL issues)
+    sanitized = re.sub(r'[<>:"|?*\\]', '', sanitized)
+    # Remove all space characters
+    sanitized = sanitized.replace(' ', '')
+
+    return sanitized
 
 
 def add_url_embeddings(urls: list[str]):
-    has_valid_url = bool(list(filter(str.strip, urls)))
-    if not has_valid_url:
+    # Filter out empty lines and whitespace before processing
+    valid_urls = [url.strip() for url in urls if url.strip()]
+
+    if not valid_urls:
         st.error("Please enter at least one valid URL.")
         return False
 
     params = {}
     if env_helper.FUNCTION_KEY is not None:
         params["code"] = env_helper.FUNCTION_KEY
         params["clientId"] = "clientKey"
-    for url in urls:
+    for url in valid_urls:
         body = {"url": url}
         backend_url = urllib.parse.urljoin(
             env_helper.BACKEND_URL, "/api/AddURLEmbeddings"
@@ -106,12 +116,12 @@ def add_url_embeddings(urls: list[str]):
             for up in uploaded_files:
                 # To read file as bytes:
                 bytes_data = up.getvalue()
-                title = sanitize_metadata_value(up.name)
                 if st.session_state.get("filename", "") != up.name:
+                    title = sanitize_metadata_value(up.name)
                     # Upload a new file
                     st.session_state["filename"] = up.name
                     st.session_state["file_url"] = blob_client.upload_file(
-                        bytes_data, up.name, metadata={"title": title}
+                        bytes_data, title
                     )
             if len(uploaded_files) > 0:
                 st.success(
diff --git a/code/backend/pages/03_Delete_Data.py b/code/backend/pages/03_Delete_Data.py
@@ -57,13 +57,30 @@ def load_css(file_path):
         st.write("Select files to delete:")
 
     files = search_handler.output_results(results)
+    # Format filenames for display based on configuration
+    container_name = env_helper.AZURE_BLOB_CONTAINER_NAME
+
+    # For CosmosDB with Integrated Vectorization and Semantic Search, display /{container}/filename
+    # For other configurations, display only filename
+    if (
+        env_helper.DATABASE_TYPE == DatabaseType.COSMOSDB.value
+        and env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
+        and env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH
+    ):
+        display_files = {f"/{container_name}/{fname}": fname for fname in files.keys()}
+    else:
+        display_files = {fname: fname for fname in files.keys()}
+
     with st.form("delete_form", clear_on_submit=True, border=False):
         selections = {
-            filename: st.checkbox(filename, False, key=filename)
-            for filename in files.keys()
+            display_name: st.checkbox(display_name, False, key=display_name)
+            for display_name in display_files.keys()
         }
+        # Map display names back to actual filenames
         selected_files = {
-            filename: ids for filename, ids in files.items() if selections[filename]
+            fname: files[fname]
+            for display_name, fname in display_files.items()
+            if selections[display_name]
         }
 
         if st.form_submit_button("Delete"):
diff --git a/code/create_app.py b/code/create_app.py
@@ -9,7 +9,7 @@
 from os import path
 import sys
 import re
-from urllib.parse import quote
+from urllib.parse import quote, unquote
 
 import requests
 from openai import AzureOpenAI, Stream, APIStatusError
@@ -422,7 +422,7 @@ def static_file(path):
     def health():
         return "OK"
 
-    @app.route("/api/files/<filename>", methods=["GET"])
+    @app.route("/api/files/<path:filename>", methods=["GET"])
     def get_file(filename):
         """
         Download a file from the 'docs' container in Azure Blob Storage using Managed Identity.
@@ -433,29 +433,70 @@ def get_file(filename):
         Returns:
             Flask Response: The file content with appropriate headers, or error response
         """
-        logger.info("File download request for: %s", filename)
+        logger.info("File download request (raw): %s", filename)
+        logger.info("File download request (repr): %r", filename)
 
         try:
+            # URL-decode the filename. NOTE(review): WSGI/Werkzeug normally percent-decode the path before routing - confirm this extra decode is needed
+            try:
+                decoded_filename = unquote(filename)
+                logger.info("Decoded filename: %s", decoded_filename)
+                logger.info("Decoded filename (repr): %r", decoded_filename)
+
+                # Detect double-encoding attack
+                # If decoding again changes the value, it was double-encoded
+                double_decoded = unquote(decoded_filename)
+                if double_decoded != decoded_filename:
+                    logger.warning("Double-encoded filename detected: %s", filename)
+                    return jsonify({"error": "Invalid filename encoding"}), 400
+
+            except Exception as decode_error:
+                logger.error("Failed to decode filename: %s", decode_error)
+                return jsonify({"error": "Invalid filename encoding"}), 400
+
+            # Use decoded filename for all subsequent operations
+            filename = decoded_filename
+
             # Enhanced input validation - prevent path traversal and unauthorized access
             if not filename:
                 logger.warning("Empty filename provided")
                 return jsonify({"error": "Filename is required"}), 400
 
-            # Prevent path traversal attacks
-            if '..' in filename or '/' in filename or '\\' in filename:
-                logger.warning("Invalid filename with path traversal attempt: %s", filename)
-                return jsonify({"error": "Invalid filename"}), 400
+            # Detect if it's a URL vs regular filename
+            is_url = filename.startswith(('http://', 'https://'))
 
-            # Validate filename length and characters
-            if len(filename) > 255:
+            # Check for path traversal attacks
+            if is_url:
+                # For URLs, block directory traversal patterns
+                if '/../' in filename or '\\..\\' in filename or filename.endswith('/..') or filename.endswith('\\..'):
+                    logger.warning("Path traversal attempt in URL: %s", filename)
+                    return jsonify({"error": "Invalid filename"}), 400
+            else:
+                # For regular files, block path separators first
+                if '/' in filename or '\\' in filename:
+                    logger.warning("Path separators in regular filename: %s", filename)
+                    return jsonify({"error": "Invalid filename"}), 400
+                # Note: .. without path separators is safe (e.g., version..2.pdf)
+
+            # Validate filename length (URLs can be longer)
+            max_length = 2048 if is_url else 255
+            if len(filename) > max_length:
                 logger.warning("Filename too long: %s", filename)
                 return jsonify({"error": "Filename too long"}), 400
 
-            # Only allow safe characters (alphanumeric, dots, dashes, underscores, spaces)
-            if not re.match(r'^[a-zA-Z0-9._\-\s]+$', filename):
-                logger.warning("Filename contains invalid characters: %s", filename)
+            # Block control characters - allows multilingual filenames (Japanese, Hebrew, Arabic, etc.)
+            # This regex allows every character except ASCII control characters (U+0000-U+001F and U+007F)
+            if not re.match(r'^[^\x00-\x1f\x7f]+$', filename):
+                logger.warning("Filename contains invalid control characters: %s", filename)
                 return jsonify({"error": "Invalid filename characters"}), 400
 
+            # For URLs, additional URL-specific validation
+            if is_url:
+                # Validate URL format: must start with http:// or https:// and not contain whitespace or control chars
+                if not re.match(r'^https?://[^\s\x00-\x1f\x7f]+$', filename):
+                    logger.warning("Invalid URL format: %s", filename)
+                    return jsonify({"error": "Invalid URL format"}), 400
+
             # Initialize blob storage client with 'documents' container
             blob_client = AzureBlobStorageClient(container_name="documents")
 
@@ -480,12 +521,14 @@ def get_file(filename):
                 logger.info("Large file detected: %s, size: %d bytes", filename, file_size)
 
             # Create response with comprehensive headers
+            # Use RFC 5987 encoding for Unicode filenames in Content-Disposition
+            encoded_filename = quote(filename)
             response = Response(
                 file_data,
                 status=200,
                 mimetype=content_type,
                 headers={
-                    'Content-Disposition': f'inline; filename="{filename}"',
+                    'Content-Disposition': f"inline; filename*=UTF-8''{encoded_filename}",
                     'Content-Length': str(file_size),
                     'Cache-Control': 'public, max-age=3600',
                     'X-Content-Type-Options': 'nosniff',
diff --git a/code/frontend/src/components/Answer/Answer.tsx b/code/frontend/src/components/Answer/Answer.tsx
@@ -158,11 +158,26 @@ export const Answer = ({
     let citationFilename = "";
 
     if (citation.filepath && citation.chunk_id != null) {
-      if (truncate && citation.filepath.length > filePathTruncationLimit) {
-        const citationLength = citation.filepath.length;
-        citationFilename = `${citation.filepath.substring(0, 20)}...${citation.filepath.substring(citationLength - 20)} - Part ${citation.chunk_id}`;
+      // Decode the URL-encoded filepath from backend, falling back to the original on failure
+      let decodedFilepath: string;
+      try {
+        decodedFilepath = decodeURIComponent(citation.filepath);
+      } catch (error) {
+         console.warn("Failed to decode citation filepath:", citation.filepath, error);
+         decodedFilepath = citation.filepath;
+      }
+
+      // Strip any leading path such as a container prefix, keeping only the last segment (e.g., "documents/filename.pdf" -> "filename.pdf")
+      const isLikelyUrl = /^https?:\/\//i.test(decodedFilepath);
+      if (!isLikelyUrl && decodedFilepath.includes("/")) {
+        decodedFilepath = decodedFilepath.split("/").pop() || decodedFilepath;
+      }
+
+      if (truncate && decodedFilepath.length > filePathTruncationLimit) {
+        const citationLength = decodedFilepath.length;
+        citationFilename = `${decodedFilepath.substring(0, 20)}...${decodedFilepath.substring(citationLength - 20)} - Part ${citation.chunk_id}`;
       } else {
-        citationFilename = `${citation.filepath} - Part ${citation.chunk_id}`;
+        citationFilename = `${decodedFilepath} - Part ${citation.chunk_id}`;
       }
     } else {
       citationFilename = `Citation ${index}`;
diff --git a/code/frontend/src/components/CitationPanel/CitationPanel.tsx b/code/frontend/src/components/CitationPanel/CitationPanel.tsx
@@ -20,12 +20,22 @@ function rewriteCitationUrl(markdownText: string) {
         const blobStorageHost = 'blob.core.windows.net';
 
         if (parsed.hostname.includes(blobStorageHost)) {
-          // Extract the filename from the path
-          const filename = parsed.pathname.split('/').pop();
-          return `[${title}](/api/files/${filename})`;
+          // Extract the path after the container name (e.g., /documents/filename or /documents/https://...)
+          const pathParts = parsed.pathname.split('/');
+          // Remove empty first element and container name, join the rest
+          const filenameOrUrl = pathParts.slice(2).join('/');
+
+          // Check if it's a URL (BYOD case where URL is stored as blob path)
+          if (filenameOrUrl.startsWith('http://') || filenameOrUrl.startsWith('https://')) {
+            const decodedUrl = decodeURIComponent(filenameOrUrl);
+            return `[${title}](${decodedUrl})`;
+          }
+
+          const decodedFilename = decodeURIComponent(filenameOrUrl);
+          return `[${title}](/api/files/${decodedFilename})`;
         } else {
-          // Return the full external URL
-          return `[${title}](${parsed.href})`;
+          const decodedparsedhref = decodeURIComponent(parsed.href);
+          return `[${title}](${decodedparsedhref})`;
         }
       } catch {
         return match; // fallback if URL parsing fails
diff --git a/code/tests/test_add_url_embeddings.py b/code/tests/test_add_url_embeddings.py
@@ -100,8 +100,9 @@ def test_add_url_embeddings_integrated_vectorization(
 
     # then
     assert response.status_code == 200
+    mock_get.assert_called_once_with(url, headers={'User-Agent': 'cwyd-admin-user'})
     mock_blob_storage_client_instance.upload_file.assert_called_once_with(
-        ANY, url, metadata={"title": url}
+        ANY, url
     )
 
 
@@ -137,6 +138,7 @@ def test_add_url_embeddings_integrated_vectorization_returns_500_when_exception_
 
     # then
     assert response.status_code == 500
+    mock_get.assert_called_once_with(url, headers={'User-Agent': 'cwyd-admin-user'})
     assert (
         b"Error occurred while adding https://example.com to the knowledge base."
         in response.get_body()
diff --git a/code/tests/test_create_app.py b/code/tests/test_create_app.py
@@ -3,6 +3,7 @@
 """
 
 from unittest.mock import AsyncMock, MagicMock, Mock, patch
+from urllib.parse import quote
 
 from azure.core.exceptions import ClientAuthenticationError, ResourceNotFoundError, ServiceRequestError
 from openai import RateLimitError, BadRequestError, InternalServerError
@@ -949,7 +950,7 @@ def test_get_file_success(self, mock_blob_client_class, client):
         assert response.status_code == 200
         assert response.data == file_content
         assert response.headers["Content-Type"] == "application/pdf"
-        assert response.headers["Content-Disposition"] == f'inline; filename="{filename}"'
+        assert response.headers["Content-Disposition"] == f"inline; filename*=UTF-8''{quote(filename)}"
         assert response.headers["Content-Length"] == str(len(file_content))
         assert response.headers["Cache-Control"] == "public, max-age=3600"
         assert response.headers["X-Content-Type-Options"] == "nosniff"