Skip to content

Commit 77caef8

Browse files
fix: fix for ingest non-English file names and URLs (#2025)
1 parent 5d729e8 commit 77caef8

9 files changed

Lines changed: 142 additions & 36 deletions

File tree

code/backend/batch/add_url_embeddings.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,15 @@ def process_url_contents_directly(url: str, env_helper: EnvHelper):
5959

6060
def download_url_and_upload_to_blob(url: str):
6161
try:
62-
response = requests.get(url)
62+
# Add User-Agent header to avoid being blocked by websites (e.g., Wikipedia)
63+
headers = {
64+
'User-Agent': 'cwyd-admin-user'
65+
}
66+
response = requests.get(url, headers=headers)
6367
parsed_data = BeautifulSoup(response.content, "html.parser")
6468
with io.BytesIO(parsed_data.get_text().encode("utf-8")) as stream:
6569
blob_client = AzureBlobStorageClient()
66-
blob_client.upload_file(stream, url, metadata={"title": url})
70+
blob_client.upload_file(stream, url)
6771
return func.HttpResponse(f"URL {url} added to knowledge base", status_code=200)
6872

6973
except Exception:

code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ def create_or_update_indexer(self, indexer_name: str, skillset_name: str):
4646
source_field_name="metadata_storage_path",
4747
target_field_name="source",
4848
),
49+
FieldMapping(
50+
source_field_name="metadata_storage_name",
51+
target_field_name="title",
52+
),
4953
FieldMapping(
5054
source_field_name="/document/normalized_images/*/text",
5155
target_field_name="text",

code/backend/pages/01_Ingest_Data.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -62,21 +62,31 @@ def add_urls():
6262

6363

6464
def sanitize_metadata_value(value):
65-
# Remove invalid characters
66-
return re.sub(r"[^a-zA-Z0-9-_ .]", "?", value)
65+
if not value:
66+
return value
67+
sanitized = value
68+
# Remove characters that are problematic in HTTP headers/URLs
69+
# Specifically remove: < > : " | ? * \ (common filesystem/URL issues)
70+
sanitized = re.sub(r'[<>:"|?*\\]', '', sanitized)
71+
# Remove empty spaces
72+
sanitized = sanitized.replace(' ', '')
73+
74+
return sanitized
6775

6876

6977
def add_url_embeddings(urls: list[str]):
70-
has_valid_url = bool(list(filter(str.strip, urls)))
71-
if not has_valid_url:
78+
# Filter out empty lines and whitespace before processing
79+
valid_urls = [url.strip() for url in urls if url.strip()]
80+
81+
if not valid_urls:
7282
st.error("Please enter at least one valid URL.")
7383
return False
7484

7585
params = {}
7686
if env_helper.FUNCTION_KEY is not None:
7787
params["code"] = env_helper.FUNCTION_KEY
7888
params["clientId"] = "clientKey"
79-
for url in urls:
89+
for url in valid_urls:
8090
body = {"url": url}
8191
backend_url = urllib.parse.urljoin(
8292
env_helper.BACKEND_URL, "/api/AddURLEmbeddings"
@@ -106,12 +116,12 @@ def add_url_embeddings(urls: list[str]):
106116
for up in uploaded_files:
107117
# To read file as bytes:
108118
bytes_data = up.getvalue()
109-
title = sanitize_metadata_value(up.name)
110119
if st.session_state.get("filename", "") != up.name:
120+
title = sanitize_metadata_value(up.name)
111121
# Upload a new file
112122
st.session_state["filename"] = up.name
113123
st.session_state["file_url"] = blob_client.upload_file(
114-
bytes_data, up.name, metadata={"title": title}
124+
bytes_data, title
115125
)
116126
if len(uploaded_files) > 0:
117127
st.success(

code/backend/pages/03_Delete_Data.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,30 @@ def load_css(file_path):
5757
st.write("Select files to delete:")
5858

5959
files = search_handler.output_results(results)
60+
# Format filenames for display based on configuration
61+
container_name = env_helper.AZURE_BLOB_CONTAINER_NAME
62+
63+
# For CosmosDB with Integrated Vectorization and Semantic Search, display /{container}/filename
64+
# For other configurations, display only filename
65+
if (
66+
env_helper.DATABASE_TYPE == DatabaseType.COSMOSDB.value
67+
and env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
68+
and env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH
69+
):
70+
display_files = {f"/{container_name}/{fname}": fname for fname in files.keys()}
71+
else:
72+
display_files = {fname: fname for fname in files.keys()}
73+
6074
with st.form("delete_form", clear_on_submit=True, border=False):
6175
selections = {
62-
filename: st.checkbox(filename, False, key=filename)
63-
for filename in files.keys()
76+
display_name: st.checkbox(display_name, False, key=display_name)
77+
for display_name in display_files.keys()
6478
}
79+
# Map display names back to actual filenames
6580
selected_files = {
66-
filename: ids for filename, ids in files.items() if selections[filename]
81+
fname: files[fname]
82+
for display_name, fname in display_files.items()
83+
if selections[display_name]
6784
}
6885

6986
if st.form_submit_button("Delete"):

code/create_app.py

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from os import path
1010
import sys
1111
import re
12-
from urllib.parse import quote
12+
from urllib.parse import quote, unquote
1313

1414
import requests
1515
from openai import AzureOpenAI, Stream, APIStatusError
@@ -422,7 +422,7 @@ def static_file(path):
422422
def health():
423423
return "OK"
424424

425-
@app.route("/api/files/<filename>", methods=["GET"])
425+
@app.route("/api/files/<path:filename>", methods=["GET"])
426426
def get_file(filename):
427427
"""
428428
Download a file from the 'docs' container in Azure Blob Storage using Managed Identity.
@@ -433,29 +433,70 @@ def get_file(filename):
433433
Returns:
434434
Flask Response: The file content with appropriate headers, or error response
435435
"""
436-
logger.info("File download request for: %s", filename)
436+
logger.info("File download request (raw): %s", filename)
437+
logger.info("File download request (repr): %r", filename)
437438

438439
try:
440+
# URL decode the filename (Flask's path converter doesn't decode)
441+
try:
442+
decoded_filename = unquote(filename)
443+
logger.info("Decoded filename: %s", decoded_filename)
444+
logger.info("Decoded filename (repr): %r", decoded_filename)
445+
446+
# Detect double-encoding attack
447+
# If decoding again changes the value, it was double-encoded
448+
double_decoded = unquote(decoded_filename)
449+
if double_decoded != decoded_filename:
450+
logger.warning("Double-encoded filename detected: %s", filename)
451+
return jsonify({"error": "Invalid filename encoding"}), 400
452+
453+
except Exception as decode_error:
454+
logger.error("Failed to decode filename: %s", decode_error)
455+
return jsonify({"error": "Invalid filename encoding"}), 400
456+
457+
# Use decoded filename for all subsequent operations
458+
filename = decoded_filename
459+
439460
# Enhanced input validation - prevent path traversal and unauthorized access
440461
if not filename:
441462
logger.warning("Empty filename provided")
442463
return jsonify({"error": "Filename is required"}), 400
443464

444-
# Prevent path traversal attacks
445-
if '..' in filename or '/' in filename or '\\' in filename:
446-
logger.warning("Invalid filename with path traversal attempt: %s", filename)
447-
return jsonify({"error": "Invalid filename"}), 400
465+
# Detect if it's a URL vs regular filename
466+
is_url = filename.startswith(('http://', 'https://'))
448467

449-
# Validate filename length and characters
450-
if len(filename) > 255:
468+
# Check for path traversal attacks
469+
if is_url:
470+
# For URLs, block directory traversal patterns
471+
if '/../' in filename or '\\..\\' in filename or filename.endswith('/..') or filename.endswith('\\..'):
472+
logger.warning("Path traversal attempt in URL: %s", filename)
473+
return jsonify({"error": "Invalid filename"}), 400
474+
else:
475+
# For regular files, block path separators first
476+
if '/' in filename or '\\' in filename:
477+
logger.warning("Path separators in regular filename: %s", filename)
478+
return jsonify({"error": "Invalid filename"}), 400
479+
# Note: .. without path separators is safe (e.g., version..2.pdf)
480+
481+
# Validate filename length (URLs can be longer)
482+
max_length = 2048 if is_url else 255
483+
if len(filename) > max_length:
451484
logger.warning("Filename too long: %s", filename)
452485
return jsonify({"error": "Filename too long"}), 400
453486

454-
# Only allow safe characters (alphanumeric, dots, dashes, underscores, spaces)
455-
if not re.match(r'^[a-zA-Z0-9._\-\s]+$', filename):
456-
logger.warning("Filename contains invalid characters: %s", filename)
487+
# Block control characters - allows multilingual filenames (Japanese, Hebrew, Arabic, etc.)
488+
# This regex allows all Unicode characters except control characters
489+
if not re.match(r'^[^\x00-\x1f\x7f]+$', filename):
490+
logger.warning("Filename contains invalid control characters: %s", filename)
457491
return jsonify({"error": "Invalid filename characters"}), 400
458492

493+
# For URLs, additional URL-specific validation
494+
if is_url:
495+
# Validate URL format: must start with http:// or https:// and not contain whitespace or control chars
496+
if not re.match(r'^https?://[^\s\x00-\x1f\x7f]+$', filename):
497+
logger.warning("Invalid URL format: %s", filename)
498+
return jsonify({"error": "Invalid URL format"}), 400
499+
459500
# Initialize blob storage client with 'documents' container
460501
blob_client = AzureBlobStorageClient(container_name="documents")
461502

@@ -480,12 +521,14 @@ def get_file(filename):
480521
logger.info("Large file detected: %s, size: %d bytes", filename, file_size)
481522

482523
# Create response with comprehensive headers
524+
# Use RFC 5987 encoding for Unicode filenames in Content-Disposition
525+
encoded_filename = quote(filename)
483526
response = Response(
484527
file_data,
485528
status=200,
486529
mimetype=content_type,
487530
headers={
488-
'Content-Disposition': f'inline; filename="{filename}"',
531+
'Content-Disposition': f"inline; filename*=UTF-8''{encoded_filename}",
489532
'Content-Length': str(file_size),
490533
'Cache-Control': 'public, max-age=3600',
491534
'X-Content-Type-Options': 'nosniff',

code/frontend/src/components/Answer/Answer.tsx

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -158,11 +158,26 @@ export const Answer = ({
158158
let citationFilename = "";
159159

160160
if (citation.filepath && citation.chunk_id != null) {
161-
if (truncate && citation.filepath.length > filePathTruncationLimit) {
162-
const citationLength = citation.filepath.length;
163-
citationFilename = `${citation.filepath.substring(0, 20)}...${citation.filepath.substring(citationLength - 20)} - Part ${citation.chunk_id}`;
161+
// Decode the URL-encoded filepath from backend, falling back to the original on failure
162+
let decodedFilepath: string;
163+
try {
164+
decodedFilepath = decodeURIComponent(citation.filepath);
165+
} catch (error) {
166+
console.warn("Failed to decode citation filepath:", citation.filepath, error);
167+
decodedFilepath = citation.filepath;
168+
}
169+
170+
// Strip container prefix if present (e.g., "documents/filename.pdf" -> "filename.pdf")
171+
const isLikelyUrl = /^https?:\/\//i.test(decodedFilepath);
172+
if (!isLikelyUrl && decodedFilepath.includes("/")) {
173+
decodedFilepath = decodedFilepath.split("/").pop() || decodedFilepath;
174+
}
175+
176+
if (truncate && decodedFilepath.length > filePathTruncationLimit) {
177+
const citationLength = decodedFilepath.length;
178+
citationFilename = `${decodedFilepath.substring(0, 20)}...${decodedFilepath.substring(citationLength - 20)} - Part ${citation.chunk_id}`;
164179
} else {
165-
citationFilename = `${citation.filepath} - Part ${citation.chunk_id}`;
180+
citationFilename = `${decodedFilepath} - Part ${citation.chunk_id}`;
166181
}
167182
} else {
168183
citationFilename = `Citation ${index}`;

code/frontend/src/components/CitationPanel/CitationPanel.tsx

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,22 @@ function rewriteCitationUrl(markdownText: string) {
2020
const blobStorageHost = 'blob.core.windows.net';
2121

2222
if (parsed.hostname.includes(blobStorageHost)) {
23-
// Extract the filename from the path
24-
const filename = parsed.pathname.split('/').pop();
25-
return `[${title}](/api/files/${filename})`;
23+
// Extract the path after the container name (e.g., /documents/filename or /documents/https://...)
24+
const pathParts = parsed.pathname.split('/');
25+
// Remove empty first element and container name, join the rest
26+
const filenameOrUrl = pathParts.slice(2).join('/');
27+
28+
// Check if it's a URL (BYOD case where URL is stored as blob path)
29+
if (filenameOrUrl.startsWith('http://') || filenameOrUrl.startsWith('https://')) {
30+
const decodedUrl = decodeURIComponent(filenameOrUrl);
31+
return `[${title}](${decodedUrl})`;
32+
}
33+
34+
const decodedFilename = decodeURIComponent(filenameOrUrl);
35+
return `[${title}](/api/files/${decodedFilename})`;
2636
} else {
27-
// Return the full external URL
28-
return `[${title}](${parsed.href})`;
37+
const decodedparsedhref = decodeURIComponent(parsed.href);
38+
return `[${title}](${decodedparsedhref})`;
2939
}
3040
} catch {
3141
return match; // fallback if URL parsing fails

code/tests/test_add_url_embeddings.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,9 @@ def test_add_url_embeddings_integrated_vectorization(
100100

101101
# then
102102
assert response.status_code == 200
103+
mock_get.assert_called_once_with(url, headers={'User-Agent': 'cwyd-admin-user'})
103104
mock_blob_storage_client_instance.upload_file.assert_called_once_with(
104-
ANY, url, metadata={"title": url}
105+
ANY, url
105106
)
106107

107108

@@ -137,6 +138,7 @@ def test_add_url_embeddings_integrated_vectorization_returns_500_when_exception_
137138

138139
# then
139140
assert response.status_code == 500
141+
mock_get.assert_called_once_with(url, headers={'User-Agent': 'cwyd-admin-user'})
140142
assert (
141143
b"Error occurred while adding https://example.com to the knowledge base."
142144
in response.get_body()

code/tests/test_create_app.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
from unittest.mock import AsyncMock, MagicMock, Mock, patch
6+
from urllib.parse import quote
67

78
from azure.core.exceptions import ClientAuthenticationError, ResourceNotFoundError, ServiceRequestError
89
from openai import RateLimitError, BadRequestError, InternalServerError
@@ -949,7 +950,7 @@ def test_get_file_success(self, mock_blob_client_class, client):
949950
assert response.status_code == 200
950951
assert response.data == file_content
951952
assert response.headers["Content-Type"] == "application/pdf"
952-
assert response.headers["Content-Disposition"] == f'inline; filename="{filename}"'
953+
assert response.headers["Content-Disposition"] == f"inline; filename*=UTF-8''{quote(filename)}"
953954
assert response.headers["Content-Length"] == str(len(file_content))
954955
assert response.headers["Cache-Control"] == "public, max-age=3600"
955956
assert response.headers["X-Content-Type-Options"] == "nosniff"

0 commit comments

Comments
 (0)