99from os import path
1010import sys
1111import re
12- from urllib .parse import quote
12+ from urllib .parse import quote , unquote
1313
1414import requests
1515from openai import AzureOpenAI , Stream , APIStatusError
@@ -422,7 +422,7 @@ def static_file(path):
def health():
    """Return the fixed liveness payload for the service.

    The function takes no input and performs no checks of its own; it
    simply answers with a constant so a caller can tell the process is up.
    NOTE(review): the route decorator that exposes this function sits above
    the visible part of the file -- confirm the URL it is bound to there.

    Returns:
        str: Always the literal string "OK".
    """
    return "OK"
424424
425- @app .route ("/api/files/<filename>" , methods = ["GET" ])
425+ @app .route ("/api/files/<path: filename>" , methods = ["GET" ])
426426 def get_file (filename ):
427427 """
428428 Download a file from the 'docs' container in Azure Blob Storage using Managed Identity.
@@ -433,29 +433,70 @@ def get_file(filename):
433433 Returns:
434434 Flask Response: The file content with appropriate headers, or error response
435435 """
436- logger .info ("File download request for: %s" , filename )
436+ logger .info ("File download request (raw): %s" , filename )
437+ logger .info ("File download request (repr): %r" , filename )
437438
438439 try :
440+ # URL decode the filename (Flask's path converter doesn't decode)
441+ try :
442+ decoded_filename = unquote (filename )
443+ logger .info ("Decoded filename: %s" , decoded_filename )
444+ logger .info ("Decoded filename (repr): %r" , decoded_filename )
445+
446+ # Detect double-encoding attack
447+ # If decoding again changes the value, it was double-encoded
448+ double_decoded = unquote (decoded_filename )
449+ if double_decoded != decoded_filename :
450+ logger .warning ("Double-encoded filename detected: %s" , filename )
451+ return jsonify ({"error" : "Invalid filename encoding" }), 400
452+
453+ except Exception as decode_error :
454+ logger .error ("Failed to decode filename: %s" , decode_error )
455+ return jsonify ({"error" : "Invalid filename encoding" }), 400
456+
457+ # Use decoded filename for all subsequent operations
458+ filename = decoded_filename
459+
439460 # Enhanced input validation - prevent path traversal and unauthorized access
440461 if not filename :
441462 logger .warning ("Empty filename provided" )
442463 return jsonify ({"error" : "Filename is required" }), 400
443464
444- # Prevent path traversal attacks
445- if '..' in filename or '/' in filename or '\\ ' in filename :
446- logger .warning ("Invalid filename with path traversal attempt: %s" , filename )
447- return jsonify ({"error" : "Invalid filename" }), 400
465+ # Detect if it's a URL vs regular filename
466+ is_url = filename .startswith (('http://' , 'https://' ))
448467
449- # Validate filename length and characters
450- if len (filename ) > 255 :
468+ # Check for path traversal attacks
469+ if is_url :
470+ # For URLs, block directory traversal patterns
471+ if '/../' in filename or '\\ ..\\ ' in filename or filename .endswith ('/..' ) or filename .endswith ('\\ ..' ):
472+ logger .warning ("Path traversal attempt in URL: %s" , filename )
473+ return jsonify ({"error" : "Invalid filename" }), 400
474+ else :
475+ # For regular files, block path separators first
476+ if '/' in filename or '\\ ' in filename :
477+ logger .warning ("Path separators in regular filename: %s" , filename )
478+ return jsonify ({"error" : "Invalid filename" }), 400
479+ # Note: .. without path separators is safe (e.g., version..2.pdf)
480+
481+ # Validate filename length (URLs can be longer)
482+ max_length = 2048 if is_url else 255
483+ if len (filename ) > max_length :
451484 logger .warning ("Filename too long: %s" , filename )
452485 return jsonify ({"error" : "Filename too long" }), 400
453486
454- # Only allow safe characters (alphanumeric, dots, dashes, underscores, spaces)
455- if not re .match (r'^[a-zA-Z0-9._\-\s]+$' , filename ):
456- logger .warning ("Filename contains invalid characters: %s" , filename )
487+ # Block control characters - allows multilingual filenames (Japanese, Hebrew, Arabic, etc.)
488+ # This regex allows all Unicode characters except control characters
489+ if not re .match (r'^[^\x00-\x1f\x7f]+$' , filename ):
490+ logger .warning ("Filename contains invalid control characters: %s" , filename )
457491 return jsonify ({"error" : "Invalid filename characters" }), 400
458492
493+ # For URLs, additional URL-specific validation
494+ if is_url :
495+ # Validate URL format: must start with http:// or https:// and not contain whitespace or control chars
496+ if not re .match (r'^https?://[^\s\x00-\x1f\x7f]+$' , filename ):
497+ logger .warning ("Invalid URL format: %s" , filename )
498+ return jsonify ({"error" : "Invalid URL format" }), 400
499+
459500 # Initialize blob storage client with 'documents' container
460501 blob_client = AzureBlobStorageClient (container_name = "documents" )
461502
@@ -480,12 +521,14 @@ def get_file(filename):
480521 logger .info ("Large file detected: %s, size: %d bytes" , filename , file_size )
481522
482523 # Create response with comprehensive headers
524+ # Use RFC 5987 encoding for Unicode filenames in Content-Disposition
525+ encoded_filename = quote (filename )
483526 response = Response (
484527 file_data ,
485528 status = 200 ,
486529 mimetype = content_type ,
487530 headers = {
488- 'Content-Disposition' : f' inline; filename=" { filename } "' ,
531+ 'Content-Disposition' : f" inline; filename*=UTF-8'' { encoded_filename } " ,
489532 'Content-Length' : str (file_size ),
490533 'Cache-Control' : 'public, max-age=3600' ,
491534 'X-Content-Type-Options' : 'nosniff' ,
0 commit comments