# The single GCS bucket the HTML proxy endpoint is permitted to fetch from.
ALLOWED_BUCKET = "pyplots-images"
68- def validate_gcs_url (url : str ) -> bool :
69- """Validate that URL is from allowed GCS bucket with no path traversal."""
68+ def build_safe_gcs_url (url : str ) -> str | None :
69+ """
70+ Validate URL and return a reconstructed safe GCS URL.
71+
72+ This prevents SSRF by constructing the URL from hardcoded values
73+ instead of passing user input directly.
74+
75+ Args:
76+ url: User-provided URL to validate
77+
78+ Returns:
79+ Reconstructed safe URL or None if validation fails
80+ """
7081 try :
7182 parsed = urlparse (url )
7283 # Must be HTTPS
7384 if parsed .scheme != "https" :
74- return False
85+ return None
7586 # Must be exact host (no subdomains)
7687 if parsed .netloc != ALLOWED_HOST :
77- return False
78- # Path must start with bucket name and not contain traversal
88+ return None
89+ # Path must start with bucket name
7990 path_parts = parsed .path .strip ("/" ).split ("/" )
8091 if len (path_parts ) < 2 :
81- return False
92+ return None
8293 if path_parts [0 ] != ALLOWED_BUCKET :
83- return False
94+ return None
8495 # Check for path traversal attempts
8596 if ".." in parsed .path :
86- return False
87- return True
97+ return None
98+ # Validate path contains only safe characters (alphanumeric, hyphens, underscores, dots, slashes)
99+ safe_path = parsed .path .strip ("/" )
100+ if not all (c .isalnum () or c in "-_./+" for c in safe_path ):
101+ return None
102+ # Reconstruct URL from hardcoded values to prevent SSRF
103+ # This breaks the taint flow by not using the original URL
104+ return f"https://{ ALLOWED_HOST } /{ safe_path } "
88105 except Exception :
89- return False
106+ return None
90107
91108
92109@router .get ("/proxy/html" , response_class = HTMLResponse )
@@ -105,14 +122,15 @@ async def proxy_html(url: str):
105122 Returns:
106123 Modified HTML with size reporting script injected
107124 """
108- # Security: Validate URL strictly to prevent SSRF and path traversal
109- if not validate_gcs_url (url ):
125+ # Security: Validate and reconstruct URL to prevent SSRF
126+ safe_url = build_safe_gcs_url (url )
127+ if safe_url is None :
110128 raise HTTPException (status_code = 400 , detail = f"Only URLs from { ALLOWED_HOST } /{ ALLOWED_BUCKET } are allowed" )
111129
112130 # Fetch the HTML with shorter timeout
113131 async with httpx .AsyncClient (timeout = 10.0 ) as client :
114132 try :
115- response = await client .get (url )
133+ response = await client .get (safe_url )
116134 response .raise_for_status ()
117135 except httpx .HTTPStatusError as e :
118136 raise HTTPException (status_code = e .response .status_code , detail = "Failed to fetch HTML" ) from e
0 commit comments