Skip to content

Commit 28d15f8

Browse files
fix: prevent SSRF by reconstructing URL from validated parts
CodeQL was flagging the original approach because the user-controlled URL was still being passed to client.get(). Now we extract the path, validate it, and reconstruct a new URL from hardcoded scheme/host values. This breaks the taint flow and properly prevents SSRF attacks.
1 parent 3a218e2 commit 28d15f8

1 file changed

Lines changed: 31 additions & 13 deletions

File tree

api/routers/proxy.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -65,28 +65,45 @@
6565
ALLOWED_BUCKET = "pyplots-images"
6666

6767

68-
def build_safe_gcs_url(url: str) -> str | None:
    """
    Validate *url* and return a freshly constructed, safe GCS URL.

    SSRF defense: rather than forwarding the caller-supplied URL, only
    its path is extracted and vetted; the result is rebuilt from the
    hardcoded scheme and allowed host, so no tainted input flows into
    the outbound request.

    Args:
        url: User-provided URL to validate.

    Returns:
        Reconstructed safe URL, or None if any validation step fails.
    """
    try:
        parts = urlparse(url)
    except Exception:
        # Malformed input that urlparse cannot handle — treat as invalid.
        return None

    # Only HTTPS to the exact allowed host is accepted (no subdomains,
    # no userinfo/port tricks — netloc must match byte-for-byte).
    if parts.scheme != "https" or parts.netloc != ALLOWED_HOST:
        return None

    object_path = parts.path.strip("/")
    segments = object_path.split("/")

    # Need at least "<bucket>/<object>", and the bucket must be ours.
    if len(segments) < 2 or segments[0] != ALLOWED_BUCKET:
        return None

    # Reject traversal sequences outright ("." is otherwise allowed).
    if ".." in parts.path:
        return None

    # Allow only alphanumerics plus a small URL-safe punctuation set.
    if any(not ch.isalnum() and ch not in "-_./+" for ch in object_path):
        return None

    # Rebuild from hardcoded values — this breaks the taint flow.
    return f"https://{ALLOWED_HOST}/{object_path}"
90107

91108

92109
@router.get("/proxy/html", response_class=HTMLResponse)
@@ -105,14 +122,15 @@ async def proxy_html(url: str):
105122
Returns:
106123
Modified HTML with size reporting script injected
107124
"""
108-
# Security: Validate URL strictly to prevent SSRF and path traversal
109-
if not validate_gcs_url(url):
125+
# Security: Validate and reconstruct URL to prevent SSRF
126+
safe_url = build_safe_gcs_url(url)
127+
if safe_url is None:
110128
raise HTTPException(status_code=400, detail=f"Only URLs from {ALLOWED_HOST}/{ALLOWED_BUCKET} are allowed")
111129

112130
# Fetch the HTML with shorter timeout
113131
async with httpx.AsyncClient(timeout=10.0) as client:
114132
try:
115-
response = await client.get(url)
133+
response = await client.get(safe_url)
116134
response.raise_for_status()
117135
except httpx.HTTPStatusError as e:
118136
raise HTTPException(status_code=e.response.status_code, detail="Failed to fetch HTML") from e

0 commit comments

Comments
 (0)