|
| 1 | +"""HTML proxy endpoint for interactive plots with size reporting.""" |
| 2 | + |
| 3 | +from urllib.parse import urlparse |
| 4 | + |
| 5 | +import httpx |
| 6 | +from fastapi import APIRouter, HTTPException |
| 7 | +from fastapi.responses import HTMLResponse |
| 8 | + |
| 9 | + |
# Router that the endpoints in this module register on; created with the "proxy" tag.
router = APIRouter(tags=["proxy"])
| 11 | + |
# Script injected into proxied pages so they report their rendered content
# size to the embedding window. The postMessage call names a specific target
# origin (https://pyplots.ai) instead of "*" for security.
SIZE_REPORTER_SCRIPT = """
<script>
(function() {
    // Delays (ms) after load at which the size is re-measured, to catch
    // libraries that render asynchronously
    var REPORT_DELAYS_MS = [100, 500, 1000];

    function reportSize() {
        try {
            // Nothing to report to when the page is not embedded in a frame
            if (window.parent === window) {
                return;
            }

            // Find the main content element (try common patterns for different libraries)
            var content = document.querySelector(
                '.bk-root, .vega-embed, .plotly, .chart-container, #container, .lp-plot, svg, canvas'
            ) || document.body.firstElementChild || document.body;

            // Get actual rendered size
            var rect = content.getBoundingClientRect();
            var width = Math.max(rect.width, content.scrollWidth || 0, document.body.scrollWidth || 0);
            var height = Math.max(rect.height, content.scrollHeight || 0, document.body.scrollHeight || 0);

            // Skip content that has not rendered yet. This has to be checked
            // before the padding is added; afterwards both values are always positive.
            if (width <= 0 || height <= 0) {
                return;
            }

            // Add padding to account for action buttons, toolbars, and other UI elements
            var padding = 40;
            width += padding;
            height += padding;

            // Send to parent with specific origin for security
            window.parent.postMessage({
                type: 'pyplots-size',
                width: Math.ceil(width),
                height: Math.ceil(height)
            }, 'https://pyplots.ai');
        } catch (e) {
            // Silently fail if postMessage is blocked
        }
    }

    function scheduleReports() {
        REPORT_DELAYS_MS.forEach(function(delay) {
            setTimeout(reportSize, delay);
        });
    }

    // Report after load and after delays (for async rendering libraries)
    if (document.readyState === 'complete') {
        scheduleReports();
    } else {
        window.addEventListener('load', scheduleReports);
    }
})();
</script>
"""
| 62 | + |
| 63 | +# Allowed GCS bucket for security |
| 64 | +ALLOWED_HOST = "storage.googleapis.com" |
| 65 | +ALLOWED_BUCKET = "pyplots-images" |
| 66 | + |
| 67 | + |
| 68 | +def build_safe_gcs_url(url: str) -> str | None: |
| 69 | + """ |
| 70 | + Validate URL and return a reconstructed safe GCS URL. |
| 71 | +
|
| 72 | + This prevents SSRF by constructing the URL from hardcoded values |
| 73 | + instead of passing user input directly. |
| 74 | +
|
| 75 | + Args: |
| 76 | + url: User-provided URL to validate |
| 77 | +
|
| 78 | + Returns: |
| 79 | + Reconstructed safe URL or None if validation fails |
| 80 | + """ |
| 81 | + try: |
| 82 | + parsed = urlparse(url) |
| 83 | + # Must be HTTPS |
| 84 | + if parsed.scheme != "https": |
| 85 | + return None |
| 86 | + # Must be exact host (no subdomains) |
| 87 | + if parsed.netloc != ALLOWED_HOST: |
| 88 | + return None |
| 89 | + # Path must start with bucket name |
| 90 | + path_parts = parsed.path.strip("/").split("/") |
| 91 | + if len(path_parts) < 2: |
| 92 | + return None |
| 93 | + if path_parts[0] != ALLOWED_BUCKET: |
| 94 | + return None |
| 95 | + # Check for path traversal attempts |
| 96 | + if ".." in parsed.path: |
| 97 | + return None |
| 98 | + # Validate path contains only safe characters (alphanumeric, hyphens, underscores, dots, slashes) |
| 99 | + safe_path = parsed.path.strip("/") |
| 100 | + if not all(c.isalnum() or c in "-_./+" for c in safe_path): |
| 101 | + return None |
| 102 | + # Reconstruct URL from hardcoded values to prevent SSRF |
| 103 | + # This breaks the taint flow by not using the original URL |
| 104 | + return f"https://{ALLOWED_HOST}/{safe_path}" |
| 105 | + except Exception: |
| 106 | + return None |
| 107 | + |
| 108 | + |
def _inject_size_reporter(html_content: str) -> str:
    """Return html_content with SIZE_REPORTER_SCRIPT inserted exactly once.

    The script goes before the last "</body>", or before the last "</html>"
    if there is no "</body>", or at the very end if neither tag is present.
    Only the last occurrence is used so that a closing tag appearing earlier
    in the document (for example inside embedded script or string data) does
    not receive its own copy of the script.
    """
    for closing_tag in ("</body>", "</html>"):
        index = html_content.rfind(closing_tag)
        if index != -1:
            return f"{html_content[:index]}{SIZE_REPORTER_SCRIPT}{html_content[index:]}"
    # Fallback: append to end
    return html_content + SIZE_REPORTER_SCRIPT


@router.get("/proxy/html", response_class=HTMLResponse)
async def proxy_html(url: str):
    """
    Proxy an HTML file and inject size reporting script.

    This endpoint fetches HTML from GCS, injects a script that reports
    the content's actual dimensions via postMessage, and returns the
    modified HTML. This allows the frontend to dynamically scale the
    iframe based on actual content size.

    Args:
        url: The GCS URL to fetch (must be from allowed bucket)

    Returns:
        Modified HTML with size reporting script injected

    Raises:
        HTTPException: 400 if the URL fails validation, the upstream status
            code if storage answers with an error status, or 502 if the
            request to storage itself fails.
    """
    # Security: Validate and reconstruct URL to prevent SSRF
    safe_url = build_safe_gcs_url(url)
    if safe_url is None:
        raise HTTPException(status_code=400, detail=f"Only URLs from {ALLOWED_HOST}/{ALLOWED_BUCKET} are allowed")

    # Fetch the HTML with shorter timeout
    async with httpx.AsyncClient(timeout=10.0) as client:
        try:
            response = await client.get(safe_url)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            raise HTTPException(status_code=e.response.status_code, detail="Failed to fetch HTML") from e
        except httpx.RequestError as e:
            raise HTTPException(status_code=502, detail="Failed to connect to storage") from e

    # Inject the size reporter script once, before the closing tag
    html_content = _inject_size_reporter(response.text)

    return HTMLResponse(content=html_content)
0 commit comments