diff --git a/docs/har.md b/docs/har.md index b0ea088..1426f74 100644 --- a/docs/har.md +++ b/docs/har.md @@ -55,6 +55,23 @@ Archive: datasette-io.har.zip You can record multiple pages to a single HTTP Archive using the {ref}`shot-scraper multi --har option`. +## Extracting resources from HAR files + +Use the `--extract` or `-x` option to automatically extract all resources from the HAR file into a directory: + +```bash +shot-scraper har https://datasette.io/ --extract +``` +This will create both `datasette-io.har` and a `datasette-io/` directory containing all resources with meaningful filenames derived from their URLs. + +The extracted files use extensions based on their content-type. For example, a request to `/api/data` that returns JSON will be saved with a `.json` extension. + +You can combine this with `--zip`: +```bash +shot-scraper har https://datasette.io/ --extract --zip +``` +This creates `datasette-io.har.zip` and extracts resources to the `datasette-io/` directory. + ## `shot-scraper har --help` Full `--help` for this command: @@ -87,8 +104,11 @@ Usage: shot-scraper har [OPTIONS] URL Use --zip to save as a .har.zip file instead, or specify a filename ending in .har.zip + Use --extract / -x to also extract all resources from the HAR into a directory + Options: -z, --zip Save as a .har.zip file + -x, --extract Extract resources from the HAR file into a directory -a, --auth FILENAME Path to JSON authentication context file -o, --output FILE HAR filename --wait INTEGER Wait this many milliseconds before taking the diff --git a/shot_scraper/cli.py b/shot_scraper/cli.py index c7dca8a..2a2ee0b 100644 --- a/shot_scraper/cli.py +++ b/shot_scraper/cli.py @@ -1,3 +1,4 @@ +import base64 import secrets import subprocess import sys @@ -6,6 +7,7 @@ import json import os import pathlib +import zipfile from runpy import run_module from click_default_group import DefaultGroup import yaml @@ -13,7 +15,12 @@ from playwright.sync_api import sync_playwright, Error, 
def _extract_dir_for_har(har_path):
    """Return the directory that resources from ``har_path`` are extracted into.

    ``foo.har`` and ``foo.har.zip`` both map to a sibling directory ``foo``.
    """
    name = har_path.name
    # Strip only a trailing suffix: str.replace() would also remove a ".har"
    # that appears earlier in the name (e.g. "my.harness.har" -> "myness").
    for suffix in (".har.zip", ".har"):
        if name.endswith(suffix) and len(name) > len(suffix):
            return har_path.with_name(name[: -len(suffix)])
    # No recognised suffix (e.g. "-o trace.json"). The directory must not share
    # the HAR file's own path, otherwise mkdir() fails with FileExistsError.
    return har_path.with_name(name + "-extracted")


def _har_entries(har_data):
    """Return the list of entries from parsed HAR JSON, tolerating missing keys."""
    return (har_data.get("log") or {}).get("entries") or []


def _extract_har_resources(har_path):
    """Extract the response bodies recorded in a HAR file into a directory.

    ``har_path`` may be a plain ``.har`` JSON file or a zip archive holding
    ``har.har`` plus separately stored response bodies. Files are written to
    the directory returned by ``_extract_dir_for_har`` and the location is
    reported on stderr.
    """
    har_path = pathlib.Path(har_path)
    extract_dir = _extract_dir_for_har(har_path)
    extract_dir.mkdir(parents=True, exist_ok=True)

    # Filenames handed out so far, so that two entries never share a name.
    existing_files = set()
    file_exists = existing_files.__contains__

    if zipfile.is_zipfile(har_path):
        # Keep the archive open while extracting: entries may point at
        # members of the zip through their "_file" field.
        with zipfile.ZipFile(har_path) as zf:
            with zf.open("har.har") as har_file:
                entries = _har_entries(json.load(har_file))
            for entry in entries:
                _extract_har_entry(entry, extract_dir, existing_files, file_exists, zf)
    else:
        # HAR is JSON text; read it as UTF-8 rather than the locale encoding.
        with open(har_path, encoding="utf-8") as har_file:
            entries = _har_entries(json.load(har_file))
        for entry in entries:
            _extract_har_entry(entry, extract_dir, existing_files, file_exists, None)

    click.echo(f"Extracted resources to: {extract_dir}", err=True)


def _extract_har_entry(entry, extract_dir, existing_files, file_exists_fn, zip_file):
    """Write the response body of one HAR entry into ``extract_dir``.

    The body is read from the zip member named by ``content._file`` when
    ``zip_file`` is given, otherwise from ``content.text`` (base64-decoded
    when ``content.encoding`` says so). Entries without a URL, without a
    body, or with undecodable base64 are skipped. The chosen filename is
    added to ``existing_files``.
    """
    request = entry.get("request") or {}
    response = entry.get("response") or {}
    content = response.get("content") or {}

    url = request.get("url", "")
    if not url:
        return

    # Prefer the Content-Type response header ...
    content_type = next(
        (
            header.get("value", "")
            for header in response.get("headers", [])
            if header.get("name", "").lower() == "content-type"
        ),
        None,
    )
    # ... and fall back to the MIME type recorded on the content object.
    if not content_type:
        content_type = content.get("mimeType") or None

    file_ref = content.get("_file", "")
    text = content.get("text", "")
    data = None

    if file_ref and zip_file is not None:
        # Body is stored as a separate member of the archive.
        try:
            data = zip_file.read(file_ref)
        except KeyError:
            # Referenced member is missing from the archive; skip the entry.
            data = None
    elif text:
        if content.get("encoding", "") == "base64":
            try:
                data = base64.b64decode(text)
            except ValueError:
                # binascii.Error is a ValueError subclass: malformed base64.
                return
        else:
            data = text.encode("utf-8")

    if not data:
        return

    filename = filename_for_har_entry(url, content_type, file_exists=file_exists_fn)
    existing_files.add(filename)
    (extract_dir / filename).write_bytes(data)
disallowed_re = re.compile("[^a-zA-Z0-9_-]")

# Map content-type to file extension
CONTENT_TYPE_EXTENSIONS = {
    "text/html": "html",
    "text/css": "css",
    "application/javascript": "js",
    "text/javascript": "js",
    "application/json": "json",
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/gif": "gif",
    "image/webp": "webp",
    "image/svg+xml": "svg",
    "application/pdf": "pdf",
    "text/plain": "txt",
    "application/xml": "xml",
    "text/xml": "xml",
    "font/woff2": "woff2",
    "font/woff": "woff",
    "application/font-woff": "woff",
}

# Map file extension to expected content-type prefix
EXTENSION_CONTENT_TYPES = {
    "html": "text/html",
    "htm": "text/html",
    "css": "text/css",
    "js": "application/javascript",
    "json": "application/json",
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "gif": "image/gif",
    "webp": "image/webp",
    "svg": "image/svg+xml",
    "pdf": "application/pdf",
    "txt": "text/plain",
    "xml": "application/xml",
    "woff2": "font/woff2",
    "woff": "font/woff",
}


def file_exists_never(filename):
    """Default ``file_exists`` callback: report that no filename is taken."""
    return False


def extension_for_content_type(content_type):
    """
    Return the file extension for a given content-type.

    Parameters such as ``; charset=utf-8`` are ignored and the lookup is
    case-insensitive. Returns None if the content-type is unknown or empty.
    """
    if not content_type:
        return None
    # Strip charset and other parameters
    mime_type = content_type.split(";")[0].strip().lower()
    return CONTENT_TYPE_EXTENSIONS.get(mime_type)


def filename_for_har_entry(url, content_type, file_exists=file_exists_never):
    """
    Derive a filename for a HAR entry based on its URL and content-type.

    The base name is built from the URL's host and path (the query string is
    ignored). The extension is chosen as follows:
    - If the URL extension agrees with the content-type, keep the URL's
      extension and do not repeat it in the base name
    - If the content-type is known but disagrees with (or the URL lacks) an
      extension, use the content-type's extension and keep the full path in
      the base name
    - If the content-type is unknown, fall back to the URL's extension
    - If neither provides an extension, use .bin

    ``file_exists`` is called with each candidate name; while it returns
    True a numeric suffix (``base.1.ext``, ``base.2.ext``, ...) is tried.
    """
    bits = urllib.parse.urlparse(url)
    url_path = bits.path

    # Try to get extension from URL path
    path_base, url_ext_with_dot = os.path.splitext(url_path)
    url_ext = url_ext_with_dot.lstrip(".").lower() or None
    # Something like ".b%20c" is not a usable extension and must not end up
    # verbatim in a filename; treat it as part of the path instead.
    if url_ext and disallowed_re.search(url_ext):
        url_ext = None

    # Get extension from content-type
    ct_ext = extension_for_content_type(content_type)

    # Compare canonical extensions rather than MIME strings, so aliases such
    # as text/javascript vs application/javascript (or text/xml vs
    # application/xml, jpeg vs jpg, htm vs html) still count as a match.
    canonical_url_ext = CONTENT_TYPE_EXTENSIONS.get(
        EXTENSION_CONTENT_TYPES.get(url_ext, "")
    )
    url_ext_matches_ct = bool(ct_ext) and canonical_url_ext == ct_ext

    if url_ext and (url_ext_matches_ct or not ct_ext):
        # The URL's own extension becomes the file extension, so strip it
        # from the base to avoid names like "font-ttf.ttf".
        ext = url_ext
        path_for_base = path_base
    else:
        ext = ct_ext or "bin"
        path_for_base = url_path

    base = (bits.netloc + path_for_base).replace(".", "-").replace("/", "-").rstrip("-")
    # Never produce a bare ".ext" dot-file when nothing usable is left.
    base = disallowed_re.sub("", base).lstrip("-") or "index"

    filename = f"{base}.{ext}"
    suffix = 0
    while file_exists(filename):
        suffix += 1
        filename = f"{base}.{suffix}.{ext}"
    return filename
@pytest.mark.parametrize(
    "args,expect_zip",
    (
        (["--extract"], False),
        (["-x"], False),
        (["--extract", "--zip"], True),
        (["-x", "-z"], True),
        (["--extract", "-o", "output.har"], False),
        (["-x", "-o", "output.har.zip"], True),
    ),
)
def test_har_extract(http_server, args, expect_zip):
    """Test that --extract creates a directory with HAR resources."""
    runner = CliRunner()
    # Create additional files on the server with different content types
    (http_server.base_dir / "style.css").write_text("body { color: red; }")
    (http_server.base_dir / "script.js").write_text("console.log('hello');")
    # Create an HTML file that references the CSS and JS, so both are
    # requested (and therefore recorded) when the page loads
    (http_server.base_dir / "page.html").write_text(
        """
<html>
<head>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>Hello</body>
</html>
"""
    )
    with runner.isolated_filesystem():
        here = pathlib.Path(".")
        result = runner.invoke(cli, ["har", f"{http_server.base_url}/page.html"] + args)
        assert result.exit_code == 0, result.output

        # HAR file should have been created
        if expect_zip:
            har_files = list(here.glob("*.har.zip"))
        else:
            har_files = list(here.glob("*.har"))
        assert len(har_files) == 1
        har_file = har_files[0]

        # Extract directory should have been created
        if expect_zip:
            extract_dir_name = str(har_file.name).replace(".har.zip", "")
        else:
            extract_dir_name = str(har_file.name).replace(".har", "")
        extract_dir = here / extract_dir_name
        assert extract_dir.exists(), f"Extract directory {extract_dir} should exist"
        assert extract_dir.is_dir(), f"{extract_dir} should be a directory"

        # Should contain extracted files
        extracted_files = list(extract_dir.glob("*"))
        assert len(extracted_files) >= 1, "Should have extracted at least one file"

        # Check that at least the main HTML file was extracted
        html_files = list(extract_dir.glob("*.html"))
        assert len(html_files) >= 1, "Should have extracted at least one HTML file"

        # The referenced stylesheet and script should be extracted as well
        assert list(extract_dir.glob("*.css")), f"No .css file in {extracted_files}"
        assert list(extract_dir.glob("*.js")), f"No .js file in {extracted_files}"


def test_har_extract_filenames(http_server):
    """Test that extracted files have correct names based on URLs."""
    runner = CliRunner()
    (http_server.base_dir / "api").mkdir()
    (http_server.base_dir / "api" / "data.json").write_text('{"key": "value"}')
    # Create an HTML page that loads the JSON. An iframe is used because the
    # page's load event waits for it, so the response is in the HAR by the
    # time the recording is closed.
    (http_server.base_dir / "loader.html").write_text(
        '<html><body><iframe src="/api/data.json"></iframe></body></html>'
    )
    with runner.isolated_filesystem():
        here = pathlib.Path(".")
        result = runner.invoke(
            cli,
            ["har", f"{http_server.base_url}/loader.html", "--extract", "-o", "test.har"],
        )
        assert result.exit_code == 0, result.output

        extract_dir = here / "test"
        assert extract_dir.exists()

        extracted_files = list(extract_dir.glob("*"))
        assert len(extracted_files) >= 1
        # The /api/data.json file should be extracted with derived name
        file_names = [f.name for f in extracted_files]
        assert any("api-data" in name for name in file_names), f"Expected api-data in {file_names}"


def test_har_extract_content_type_extension(http_server):
    """Test that extracted files have correct extension based on content-type."""
    runner = CliRunner()
    # Create an HTML file that will be served with text/html content-type
    (http_server.base_dir / "test-page.html").write_text(
        "<html><body>Test page</body></html>"
    )
    with runner.isolated_filesystem():
        here = pathlib.Path(".")
        result = runner.invoke(
            cli,
            ["har", f"{http_server.base_url}/test-page.html", "--extract", "-o", "test.har"],
        )
        assert result.exit_code == 0, result.output

        extract_dir = here / "test"
        assert extract_dir.exists()

        # The file should have .html extension based on content-type text/html
        html_files = list(extract_dir.glob("*.html"))
        assert len(html_files) >= 1, f"Should have .html file, got: {list(extract_dir.glob('*'))}"
@pytest.mark.parametrize(
    "content_type,expected",
    [
        # Known MIME types, with and without parameters
        ("text/html", "html"),
        ("text/html; charset=utf-8", "html"),
        ("text/css", "css"),
        ("application/javascript", "js"),
        ("text/javascript", "js"),
        ("application/json", "json"),
        ("image/png", "png"),
        ("image/jpeg", "jpg"),
        ("image/gif", "gif"),
        ("image/webp", "webp"),
        ("image/svg+xml", "svg"),
        ("application/pdf", "pdf"),
        ("text/plain", "txt"),
        ("application/xml", "xml"),
        ("text/xml", "xml"),
        ("font/woff2", "woff2"),
        ("font/woff", "woff"),
        ("application/font-woff", "woff"),
        # Unknown or missing content-types map to no extension
        ("application/octet-stream", None),
        ("", None),
        (None, None),
    ],
)
def test_extension_for_content_type(content_type, expected):
    """Each content-type resolves to the expected extension (or None)."""
    actual = extension_for_content_type(content_type)
    assert actual == expected


@pytest.mark.parametrize(
    "url,content_type,existing_files,expected",
    [
        # URL has extension that matches content-type
        ("https://example.com/style.css", "text/css", [], "example-com-style.css"),
        # URL has extension that matches content-type (with charset)
        (
            "https://example.com/page.html",
            "text/html; charset=utf-8",
            [],
            "example-com-page.html",
        ),
        # URL has no extension, use content-type
        (
            "https://example.com/api/data",
            "application/json",
            [],
            "example-com-api-data.json",
        ),
        # URL has no extension and no content-type, use .bin
        ("https://example.com/api/data", None, [], "example-com-api-data.bin"),
        # URL has wrong extension, use content-type
        ("https://example.com/image.php", "image/png", [], "example-com-image-php.png"),
        # Handle duplicate files
        (
            "https://example.com/style.css",
            "text/css",
            ["example-com-style.css"],
            "example-com-style.1.css",
        ),
        # Complex URL path
        (
            "https://example.com/assets/v1/icons/logo.svg",
            "image/svg+xml",
            [],
            "example-com-assets-v1-icons-logo.svg",
        ),
        # Query string should be stripped, and matching extension is not duplicated
        ("https://example.com/image.png?v=123", "image/png", [], "example-com-image.png"),
    ],
)
def test_filename_for_har_entry(url, content_type, existing_files, expected):
    """Filenames are derived from URL and content-type, avoiding taken names."""

    def already_taken(candidate):
        return candidate in existing_files

    actual = filename_for_har_entry(url, content_type, file_exists=already_taken)
    assert actual == expected