diff --git a/abx_plugins/plugins/wgetlua/__init__.py b/abx_plugins/plugins/wgetlua/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/abx_plugins/plugins/wgetlua/config.json b/abx_plugins/plugins/wgetlua/config.json new file mode 100644 index 0000000..a86fe05 --- /dev/null +++ b/abx_plugins/plugins/wgetlua/config.json @@ -0,0 +1,120 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "wgetlua", + "description": "Archive pages with wget-at (Archive Team wget-lua) for better WARC compliance, ready to feed into archive.org.", + "type": "object", + "additionalProperties": false, + "required_plugins": [], + "required_binaries": [ + { + "name": "{WGETLUA_BINARY}", + "binproviders": "env,brew,custom", + "min_version": null, + "overrides": { + "brew": { + "install_args": [ + "wget-at" + ] + }, + "custom": { + "install": "apt-get update -qq && apt-get install -y -qq autoconf automake autoconf-archive autopoint libtool pkg-config gperf flex gettext libgnutls28-dev liblua5.1-0-dev zlib1g-dev libpsl-dev libpcre2-dev libbrotli-dev >/dev/null 2>&1; TMPDIR=$(mktemp -d) && git clone --depth=1 https://github.com/ArchiveTeam/wget-lua.git $TMPDIR/wget-lua && cd $TMPDIR/wget-lua && ./bootstrap >/dev/null 2>&1 && ./configure --with-ssl=gnutls >/dev/null 2>&1 && make -j$(nproc) >/dev/null 2>&1 && cp src/wget /usr/local/bin/wget-at && rm -rf $TMPDIR" + } + } + } + ], + "output_mimetypes": [ + "text/html", + "application/warc", + "application/gzip", + "image/", + "text/css", + "application/javascript", + "font/", + "audio/", + "video/" + ], + "properties": { + "WGETLUA_ENABLED": { + "type": "boolean", + "default": true, + "x-aliases": [ + "SAVE_WGETLUA", + "USE_WGETLUA" + ], + "description": "Enable wget-at (Archive Team wget-lua) archiving" + }, + "WGETLUA_WARC_ENABLED": { + "type": "boolean", + "default": true, + "x-aliases": [ + "WGETLUA_SAVE_WARC" + ], + "description": "Save WARC archive file (default behavior for wget-at)" + }, + 
"WGETLUA_BINARY": { + "type": "string", + "default": "wget-at", + "description": "Path to wget-at binary" + }, + "WGETLUA_TIMEOUT": { + "type": "integer", + "default": 60, + "minimum": 5, + "x-fallback": "TIMEOUT", + "description": "Timeout for wget-at in seconds" + }, + "WGETLUA_USER_AGENT": { + "type": "string", + "default": "", + "x-fallback": "USER_AGENT", + "description": "User agent string for wget-at" + }, + "WGETLUA_COOKIES_FILE": { + "type": "string", + "default": "", + "x-fallback": "COOKIES_FILE", + "description": "Path to cookies file" + }, + "WGETLUA_CHECK_SSL_VALIDITY": { + "type": "boolean", + "default": true, + "x-fallback": "CHECK_SSL_VALIDITY", + "description": "Whether to verify SSL certificates" + }, + "WGETLUA_ARGS": { + "type": "array", + "items": { + "type": "string" + }, + "default": [ + "--no-verbose", + "--adjust-extension", + "--convert-links", + "--force-directories", + "--backup-converted", + "--span-hosts", + "--no-parent", + "--page-requisites", + "--restrict-file-names=windows", + "--tries=2", + "-e", + "robots=off" + ], + "x-aliases": [ + "WGETLUA_DEFAULT_ARGS" + ], + "description": "Default wget-at arguments" + }, + "WGETLUA_ARGS_EXTRA": { + "type": "array", + "items": { + "type": "string" + }, + "default": [], + "x-aliases": [ + "WGETLUA_EXTRA_ARGS" + ], + "description": "Extra arguments to append to wget-at command" + } + } +} diff --git a/abx_plugins/plugins/wgetlua/on_Snapshot__07_wgetlua.finite.bg.py b/abx_plugins/plugins/wgetlua/on_Snapshot__07_wgetlua.finite.bg.py new file mode 100755 index 0000000..4bf342a --- /dev/null +++ b/abx_plugins/plugins/wgetlua/on_Snapshot__07_wgetlua.finite.bg.py @@ -0,0 +1,205 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "pydantic-settings", +# "jambo", +# "rich-click", +# "abx-plugins", +# ] +# /// +# +# Archive a URL using wget-at (Archive Team wget-lua) for better WARC compliance. 
#
# Usage: on_Snapshot__07_wgetlua.finite.bg.py --url=<url>
# Output: Downloads files to $PWD
#
# Environment variables:
#   WGETLUA_ENABLED: Enable wget-at archiving (default: True)
#   WGETLUA_WARC_ENABLED: Save WARC file (default: True)
#   WGETLUA_BINARY: Path to wget-at binary (default: wget-at)
#   WGETLUA_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
#   WGETLUA_USER_AGENT: User agent string (x-fallback: USER_AGENT)
#   WGETLUA_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
#   WGETLUA_CHECK_SSL_VALIDITY: Whether to check SSL certificates (x-fallback: CHECK_SSL_VALIDITY)
#   WGETLUA_ARGS: Default wget-at arguments (JSON array)
#   WGETLUA_ARGS_EXTRA: Extra arguments to append (JSON array)
#

import os
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

from abx_plugins.plugins.base.utils import (
    emit_archive_result_record,
    has_staticfile_output,
    load_config,
)

import rich_click as click


# Extractor metadata
PLUGIN_NAME = "wgetlua"
BIN_NAME = "wget-at"
BIN_PROVIDERS = "env,brew,custom"
PLUGIN_DIR = Path(__file__).resolve().parent.name
CONFIG = load_config()
SNAP_DIR = Path(CONFIG.SNAP_DIR or ".").resolve()
OUTPUT_DIR = SNAP_DIR / PLUGIN_DIR
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Everything below works with paths relative to the plugin output directory.
os.chdir(OUTPUT_DIR)

# User agent sent when WGETLUA_USER_AGENT (and its fallback) resolve to empty.
DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; ArchiveBox/1.0)"
# Sentinel returned in place of an output path when wget-at exits 0 but saved
# nothing; main() maps it to the "noresults" status.
NO_FILES_DOWNLOADED = "No files downloaded"
# Matches .html, .htm, .shtml and .shtm in any letter case.
HTML_SUFFIX_RE = re.compile(r"\.[Ss]?[Hh][Tt][Mm][Ll]?$")


def rel_output(path_str: str | None) -> str | None:
    """Express an output path relative to SNAP_DIR when possible.

    Args:
        path_str: Path (relative to the current directory or absolute), or a
            non-path message / empty value.

    Returns:
        ``path_str`` unchanged when it is falsy or does not exist on disk;
        the path relative to SNAP_DIR when it lives inside it; otherwise the
        bare file name (or ``path_str`` if the name is empty).
    """
    if not path_str:
        return path_str
    path = Path(path_str)
    resolved = path.resolve()
    if not resolved.exists():
        # Not a real file (e.g. a status message): pass it through untouched.
        return path_str
    try:
        return str(resolved.relative_to(SNAP_DIR))
    except ValueError:
        # relative_to() raises ValueError when the file is outside SNAP_DIR.
        return path.name or path_str


def _build_wgetlua_cmd(url: str, binary: str, config) -> list[str]:
    """Assemble the wget-at argv from config (later options take precedence)."""
    cmd = [
        binary,
        *config.WGETLUA_ARGS,
        f"--timeout={config.WGETLUA_TIMEOUT}",
        f"--user-agent={config.WGETLUA_USER_AGENT or DEFAULT_USER_AGENT}",
    ]

    if config.WGETLUA_WARC_ENABLED:
        # WARC files are named after the current UTC unix timestamp; wget
        # appends its own extension to the --warc-file prefix.
        warc_dir = Path("warc")
        warc_dir.mkdir(exist_ok=True)
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
        cmd.append(f"--warc-file={warc_path}")
    else:
        # NOTE(review): timestamping is only requested without WARC output,
        # presumably because wget cannot combine the two - confirm.
        cmd.append("--timestamping")

    cookies_file = config.WGETLUA_COOKIES_FILE
    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(["--load-cookies", cookies_file])

    if not config.WGETLUA_CHECK_SSL_VALIDITY:
        cmd.extend(["--no-check-certificate", "--no-hsts"])

    cmd.extend(config.WGETLUA_ARGS_EXTRA or [])
    cmd.append(url)
    return cmd


def _find_downloaded_files() -> list[Path]:
    """List saved files under the current directory, excluding WARC output.

    The result is sorted so the file reported as the main output does not
    depend on filesystem enumeration order.
    """
    return sorted(
        f
        for f in Path(".").rglob("*")
        if f.is_file()
        and f.name != ".gitkeep"
        # Compare path components instead of a "warc/" string prefix so the
        # check does not depend on the platform's path separator.
        and not (len(f.parts) > 1 and f.parts[0] == "warc")
    )


def _stderr_tail(stderr: str | None, max_lines: int = 3) -> str:
    """Return the last non-empty stderr lines joined into a one-line summary."""
    lines = [line.strip() for line in (stderr or "").splitlines() if line.strip()]
    return " | ".join(lines[-max_lines:])


def save_wgetlua(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using wget-at (Archive Team wget-lua).

    Args:
        url: URL to download.
        binary: Name or path of the wget-at executable.

    Returns: (success, output_path, error_message)
        - ``(True, <path>, "")`` when at least one file was saved; the path is
          the first HTML-looking file, else the first saved file. A non-zero
          wget-at exit code is tolerated as long as files were saved.
        - ``(True, "No files downloaded", "")`` when wget-at exited 0 but
          saved nothing.
        - ``(False, None, <message>)`` on failure, timeout, or any exception
          raised while running the binary (e.g. binary not found).
    """
    # Load config from config.json (auto-resolves x-aliases and x-fallback from env)
    config = load_config()
    timeout = config.WGETLUA_TIMEOUT
    cmd = _build_wgetlua_cmd(url, binary, config)

    try:
        print("saving page with wget-at (Archive Team wget-lua)...")
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout * 2,  # Allow extra time for large downloads
        )

        downloaded_files = _find_downloaded_files()

        if not downloaded_files:
            if result.returncode != 0:
                message = f"wget-at failed (exit={result.returncode})"
                # Surface wget-at's own diagnostics instead of discarding them.
                details = _stderr_tail(result.stderr)
                if details:
                    message = f"{message}: {details}"
                return False, None, message
            return True, NO_FILES_DOWNLOADED, ""

        # Prefer an HTML document as the main output, else any saved file.
        html_files = [f for f in downloaded_files if HTML_SUFFIX_RE.search(str(f))]
        main_file = html_files[0] if html_files else downloaded_files[0]
        return True, str(main_file), ""

    except subprocess.TimeoutExpired:
        return False, None, f"Timed out after {timeout * 2} seconds"
    except Exception as e:
        # Top-level boundary for the subprocess call: report, don't crash.
        return False, None, f"{type(e).__name__}: {e}"


@click.command(
    context_settings={"ignore_unknown_options": True, "allow_extra_args": True},
)
@click.option("--url", required=True, help="URL to archive")
def main(url: str):
    """Archive a URL using wget-at (Archive Team wget-lua)."""
    # sys.exit() raises SystemExit, which is not an Exception subclass, so the
    # handler below only sees genuine errors.
    try:
        config = load_config()

        # Check if wgetlua is enabled
        if not config.WGETLUA_ENABLED:
            print("Skipping wgetlua (WGETLUA_ENABLED=False)", file=sys.stderr)
            emit_archive_result_record("skipped", "WGETLUA_ENABLED=False")
            sys.exit(0)

        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(
                "Skipping wgetlua - staticfile extractor already downloaded this",
                file=sys.stderr,
            )
            emit_archive_result_record("noresults", "staticfile already handled")
            sys.exit(0)

        # Run extraction with the configured binary
        success, output, error = save_wgetlua(url, config.WGETLUA_BINARY)

        if not success:
            print(f"ERROR: {error}", file=sys.stderr)
            emit_archive_result_record("failed", error or "")
            sys.exit(1)

        status = "noresults" if output == NO_FILES_DOWNLOADED else "succeeded"
        emit_archive_result_record(status, rel_output(output) or "")
        sys.exit(0)

    except Exception as e:
        error = f"{type(e).__name__}: {e}"
        print(f"ERROR: {error}", file=sys.stderr)
        emit_archive_result_record("failed", error)
        sys.exit(1)


if __name__ == "__main__":
    main()
+ +
"""
Integration tests for wgetlua plugin (Archive Team wget-lua / wget-at)

Tests verify:
1. Validate hook checks for wget-at binary
2. Verify deps with abx-pkg
3. Config options work (WGETLUA_ENABLED, WGETLUA_SAVE_WARC, etc.)
4. Extraction works against real https://example.com
5. Output files contain actual page content
6. WARC files contain correct content
7. Skip cases work (WGETLUA_ENABLED=False, staticfile present)
8. Failure cases handled (missing wget-at binary)
"""

import gzip
import json
import os
import shutil
import subprocess
import sys
import tempfile
import uuid
import zlib
from pathlib import Path

import pytest

from abx_plugins.plugins.base.test_utils import parse_jsonl_output


PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
WGETLUA_HOOK = next(PLUGIN_DIR.glob("on_Snapshot__*_wgetlua.*"))
BREW_HOOK = next((PLUGINS_ROOT / "brew").glob("on_BinaryRequest__*_brew.py"), None)
CUSTOM_HOOK = next(
    (PLUGINS_ROOT / "custom").glob("on_BinaryRequest__*_custom.py"), None
)
TEST_URL = "https://example.com"
PLUGIN_CONFIG = json.loads((PLUGIN_DIR / "config.json").read_text())


def _provider_runtime_unavailable(proc: subprocess.CompletedProcess[str]) -> bool:
    """Return True when hook output contains one of the known provider-runtime error markers."""
    combined = f"{proc.stdout}\n{proc.stderr}"
    return (
        "BinProviderOverrides" in combined
        or "PydanticUndefinedAnnotation" in combined
        or "not fully defined" in combined
    )


def _run_install_hook(cmd: list[str]) -> str | None:
    """Run a provider install hook; return the wget-at path if it is now on PATH."""
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
    if result.returncode != 0:
        return None
    return shutil.which("wget-at")


def _ensure_wget_at_installed() -> str | None:
    """Ensure wget-at is installed, return its path or None."""
    # Check if already on PATH
    path = shutil.which("wget-at")
    if path:
        return path

    # Try installing via brew
    if shutil.which("brew") and BREW_HOOK and BREW_HOOK.exists():
        path = _run_install_hook(
            [
                str(BREW_HOOK),
                "--binary-id", str(uuid.uuid4()),
                "--machine-id", str(uuid.uuid4()),
                "--plugin-name", "wgetlua",
                "--hook-name", "required_binaries",
                "--name", "wget-at",
                "--binproviders", "brew",
                "--overrides", json.dumps({"brew": {"install_args": ["wget-at"]}}),
            ]
        )
        if path:
            return path

    # Try installing via custom provider (build from source)
    if CUSTOM_HOOK and CUSTOM_HOOK.exists():
        overrides = PLUGIN_CONFIG["required_binaries"][0].get("overrides", {})
        path = _run_install_hook(
            [
                str(CUSTOM_HOOK),
                "--name", "wget-at",
                "--binproviders", "custom",
                "--overrides", json.dumps(overrides),
            ]
        )
        if path:
            return path

    return None


def _run_hook(
    cwd: Path,
    env: dict[str, str],
    *,
    timeout: int,
    via_python: bool = False,
) -> subprocess.CompletedProcess[str]:
    """Run the wgetlua hook against TEST_URL and capture its output.

    Args:
        cwd: Working directory for the hook process.
        env: Complete environment for the hook process.
        timeout: Seconds before the run is aborted.
        via_python: Launch the hook with the current interpreter instead of
            executing the script directly (needed when PATH is unusable).
    """
    cmd = [str(WGETLUA_HOOK), "--url", TEST_URL]
    if via_python:
        cmd.insert(0, sys.executable)
    return subprocess.run(
        cmd,
        cwd=cwd,
        capture_output=True,
        text=True,
        env=env,
        timeout=timeout,
    )


def _read_warc_text(warc_file: Path) -> str:
    """Read a WARC file as text, transparently decompressing ``.gz`` files."""
    if not warc_file.name.endswith(".gz"):
        return warc_file.read_text(errors="ignore")
    try:
        # Context manager so the gzip handle is always closed.
        with gzip.open(warc_file, "rt", errors="ignore") as fh:
            return fh.read()
    except (OSError, EOFError, zlib.error):
        # Not valid gzip after all: best-effort fall back to the raw bytes.
        return warc_file.read_bytes().decode(errors="ignore")


def test_hook_script_exists():
    """Verify hook script exists."""
    assert WGETLUA_HOOK.exists(), f"Hook script not found: {WGETLUA_HOOK}"


def test_wgetlua_declares_env_brew_custom_providers():
    """required_binaries should declare wget-at via env,brew,custom with overrides."""
    required_binaries = PLUGIN_CONFIG["required_binaries"]
    binary_record = next(
        (
            record
            for record in required_binaries
            if record.get("name") == "{WGETLUA_BINARY}"
        ),
        None,
    )
    assert binary_record is not None, (
        f"Expected wgetlua required_binaries entry: {required_binaries}"
    )
    assert binary_record["binproviders"] == "env,brew,custom"

    # Verify overrides are defined for brew and custom
    overrides = binary_record.get("overrides", {})
    assert "brew" in overrides, "Should have brew overrides"
    assert "custom" in overrides, "Should have custom overrides"
    assert overrides["brew"]["install_args"] == ["wget-at"], (
        "brew should install wget-at"
    )
    assert "install" in overrides["custom"], (
        "custom should have an install command"
    )


def test_can_install_wget_at():
    """Test that wget-at can be installed via provider hooks."""
    path = _ensure_wget_at_installed()
    assert path is not None, (
        "wget-at could not be installed via any provider (env, brew, custom)"
    )
    assert Path(path).exists(), f"wget-at binary should exist at {path}"

    # Verify it runs
    result = subprocess.run(
        [path, "--version"],
        capture_output=True,
        text=True,
        timeout=10,
    )
    assert result.returncode == 0, f"wget-at --version failed: {result.stderr}"
    assert "wget" in result.stdout.lower() or "gnu" in result.stdout.lower(), (
        f"wget-at --version should identify as wget: {result.stdout}"
    )


def test_reports_missing_dependency_when_not_installed():
    """Test that script reports failure when wget-at is not found."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run with empty PATH so binary won't be found
        env = {"PATH": "/nonexistent", "HOME": str(tmpdir)}

        # PATH is unusable, so launch through the current interpreter; the
        # timeout keeps a misbehaving hook from hanging the test run.
        result = _run_hook(tmpdir, env, timeout=60, via_python=True)

        # Missing binary is a hard dependency failure.
        assert result.returncode == 1, "Should exit 1 when dependency missing"

        # Should emit failed JSONL describing the missing dependency.
        result_json = parse_jsonl_output(result.stdout)
        assert result_json, "Expected failed JSONL output"
        assert result_json["status"] == "failed", result_json
        assert "wget" in result_json["output_str"].lower(), result_json

        # Should log error to stderr
        assert (
            "wget" in result.stderr.lower() or "error" in result.stderr.lower()
        ), "Should report error in stderr"


def test_archives_example_com():
    """Test full workflow: install wget-at then archive https://example.com with content verification."""

    wget_at_path = _ensure_wget_at_installed()
    if not wget_at_path:
        pytest.fail(
            "wget-at could not be installed - required for live integration test"
        )

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = os.environ.copy()
        env["SNAP_DIR"] = str(tmpdir)
        env["WGETLUA_BINARY"] = wget_at_path

        # Run wgetlua extraction against real https://example.com
        result = _run_hook(tmpdir, env, timeout=120)

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"

        # Parse clean JSONL output
        result_json = parse_jsonl_output(result.stdout)

        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json["status"] == "succeeded", f"Should succeed: {result_json}"

        # Verify files were downloaded to wgetlua output directory.
        output_root = tmpdir / "wgetlua"
        assert output_root.exists(), "wgetlua output directory was not created"

        downloaded_files = [f for f in output_root.rglob("*") if f.is_file()]
        assert downloaded_files, "No files downloaded"

        # Verify the emitted output path is relative and starts with wgetlua/
        assert result_json.get("output_str", "").startswith("wgetlua/"), result_json
        output_path = (tmpdir / result_json.get("output_str", "")).resolve()
        # Check the reported main file first, then every other saved file.
        candidate_files = [output_path] if output_path.is_file() else []
        candidate_files.extend(downloaded_files)

        main_html = next(
            (
                candidate
                for candidate in candidate_files
                if "example domain" in candidate.read_text(errors="ignore").lower()
            ),
            None,
        )

        assert main_html is not None, (
            "Could not find downloaded file containing example.com content"
        )

        # Verify page content contains REAL example.com text.
        html_content = main_html.read_text(errors="ignore")
        lowered = html_content.lower()
        assert len(html_content) > 200, (
            f"HTML content too short: {len(html_content)} bytes"
        )
        assert "example domain" in lowered, (
            "Missing 'Example Domain' in HTML"
        )
        assert (
            "this domain" in lowered
            or "illustrative examples" in lowered
        ), "Missing example.com description text"
        assert (
            "iana" in lowered
            or "more information" in lowered
        ), "Missing IANA reference"


def test_warc_output_contains_correct_content():
    """Test that WARC output from wget-at contains correct content for https://example.com."""

    wget_at_path = _ensure_wget_at_installed()
    if not wget_at_path:
        pytest.fail(
            "wget-at could not be installed - required for WARC integration test"
        )

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = os.environ.copy()
        env["SNAP_DIR"] = str(tmpdir)
        env["WGETLUA_BINARY"] = wget_at_path
        env["WGETLUA_SAVE_WARC"] = "True"

        result = _run_hook(tmpdir, env, timeout=120)

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"

        # Look for WARC files in wgetlua/warc/ subdirectory
        warc_dir = tmpdir / "wgetlua" / "warc"
        assert warc_dir.exists(), "WARC output directory was not created"

        # Path.suffix is only the final extension, so "x.warc.gz" matches ".gz".
        warc_files = [
            f
            for f in warc_dir.rglob("*")
            if f.is_file() and f.suffix in (".warc", ".gz")
        ]
        assert len(warc_files) > 0, (
            "WARC file not created when WGETLUA_SAVE_WARC=True"
        )

        # Read WARC content and verify it contains example.com data
        warc_content = "".join(_read_warc_text(f) for f in warc_files)

        assert "example.com" in warc_content.lower() or "example domain" in warc_content.lower(), (
            "WARC file should contain example.com content"
        )
        assert "WARC/1" in warc_content, (
            "WARC file should contain valid WARC headers"
        )


def test_config_wgetlua_false_skips():
    """Test that WGETLUA_ENABLED=False exits without archiving."""

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        env = os.environ.copy()
        env["WGETLUA_ENABLED"] = "False"

        result = _run_hook(tmpdir, env, timeout=30)

        # Should exit 0 when feature disabled
        assert result.returncode == 0, (
            f"Should exit 0 when feature disabled: {result.stderr}"
        )

        # Feature disabled should log the skip reason and emit skipped JSONL
        assert "Skipping" in result.stderr or "False" in result.stderr, (
            "Should log skip reason to stderr"
        )

        result_json = parse_jsonl_output(result.stdout)
        assert result_json, "Expected skipped JSONL output"
        assert result_json["status"] == "skipped", result_json
        assert result_json["output_str"] == "WGETLUA_ENABLED=False", result_json


def test_staticfile_present_skips():
    """Test that wgetlua skips when staticfile already downloaded."""

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = os.environ.copy()
        env["SNAP_DIR"] = str(tmpdir)

        # Create directory structure like real ArchiveBox:
        staticfile_dir = tmpdir / "staticfile"
        staticfile_dir.mkdir()
        (staticfile_dir / "stdout.log").write_text(
            '{"type":"ArchiveResult","status":"succeeded","output_str":"responses/example.com/test.json","content_type":"application/json"}\n',
        )

        wgetlua_dir = tmpdir / "wgetlua"
        wgetlua_dir.mkdir()

        result = _run_hook(wgetlua_dir, env, timeout=30)

        # Should exit 0 with a noresults JSONL
        assert result.returncode == 0, (
            "Should exit 0 when staticfile already handled the URL"
        )

        result_json = parse_jsonl_output(result.stdout)

        assert result_json, (
            "Should emit ArchiveResult JSONL when staticfile already handled the URL"
        )
        assert result_json["status"] == "noresults", (
            f"Should have status='noresults': {result_json}"
        )
        assert "staticfile" in result_json.get("output_str", "").lower(), (
            "Should mention staticfile in output_str"
        )


def test_config_timeout_honored():
    """Test that WGETLUA_TIMEOUT config is respected."""

    wget_at_path = _ensure_wget_at_installed()
    if not wget_at_path:
        pytest.skip("wget-at not available")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        env = os.environ.copy()
        env["WGETLUA_TIMEOUT"] = "5"
        env["WGETLUA_BINARY"] = wget_at_path

        result = _run_hook(tmpdir, env, timeout=30)

        # Verify it completed (success or fail, but didn't hang)
        assert result.returncode in (0, 1), "Should complete (success or fail)"


def test_config_user_agent():
    """Test that WGETLUA_USER_AGENT config is used."""

    wget_at_path = _ensure_wget_at_installed()
    if not wget_at_path:
        pytest.skip("wget-at not available")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        env = os.environ.copy()
        env["WGETLUA_USER_AGENT"] = "TestBot/1.0"
        env["WGETLUA_BINARY"] = wget_at_path

        result = _run_hook(tmpdir, env, timeout=120)

        # Only the success path is checked; a network failure is tolerated.
        if result.returncode == 0:
            result_json = parse_jsonl_output(result.stdout)
            assert result_json, "Should have ArchiveResult JSONL output"
            assert result_json["status"] == "succeeded", (
                f"Should succeed: {result_json}"
            )


if __name__ == "__main__":
    pytest.main([__file__, "-v"])