Skip to content

Commit db61762

Browse files
committed
refactor: direct in-process tool execution, security hardening
- Extract execute_recipe_endpoint() as public API for MCP bridge/server - MCP server calls recipes directly (no more httpx self-call) - MCP bridge uses execute_recipe_endpoint() instead of faking Request objects - Sanitize upload filenames with path traversal protection - Replace regex param validation with Python identifier check - Configurable host port via WEB2API_HOST_PORT env var - Add tests: path traversal, MCP special chars, keyword param rejection - Update fallback version to 0.4.0
1 parent 4e3a406 commit db61762

10 files changed

Lines changed: 301 additions & 87 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ All recipe endpoints follow the pattern: `GET /{slug}/{endpoint}?page=1&q=...`
348348
- `page` — pagination (default: 1)
349349
- `q` — query text (required when `requires_query: true`)
350350
- additional query params are passed to custom scrapers
351-
- extra query param names must match `[a-zA-Z0-9][a-zA-Z0-9_-]{0,63}` and values are capped at 512 chars
351+
- extra query param names must be valid Python identifiers (and not keywords); values are capped at 512 chars
352352

353353
### Error Codes
354354

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ services:
55
dockerfile: Dockerfile
66
restart: unless-stopped
77
ports:
8-
- "127.0.0.1:8010:8000"
8+
- "127.0.0.1:${WEB2API_HOST_PORT:-8010}:8000"
99
volumes:
1010
- ./data/recipes:/data/recipes
1111
environment:

tests/e2e/test_e2e.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import os
66
import shutil
7+
import socket
78
import subprocess
89
import time
910
import uuid
@@ -17,7 +18,6 @@
1718

1819
PROJECT_ROOT = Path(__file__).resolve().parents[2]
1920
COMPOSE_FILE = PROJECT_ROOT / "docker-compose.yml"
20-
BASE_URL = "http://127.0.0.1:8010"
2121
HEALTH_TIMEOUT_SECONDS = 180
2222
HEALTH_POLL_INTERVAL_SECONDS = 2.0
2323
LIVE_ERROR_MARKERS = (
@@ -62,6 +62,12 @@ def _is_docker_unavailable(stderr_stdout: str) -> bool:
6262
return any(marker in message for marker in DOCKER_UNAVAILABLE_MARKERS)
6363

6464

65+
def _allocate_host_port() -> int:
66+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
67+
sock.bind(("127.0.0.1", 0))
68+
return int(sock.getsockname()[1])
69+
70+
6571
def _wait_for_health(base_url: str) -> None:
6672
deadline = time.monotonic() + HEALTH_TIMEOUT_SECONDS
6773
last_error = "no response"
@@ -129,6 +135,9 @@ def dockerized_web2api() -> Iterator[str]:
129135
compose_project = f"web2api-e2e-{uuid.uuid4().hex[:8]}"
130136
env = os.environ.copy()
131137
env["COMPOSE_PROJECT_NAME"] = compose_project
138+
host_port = _allocate_host_port()
139+
env["WEB2API_HOST_PORT"] = str(host_port)
140+
base_url = f"http://127.0.0.1:{host_port}"
132141

133142
started = False
134143
try:
@@ -140,8 +149,8 @@ def dockerized_web2api() -> Iterator[str]:
140149
pytest.fail(f"docker compose up failed:\n{combined}")
141150

142151
started = True
143-
_wait_for_health(BASE_URL)
144-
yield BASE_URL
152+
_wait_for_health(base_url)
153+
yield base_url
145154
finally:
146155
if started:
147156
_run_compose(

tests/integration/test_api.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import asyncio
66
import logging
77
import subprocess
8+
import uuid
89
from datetime import UTC, datetime
910
from pathlib import Path
1011

@@ -259,6 +260,9 @@ async def fake_scrape(
259260
invalid_extra_resp = await client.get("/alpha/read?bad!param=1")
260261
assert invalid_extra_resp.status_code == 400
261262
assert invalid_extra_resp.json()["error"]["code"] == "INVALID_PARAMS"
263+
invalid_identifier_resp = await client.get("/alpha/read?model-id=1")
264+
assert invalid_identifier_resp.status_code == 400
265+
assert invalid_identifier_resp.json()["error"]["code"] == "INVALID_PARAMS"
262266

263267
# Non-existent endpoint on a recipe (404 from FastAPI)
264268
unknown_ep_resp = await client.get("/beta/search")
@@ -527,6 +531,126 @@ async def test_recipe_management_uninstall_force_for_unmanaged_local(
527531
assert "local-only" not in slugs
528532

529533

534+
@pytest.mark.asyncio
535+
async def test_mcp_bridge_preserves_special_characters_in_params(
536+
tmp_path: Path,
537+
monkeypatch: pytest.MonkeyPatch,
538+
) -> None:
539+
recipes_dir = tmp_path / "recipes"
540+
_write_recipe(
541+
recipes_dir,
542+
"alpha",
543+
endpoints={
544+
"search": {
545+
"url": "https://example.com/search?q={query}&page={page}",
546+
"requires_query": True,
547+
"params": {
548+
"tools_url": {
549+
"description": "MCP bridge URL",
550+
"required": False,
551+
},
552+
},
553+
"items": {"container": ".item", "fields": {"title": {"selector": ".title"}}},
554+
"pagination": {"type": "page_param", "param": "page"},
555+
},
556+
},
557+
)
558+
559+
captured: dict[str, object] = {}
560+
561+
async def fake_scrape(
562+
*,
563+
pool: FakePool,
564+
recipe,
565+
endpoint: str,
566+
page: int = 1,
567+
query: str | None = None,
568+
extra_params: dict[str, str] | None = None,
569+
scrape_timeout: float = 30.0,
570+
) -> ApiResponse:
571+
_ = pool, recipe, endpoint, page, scrape_timeout
572+
captured["query"] = query
573+
captured["extra_params"] = dict(extra_params or {})
574+
return _success_response(slug="alpha", endpoint="search", page=1, query=query)
575+
576+
monkeypatch.setattr("web2api.main.scrape", fake_scrape)
577+
578+
fake_pool = FakePool()
579+
app = create_app(recipes_dir=recipes_dir, pool=fake_pool)
580+
581+
async with app.router.lifespan_context(app):
582+
transport = ASGITransport(app=app)
583+
async with AsyncClient(transport=transport, base_url="http://testserver") as client:
584+
response = await client.post(
585+
"/mcp/tools/alpha__search",
586+
json={
587+
"q": "cats & dogs",
588+
"tools_url": "http://localhost:8100/mcp/tools?x=1&y=2",
589+
},
590+
)
591+
592+
assert response.status_code == 200
593+
assert captured["query"] == "cats & dogs"
594+
assert captured["extra_params"] == {
595+
"tools_url": "http://localhost:8100/mcp/tools?x=1&y=2",
596+
}
597+
598+
599+
@pytest.mark.asyncio
600+
async def test_post_upload_rejects_path_traversal_filenames(
601+
tmp_path: Path,
602+
monkeypatch: pytest.MonkeyPatch,
603+
) -> None:
604+
recipes_dir = tmp_path / "recipes"
605+
_write_recipe(recipes_dir, "alpha")
606+
607+
captured: dict[str, object] = {}
608+
escaped_name = f"web2api_escape_{uuid.uuid4().hex}.txt"
609+
escaped_path = Path("/tmp") / escaped_name
610+
if escaped_path.exists():
611+
escaped_path.unlink()
612+
613+
async def fake_scrape(
614+
*,
615+
pool: FakePool,
616+
recipe,
617+
endpoint: str,
618+
page: int = 1,
619+
query: str | None = None,
620+
extra_params: dict[str, str] | None = None,
621+
scrape_timeout: float = 30.0,
622+
) -> ApiResponse:
623+
_ = pool, recipe, endpoint, page, query, scrape_timeout
624+
captured["extra_params"] = dict(extra_params or {})
625+
return _success_response(slug="alpha", endpoint="read", page=1)
626+
627+
monkeypatch.setattr("web2api.main.scrape", fake_scrape)
628+
629+
fake_pool = FakePool()
630+
app = create_app(recipes_dir=recipes_dir, pool=fake_pool)
631+
632+
async with app.router.lifespan_context(app):
633+
transport = ASGITransport(app=app)
634+
async with AsyncClient(transport=transport, base_url="http://testserver") as client:
635+
response = await client.post(
636+
"/alpha/read",
637+
files={
638+
"files": (f"../{escaped_name}", b"uploaded-content", "text/plain"),
639+
},
640+
)
641+
642+
assert response.status_code == 200
643+
assert escaped_path.exists() is False
644+
extra_params = captured.get("extra_params")
645+
assert isinstance(extra_params, dict)
646+
file_paths = extra_params.get("file_paths", [])
647+
assert isinstance(file_paths, list)
648+
assert len(file_paths) == 1
649+
uploaded_path = Path(str(file_paths[0]))
650+
assert uploaded_path.name == escaped_name
651+
assert "/../" not in str(uploaded_path)
652+
653+
530654
@pytest.mark.asyncio
531655
async def test_check_updates_endpoint(
532656
tmp_path: Path,

tests/unit/test_config.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,12 +217,22 @@ def test_reserved_param_name_page_is_rejected() -> None:
217217

218218

219219
def test_invalid_param_name_is_rejected() -> None:
220-
"""Param names that don't match the pattern should fail."""
220+
"""Param names that are not Python identifiers should fail."""
221221
data = _valid_recipe_data()
222222
data["endpoints"]["read"]["params"] = {
223223
"bad!name": {"description": "invalid chars"},
224224
}
225-
with pytest.raises(ValidationError, match="must match pattern"):
225+
with pytest.raises(ValidationError, match="valid Python identifier"):
226+
RecipeConfig.model_validate(data)
227+
228+
229+
def test_keyword_param_name_is_rejected() -> None:
230+
"""Python keywords are not valid parameter names."""
231+
data = _valid_recipe_data()
232+
data["endpoints"]["read"]["params"] = {
233+
"class": {"description": "invalid keyword"},
234+
}
235+
with pytest.raises(ValidationError, match="valid Python identifier"):
226236
RecipeConfig.model_validate(data)
227237

228238

web2api/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
try:
88
__version__ = version("web2api")
99
except PackageNotFoundError:
10-
__version__ = "0.2.0"
10+
__version__ = "0.4.0"

web2api/config.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import keyword
56
import re
67
from collections.abc import Mapping
78
from typing import Literal
@@ -12,7 +13,6 @@
1213
PaginationType = Literal["page_param", "next_link", "offset_param"]
1314
FieldContext = Literal["self", "next_sibling", "parent"]
1415
RESERVED_RECIPE_SLUGS = {"api", "health", "docs", "openapi", "redoc"}
15-
_PARAM_NAME_PATTERN = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_-]{0,63}$")
1616
_RESERVED_PARAM_NAMES = {"q", "page"}
1717
FieldTransform = Literal[
1818
"regex_int",
@@ -24,6 +24,11 @@
2424
]
2525

2626

27+
def is_valid_param_name(name: str) -> bool:
28+
"""Return ``True`` if *name* is a valid Python identifier and not a keyword."""
29+
return name.isidentifier() and not keyword.iskeyword(name)
30+
31+
2732
class ActionConfig(BaseModel):
2833
"""Playwright action executed before extraction."""
2934

@@ -137,10 +142,10 @@ def validate_params(self) -> EndpointConfig:
137142
raise ValueError(
138143
f"param name '{name}' conflicts with reserved query parameter"
139144
)
140-
if not _PARAM_NAME_PATTERN.match(name):
145+
if not is_valid_param_name(name):
141146
raise ValueError(
142-
f"param name '{name}' must match pattern "
143-
"[a-zA-Z0-9][a-zA-Z0-9_-]{0,63}"
147+
f"param name '{name}' must be a valid Python identifier "
148+
"(letters/digits/underscore, not starting with a digit, not a keyword)"
144149
)
145150
return self
146151

0 commit comments

Comments
 (0)