Skip to content

Commit 80bf57f

Browse files
committed
Improve type annotations and align storage backends with ABC contract
- Add precise type annotations across all modules (dict -> dict[str, Any])
- Fix JSONL and Postgres backends to implement StorageBackend ABC interface
- Adopt Python 3.11+ idioms (StrEnum, datetime.UTC, collections.abc)
- Reformat long lines for consistent code style
1 parent 5d77ab7 commit 80bf57f

27 files changed

Lines changed: 473 additions & 326 deletions

File tree

src/scraperguard/alerts/models.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from dataclasses import dataclass, field
66
from datetime import UTC, datetime
7+
from typing import Any
78

89

910
def _utcnow() -> datetime:
@@ -21,9 +22,9 @@ class Alert:
2122
url: str
2223
run_id: str
2324
timestamp: datetime = field(default_factory=_utcnow)
24-
details: dict = field(default_factory=dict)
25+
details: dict[str, Any] = field(default_factory=dict)
2526

26-
def to_dict(self) -> dict:
27+
def to_dict(self) -> dict[str, Any]:
2728
"""Serialize to a plain dict for JSON transport."""
2829
return {
2930
"severity": self.severity,

src/scraperguard/alerts/slack.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import json
66
import urllib.request
7-
from typing import TYPE_CHECKING
7+
from typing import TYPE_CHECKING, Any
88

99
from scraperguard.alerts.base import AlertDispatcher
1010

@@ -28,7 +28,7 @@ def __init__(self, webhook_url: str) -> None:
2828
def name(self) -> str:
2929
return "slack"
3030

31-
def _build_payload(self, alert: Alert) -> dict:
31+
def _build_payload(self, alert: Alert) -> dict[str, Any]:
3232
emoji = _SEVERITY_EMOJI.get(alert.severity, ":grey_question:")
3333
return {
3434
"blocks": [
@@ -73,6 +73,6 @@ def send(self, alert: Alert) -> bool:
7373
method="POST",
7474
)
7575
with urllib.request.urlopen(req) as resp:
76-
return resp.status == 200
76+
return bool(resp.status == 200)
7777
except Exception:
7878
return False

src/scraperguard/alerts/webhook.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import json
66
import urllib.request
7-
from typing import TYPE_CHECKING
7+
from typing import TYPE_CHECKING, Any
88

99
from scraperguard.alerts.base import AlertDispatcher
1010

@@ -15,7 +15,7 @@
1515
class WebhookDispatcher(AlertDispatcher):
1616
"""Dispatches alerts as JSON POST requests to a configurable URL."""
1717

18-
def __init__(self, url: str, headers: dict | None = None) -> None:
18+
def __init__(self, url: str, headers: dict[str, Any] | None = None) -> None:
1919
self.url = url
2020
self.headers = headers or {}
2121

@@ -38,6 +38,6 @@ def send(self, alert: Alert) -> bool:
3838
method="POST",
3939
)
4040
with urllib.request.urlopen(req) as resp:
41-
return resp.status == 200
41+
return bool(resp.status == 200)
4242
except Exception:
4343
return False

src/scraperguard/api/app.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import logging
66
import time
7+
from typing import Any
78

89
from fastapi import FastAPI, Request
910
from fastapi.middleware.cors import CORSMiddleware
@@ -54,7 +55,7 @@ async def _unhandled_exception_handler(request: Request, exc: Exception) -> JSON
5455

5556
# Request logging middleware
5657
@app.middleware("http")
57-
async def _request_logging_middleware(request: Request, call_next):
58+
async def _request_logging_middleware(request: Request, call_next: Any) -> Any:
5859
start = time.perf_counter()
5960
response = await call_next(request)
6061
duration_ms = (time.perf_counter() - start) * 1000

src/scraperguard/api/routes.py

Lines changed: 39 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from __future__ import annotations
44

55
from dataclasses import asdict
6+
from typing import Any
67

78
from fastapi import APIRouter, Query, Request
89
from fastapi.responses import JSONResponse
@@ -16,18 +17,18 @@
1617
router = APIRouter(prefix="/api")
1718

1819

19-
def _get_storage(request: Request):
20+
def _get_storage(request: Request) -> Any:
2021
return request.app.state.storage
2122

2223

2324
@router.get("/health")
24-
async def health() -> dict:
25+
async def health() -> dict[str, str]:
2526
"""Service health check."""
2627
return {"status": "ok", "version": scraperguard.__version__}
2728

2829

2930
@router.get("/runs")
30-
async def list_runs(request: Request, limit: int = Query(default=20, ge=1)) -> dict:
31+
async def list_runs(request: Request, limit: int = Query(default=20, ge=1)) -> dict[str, Any]:
3132
"""List recent scraper runs."""
3233
storage = _get_storage(request)
3334
runs = storage.list_runs(limit=limit)
@@ -49,7 +50,7 @@ async def list_snapshots(
4950
request: Request,
5051
url: str = Query(...),
5152
limit: int = Query(default=10, ge=1),
52-
) -> dict:
53+
) -> dict[str, Any]:
5354
"""List recent snapshots for a URL (lightweight, no HTML bodies)."""
5455
storage = _get_storage(request)
5556
snapshots = storage.list_snapshots(url, limit=limit)
@@ -79,7 +80,7 @@ async def list_validation_results(
7980
request: Request,
8081
schema_name: str = Query(...),
8182
limit: int = Query(default=10, ge=1),
82-
) -> dict:
83+
) -> dict[str, Any]:
8384
"""List validation result history for a URL and schema."""
8485
storage = _get_storage(request)
8586
results = storage.list_validation_results(url, schema_name, limit=limit)
@@ -103,11 +104,13 @@ async def get_drift(
103104
content={"error": "No validation results found for this URL and schema"},
104105
)
105106
events = run_drift_analysis(latest, storage, baseline_count=baseline_count, threshold=threshold)
106-
return JSONResponse(content={
107-
"drift_events": [asdict(e) for e in events],
108-
"baseline_count": baseline_count,
109-
"threshold": threshold,
110-
})
107+
return JSONResponse(
108+
content={
109+
"drift_events": [asdict(e) for e in events],
110+
"baseline_count": baseline_count,
111+
"threshold": threshold,
112+
}
113+
)
111114

112115

113116
@router.get("/report/{run_id}")
@@ -127,7 +130,7 @@ async def get_report(
127130
if url is None:
128131
# Query snapshots associated with this run — we need to find a URL
129132
# The storage doesn't have a list-by-run method, so use the connection directly
130-
if hasattr(storage, '_conn'):
133+
if hasattr(storage, "_conn"):
131134
cursor = storage._conn.execute(
132135
"SELECT url FROM snapshots WHERE run_id = ? LIMIT 1",
133136
(run_id,),
@@ -145,7 +148,7 @@ async def get_report(
145148
storage.get_latest_snapshot(url)
146149
validation_result = storage.get_latest_validation_result(url, schema_name="")
147150
# Try to find any schema name for this URL
148-
if validation_result is None and hasattr(storage, '_conn'):
151+
if validation_result is None and hasattr(storage, "_conn"):
149152
cursor = storage._conn.execute(
150153
"SELECT schema_name FROM validation_results"
151154
" WHERE url = ? ORDER BY timestamp DESC LIMIT 1",
@@ -169,23 +172,25 @@ async def get_report(
169172
url=url,
170173
)
171174

172-
return JSONResponse(content={
173-
"overall_score": report.overall_score,
174-
"status": report.status,
175-
"components": [
176-
{
177-
"name": c.name,
178-
"score": round(c.score, 4),
179-
"weight": c.weight,
180-
"details": c.details,
181-
}
182-
for c in report.components
183-
],
184-
"drift_events": [asdict(e) for e in report.drift_events],
185-
"run_id": report.run_id,
186-
"url": report.url,
187-
"timestamp": report.timestamp.isoformat(),
188-
})
175+
return JSONResponse(
176+
content={
177+
"overall_score": report.overall_score,
178+
"status": report.status,
179+
"components": [
180+
{
181+
"name": c.name,
182+
"score": round(c.score, 4),
183+
"weight": c.weight,
184+
"details": c.details,
185+
}
186+
for c in report.components
187+
],
188+
"drift_events": [asdict(e) for e in report.drift_events],
189+
"run_id": report.run_id,
190+
"url": report.url,
191+
"timestamp": report.timestamp.isoformat(),
192+
}
193+
)
189194

190195

191196
@router.get("/selectors/{url:path}")
@@ -214,6 +219,8 @@ async def get_selector_statuses(
214219
previous_tree = parse_to_tree(snapshots[1].normalized_html) if len(snapshots) > 1 else None
215220

216221
statuses = track_selectors(current_tree, previous_tree, selector_list)
217-
return JSONResponse(content={
218-
"selector_statuses": [asdict(s) for s in statuses],
219-
})
222+
return JSONResponse(
223+
content={
224+
"selector_statuses": [asdict(s) for s in statuses],
225+
}
226+
)

src/scraperguard/cli/main.py

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import urllib.request
1616
from datetime import UTC, datetime
1717
from pathlib import Path
18+
from typing import Any
1819

1920
import click
2021

@@ -33,7 +34,7 @@
3334
from scraperguard.storage.models import SnapshotMetadata
3435

3536

36-
def _fetch_url(url: str) -> tuple[str, int, dict, float]:
37+
def _fetch_url(url: str) -> tuple[str, int, dict[str, Any], float]:
3738
"""Fetch a URL and return (html, status, headers, latency_ms)."""
3839
start = time.monotonic()
3940
req = urllib.request.Request(url, headers={"User-Agent": "ScraperGuard/1.0"})
@@ -55,7 +56,9 @@ def cli() -> None:
5556
@click.argument("target")
5657
@click.option("--schema", default=None, help="Path to a Python file with a BaseSchema subclass.")
5758
@click.option(
58-
"--config", "config_path", default=None,
59+
"--config",
60+
"config_path",
61+
default=None,
5962
help="Path to scraperguard.yaml config file.",
6063
)
6164
@click.option("--run-id", default=None, help="Run ID to group with (creates new if not provided).")
@@ -89,9 +92,9 @@ def run(
8992
# d) Get HTML and items
9093
url: str
9194
html: str
92-
items: list[dict]
95+
items: list[dict[str, Any]]
9396
http_status: int = 200
94-
headers: dict = {}
97+
headers: dict[str, Any] = {}
9598
latency_ms: float = 0.0
9699

97100
if target.startswith("http://") or target.startswith("https://"):
@@ -148,11 +151,14 @@ def run(
148151
try:
149152
schema_cls = load_schema_from_file(schema)
150153
validation_result = schema_cls.validate_batch(
151-
items, run_id=run_meta.id, url=url,
154+
items,
155+
run_id=run_meta.id,
156+
url=url,
152157
)
153158
try:
154159
drift_events = run_drift_analysis(
155-
validation_result, storage,
160+
validation_result,
161+
storage,
156162
threshold=cfg.schema.null_drift_threshold,
157163
)
158164
except Exception as exc:
@@ -197,7 +203,8 @@ def run(
197203
prev_snapshot_obj = s
198204
break
199205
if prev_snapshot_obj and should_diff(
200-
snapshot.fingerprint, prev_snapshot_obj.fingerprint,
206+
snapshot.fingerprint,
207+
prev_snapshot_obj.fingerprint,
201208
):
202209
before_tree = parse_to_tree(prev_snapshot_obj.normalized_html)
203210
after_tree = parse_to_tree(snapshot.normalized_html)
@@ -214,7 +221,8 @@ def run(
214221
prev_snapshot_obj = s
215222
break
216223
if prev_snapshot_obj and should_diff(
217-
snapshot.fingerprint, prev_snapshot_obj.fingerprint,
224+
snapshot.fingerprint,
225+
prev_snapshot_obj.fingerprint,
218226
):
219227
before_tree = parse_to_tree(prev_snapshot_obj.normalized_html)
220228
after_tree = parse_to_tree(snapshot.normalized_html)
@@ -223,14 +231,16 @@ def run(
223231
click.echo(f"Warning: DOM diff failed: {exc}", err=True)
224232

225233
# i) Failure classification
226-
classifications = classify_failure(ClassificationInput(
227-
validation_result=validation_result,
228-
dom_changes=dom_changes,
229-
selector_statuses=selector_statuses,
230-
raw_html=html,
231-
http_status=http_status,
232-
response_size_bytes=len(html.encode("utf-8")),
233-
))
234+
classifications = classify_failure(
235+
ClassificationInput(
236+
validation_result=validation_result,
237+
dom_changes=dom_changes,
238+
selector_statuses=selector_statuses,
239+
raw_html=html,
240+
http_status=http_status,
241+
response_size_bytes=len(html.encode("utf-8")),
242+
)
243+
)
234244

235245
# j) Health score
236246
report = compute_health_score(
@@ -244,16 +254,21 @@ def run(
244254
)
245255

246256
# k) Alerting
247-
dispatchers = []
257+
from scraperguard.alerts.base import AlertDispatcher
258+
259+
dispatchers: list[AlertDispatcher] = []
248260
if cfg.alerts.slack.enabled and cfg.alerts.slack.webhook:
249261
from scraperguard.alerts.slack import SlackDispatcher
262+
250263
dispatchers.append(SlackDispatcher(cfg.alerts.slack.webhook))
251264
if cfg.alerts.webhook_url:
252265
from scraperguard.alerts.webhook import WebhookDispatcher
266+
253267
dispatchers.append(WebhookDispatcher(cfg.alerts.webhook_url))
254268
if dispatchers:
255269
from scraperguard.alerts.dispatcher import AlertManager
256270
from scraperguard.alerts.models import Alert
271+
257272
alert_mgr = AlertManager(dispatchers, cfg.alerts.thresholds)
258273
for c in classifications:
259274
if c.severity in ("critical", "warning"):
@@ -483,7 +498,9 @@ def report(url: str, run_id: str | None, fmt: str) -> None:
483498
schema_compliance = comp_map.get("Schema Compliance", "")
484499
extraction_completeness = comp_map.get("Extraction Completeness", "")
485500
selector_stability = comp_map.get("Selector Stability", "")
486-
click.echo("url,score,status,schema_compliance,extraction_completeness,selector_stability,timestamp")
501+
click.echo(
502+
"url,score,status,schema_compliance,extraction_completeness,selector_stability,timestamp"
503+
)
487504
click.echo(
488505
f"{url},{health_report.overall_score},{health_report.status},"
489506
f"{schema_compliance},{extraction_completeness},{selector_stability},"
@@ -500,8 +517,7 @@ def serve(host: str, port: int) -> None:
500517
import uvicorn
501518
except ImportError:
502519
click.echo(
503-
"Error: uvicorn not installed. "
504-
"Install API dependencies: pip install scraperguard[api]",
520+
"Error: uvicorn not installed. Install API dependencies: pip install scraperguard[api]",
505521
err=True,
506522
)
507523
raise SystemExit(1)

0 commit comments

Comments (0)