1515import urllib .request
1616from datetime import UTC , datetime
1717from pathlib import Path
18+ from typing import Any
1819
1920import click
2021
3334from scraperguard .storage .models import SnapshotMetadata
3435
3536
36- def _fetch_url (url : str ) -> tuple [str , int , dict , float ]:
37+ def _fetch_url (url : str ) -> tuple [str , int , dict [ str , Any ] , float ]:
3738 """Fetch a URL and return (html, status, headers, latency_ms)."""
3839 start = time .monotonic ()
3940 req = urllib .request .Request (url , headers = {"User-Agent" : "ScraperGuard/1.0" })
@@ -55,7 +56,9 @@ def cli() -> None:
5556@click .argument ("target" )
5657@click .option ("--schema" , default = None , help = "Path to a Python file with a BaseSchema subclass." )
5758@click .option (
58- "--config" , "config_path" , default = None ,
59+ "--config" ,
60+ "config_path" ,
61+ default = None ,
5962 help = "Path to scraperguard.yaml config file." ,
6063)
6164@click .option ("--run-id" , default = None , help = "Run ID to group with (creates new if not provided)." )
@@ -89,9 +92,9 @@ def run(
8992 # d) Get HTML and items
9093 url : str
9194 html : str
92- items : list [dict ]
95+ items : list [dict [ str , Any ] ]
9396 http_status : int = 200
94- headers : dict = {}
97+ headers : dict [ str , Any ] = {}
9598 latency_ms : float = 0.0
9699
97100 if target .startswith ("http://" ) or target .startswith ("https://" ):
@@ -148,11 +151,14 @@ def run(
148151 try :
149152 schema_cls = load_schema_from_file (schema )
150153 validation_result = schema_cls .validate_batch (
151- items , run_id = run_meta .id , url = url ,
154+ items ,
155+ run_id = run_meta .id ,
156+ url = url ,
152157 )
153158 try :
154159 drift_events = run_drift_analysis (
155- validation_result , storage ,
160+ validation_result ,
161+ storage ,
156162 threshold = cfg .schema .null_drift_threshold ,
157163 )
158164 except Exception as exc :
@@ -197,7 +203,8 @@ def run(
197203 prev_snapshot_obj = s
198204 break
199205 if prev_snapshot_obj and should_diff (
200- snapshot .fingerprint , prev_snapshot_obj .fingerprint ,
206+ snapshot .fingerprint ,
207+ prev_snapshot_obj .fingerprint ,
201208 ):
202209 before_tree = parse_to_tree (prev_snapshot_obj .normalized_html )
203210 after_tree = parse_to_tree (snapshot .normalized_html )
@@ -214,7 +221,8 @@ def run(
214221 prev_snapshot_obj = s
215222 break
216223 if prev_snapshot_obj and should_diff (
217- snapshot .fingerprint , prev_snapshot_obj .fingerprint ,
224+ snapshot .fingerprint ,
225+ prev_snapshot_obj .fingerprint ,
218226 ):
219227 before_tree = parse_to_tree (prev_snapshot_obj .normalized_html )
220228 after_tree = parse_to_tree (snapshot .normalized_html )
@@ -223,14 +231,16 @@ def run(
223231 click .echo (f"Warning: DOM diff failed: { exc } " , err = True )
224232
225233 # i) Failure classification
226- classifications = classify_failure (ClassificationInput (
227- validation_result = validation_result ,
228- dom_changes = dom_changes ,
229- selector_statuses = selector_statuses ,
230- raw_html = html ,
231- http_status = http_status ,
232- response_size_bytes = len (html .encode ("utf-8" )),
233- ))
234+ classifications = classify_failure (
235+ ClassificationInput (
236+ validation_result = validation_result ,
237+ dom_changes = dom_changes ,
238+ selector_statuses = selector_statuses ,
239+ raw_html = html ,
240+ http_status = http_status ,
241+ response_size_bytes = len (html .encode ("utf-8" )),
242+ )
243+ )
234244
235245 # j) Health score
236246 report = compute_health_score (
@@ -244,16 +254,21 @@ def run(
244254 )
245255
246256 # k) Alerting
247- dispatchers = []
257+ from scraperguard .alerts .base import AlertDispatcher
258+
259+ dispatchers : list [AlertDispatcher ] = []
248260 if cfg .alerts .slack .enabled and cfg .alerts .slack .webhook :
249261 from scraperguard .alerts .slack import SlackDispatcher
262+
250263 dispatchers .append (SlackDispatcher (cfg .alerts .slack .webhook ))
251264 if cfg .alerts .webhook_url :
252265 from scraperguard .alerts .webhook import WebhookDispatcher
266+
253267 dispatchers .append (WebhookDispatcher (cfg .alerts .webhook_url ))
254268 if dispatchers :
255269 from scraperguard .alerts .dispatcher import AlertManager
256270 from scraperguard .alerts .models import Alert
271+
257272 alert_mgr = AlertManager (dispatchers , cfg .alerts .thresholds )
258273 for c in classifications :
259274 if c .severity in ("critical" , "warning" ):
@@ -483,7 +498,9 @@ def report(url: str, run_id: str | None, fmt: str) -> None:
483498 schema_compliance = comp_map .get ("Schema Compliance" , "" )
484499 extraction_completeness = comp_map .get ("Extraction Completeness" , "" )
485500 selector_stability = comp_map .get ("Selector Stability" , "" )
486- click .echo ("url,score,status,schema_compliance,extraction_completeness,selector_stability,timestamp" )
501+ click .echo (
502+ "url,score,status,schema_compliance,extraction_completeness,selector_stability,timestamp"
503+ )
487504 click .echo (
488505 f"{ url } ,{ health_report .overall_score } ,{ health_report .status } ,"
489506 f"{ schema_compliance } ,{ extraction_completeness } ,{ selector_stability } ,"
@@ -500,8 +517,7 @@ def serve(host: str, port: int) -> None:
500517 import uvicorn
501518 except ImportError :
502519 click .echo (
503- "Error: uvicorn not installed. "
504- "Install API dependencies: pip install scraperguard[api]" ,
520+ "Error: uvicorn not installed. Install API dependencies: pip install scraperguard[api]" ,
505521 err = True ,
506522 )
507523 raise SystemExit (1 )
0 commit comments