
Commit 5d77ab7

Fix all ruff lint errors in CI

1 parent 8824e77 commit 5d77ab7

15 files changed

Lines changed: 81 additions & 53 deletions


src/scraperguard/alerts/models.py

Lines changed: 2 additions & 2 deletions

@@ -3,11 +3,11 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 
 
 def _utcnow() -> datetime:
-    return datetime.now(timezone.utc)
+    return datetime.now(UTC)
 
 
 @dataclass
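The `timezone.utc` to `UTC` swap here (and in several files below) is the pyupgrade-style fix for the `datetime.UTC` alias, presumably ruff's UP017. A minimal sketch of the equivalence, assuming Python 3.11+:

```python
from datetime import UTC, datetime, timezone

# UTC (added in Python 3.11) is the very same object as timezone.utc,
# so the rewrite changes spelling only, never behavior.
assert UTC is timezone.utc

print(datetime.now(UTC).isoformat())  # e.g. 2026-02-12T09:30:00.123456+00:00
```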

src/scraperguard/api/routes.py

Lines changed: 3 additions & 5 deletions

@@ -126,9 +126,6 @@ async def get_report(
     # If url not specified, find the first snapshot for this run
     if url is None:
         # Query snapshots associated with this run — we need to find a URL
-        # Use a simple approach: get the run's snapshots via a direct query
-        snapshot = None
-        # Try to get a snapshot from this run via storage
         # The storage doesn't have a list-by-run method, so use the connection directly
         if hasattr(storage, '_conn'):
             cursor = storage._conn.execute(
@@ -145,12 +142,13 @@
                 content={"error": "No snapshots found for this run"},
             )
 
-    snapshot = storage.get_latest_snapshot(url)
+    storage.get_latest_snapshot(url)
     validation_result = storage.get_latest_validation_result(url, schema_name="")
     # Try to find any schema name for this URL
     if validation_result is None and hasattr(storage, '_conn'):
         cursor = storage._conn.execute(
-            "SELECT schema_name FROM validation_results WHERE url = ? ORDER BY timestamp DESC LIMIT 1",
+            "SELECT schema_name FROM validation_results"
+            " WHERE url = ? ORDER BY timestamp DESC LIMIT 1",
            (url,),
         )
        row = cursor.fetchone()
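Two fixes here: the unused `snapshot` binding is dropped (ruff's F841) while the call itself is kept for any side effects, and the long SQL literal is split using implicit string concatenation to satisfy the line-length limit. A quick sketch showing the split leaves the query unchanged:

```python
# Adjacent string literals are concatenated at compile time, so the
# wrapped SQL is byte-for-byte identical to the original one-liner.
wrapped = (
    "SELECT schema_name FROM validation_results"
    " WHERE url = ? ORDER BY timestamp DESC LIMIT 1"
)
assert wrapped.count("validation_results WHERE") == 1  # the seam keeps its space
assert "\n" not in wrapped
```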

src/scraperguard/cli/main.py

Lines changed: 14 additions & 7 deletions

@@ -13,7 +13,7 @@
 import json
 import time
 import urllib.request
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from pathlib import Path
 
 import click
@@ -54,7 +54,10 @@ def cli() -> None:
 @cli.command()
 @click.argument("target")
 @click.option("--schema", default=None, help="Path to a Python file with a BaseSchema subclass.")
-@click.option("--config", "config_path", default=None, help="Path to scraperguard.yaml config file.")
+@click.option(
+    "--config", "config_path", default=None,
+    help="Path to scraperguard.yaml config file.",
+)
 @click.option("--run-id", default=None, help="Run ID to group with (creates new if not provided).")
 @click.option("--selectors", default=None, help="Comma-separated CSS selectors to track.")
 @click.option("--store-raw-html", is_flag=True, default=False, help="Store raw HTML in snapshot.")
@@ -121,7 +124,7 @@ def run(
         metadata = SnapshotMetadata(
             http_status=http_status,
             latency_ms=latency_ms,
-            timestamp=datetime.now(timezone.utc),
+            timestamp=datetime.now(UTC),
             headers=headers,
             response_size_bytes=len(html.encode("utf-8")),
         )
@@ -156,7 +159,8 @@ def run(
             click.echo(f"Warning: Drift analysis failed: {exc}", err=True)
         storage.save_validation_result(validation_result)
         click.echo(
-            f"Schema validation: {validation_result.passed_count}/{validation_result.total_items} passed"
+            f"Schema validation: "
+            f"{validation_result.passed_count}/{validation_result.total_items} passed"
         )
     except SchemaLoadError as exc:
         click.echo(f"Warning: Schema load failed: {exc}", err=True)
@@ -168,7 +172,6 @@ def run(
     if selector_list:
         try:
             current_tree = parse_to_tree(snapshot.normalized_html)
-            prev_snapshot = storage.get_latest_snapshot(url)
             # get_latest_snapshot might return the one we just saved; get the one before
             snapshots = storage.list_snapshots(url, limit=2)
             prev_tree = None
@@ -193,7 +196,9 @@ def run(
                 if s.id != snapshot.id:
                     prev_snapshot_obj = s
                     break
-            if prev_snapshot_obj and should_diff(snapshot.fingerprint, prev_snapshot_obj.fingerprint):
+            if prev_snapshot_obj and should_diff(
+                snapshot.fingerprint, prev_snapshot_obj.fingerprint,
+            ):
                 before_tree = parse_to_tree(prev_snapshot_obj.normalized_html)
                 after_tree = parse_to_tree(snapshot.normalized_html)
                 dom_changes = diff_trees(before_tree, after_tree)
@@ -208,7 +213,9 @@ def run(
                 if s.id != snapshot.id:
                     prev_snapshot_obj = s
                     break
-            if prev_snapshot_obj and should_diff(snapshot.fingerprint, prev_snapshot_obj.fingerprint):
+            if prev_snapshot_obj and should_diff(
+                snapshot.fingerprint, prev_snapshot_obj.fingerprint,
+            ):
                 before_tree = parse_to_tree(prev_snapshot_obj.normalized_html)
                 after_tree = parse_to_tree(snapshot.normalized_html)
                 dom_changes = diff_trees(before_tree, after_tree)
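The long `click.option` call, the echo message, and the `should_diff` conditions are all wrapped for the line-length limit. Adjacent f-string literals concatenate exactly like plain ones, so the split `click.echo` output is unchanged; a tiny sketch with made-up counts:

```python
passed_count, total_items = 8, 10  # hypothetical values
message = (
    f"Schema validation: "
    f"{passed_count}/{total_items} passed"
)
# The two f-string pieces fuse into one string at compile time.
assert message == "Schema validation: 8/10 passed"
```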

src/scraperguard/config.py

Lines changed: 0 additions & 1 deletion

@@ -17,7 +17,6 @@
 
 from scraperguard.storage.base import StorageBackend
 
-
 # ---------------------------------------------------------------------------
 # Configuration dataclasses
 # ---------------------------------------------------------------------------

src/scraperguard/core/classify/classifier.py

Lines changed: 31 additions & 10 deletions

@@ -9,14 +9,14 @@
 
 import re
 from dataclasses import dataclass, field
-from enum import Enum
+from enum import StrEnum
 
-from scraperguard.core.dom_diff.differ import DOMChange, ChangeType
+from scraperguard.core.dom_diff.differ import ChangeType, DOMChange
 from scraperguard.core.dom_diff.selector_tracker import SelectorStatus
 from scraperguard.storage.models import ValidationResult
 
 
-class FailureType(str, Enum):
+class FailureType(StrEnum):
     """Known failure root causes."""
 
     SELECTOR_BREAK = "selector_break"
@@ -96,7 +96,10 @@ def _check_captcha(inp: ClassificationInput) -> Classification | None:
         confidence=confidence,
         evidence=[f"Found CAPTCHA signature: '{sig}'" for sig in found],
         affected_fields=[],
-        recommended_action="Target site is serving a CAPTCHA. Consider using a CAPTCHA-solving service or rotating IP addresses.",
+        recommended_action=(
+            "Target site is serving a CAPTCHA."
+            " Consider using a CAPTCHA-solving service or rotating IP addresses."
+        ),
         severity="critical",
     )
 
@@ -126,9 +129,15 @@ def _check_js_challenge(inp: ClassificationInput) -> Classification | None:
         return Classification(
             failure_type=FailureType.JS_CHALLENGE,
             confidence=0.80,
-            evidence=[f"Page has minimal text content ({len(visible_text)} chars) but contains {script_count} script tags"],
+            evidence=[
+                f"Page has minimal text content ({len(visible_text)} chars)"
+                f" but contains {script_count} script tags",
+            ],
             affected_fields=[],
-            recommended_action="Page requires JavaScript rendering. Use a browser-based scraper (Playwright, Selenium).",
+            recommended_action=(
+                "Page requires JavaScript rendering."
+                " Use a browser-based scraper (Playwright, Selenium)."
+            ),
             severity="critical",
         )
     return None
@@ -258,9 +267,15 @@ def _check_dom_restructure(inp: ClassificationInput) -> Classification | None:
         return Classification(
             failure_type=FailureType.DOM_RESTRUCTURE,
             confidence=confidence,
-            evidence=[f"Detected {total} structural DOM changes ({high_count} high severity)"],
+            evidence=[
+                f"Detected {total} structural DOM changes"
+                f" ({high_count} high severity)",
+            ],
             affected_fields=[],
-            recommended_action="Major structural change detected. Review page layout and update scraper selectors.",
+            recommended_action=(
+                "Major structural change detected."
+                " Review page layout and update scraper selectors."
+            ),
             severity=severity,
         )
     return None
@@ -286,7 +301,10 @@ def _check_ab_variant(inp: ClassificationInput) -> Classification | None:
         return Classification(
             failure_type=FailureType.AB_VARIANT,
             confidence=0.55,
-            evidence=[f"Partial selector failure ({broken_count}/{total_count}) with moderate structural changes suggests A/B variant"],
+            evidence=[
+                f"Partial selector failure ({broken_count}/{total_count})"
+                " with moderate structural changes suggests A/B variant",
+            ],
             affected_fields=[],
             recommended_action="Possible A/B test variant. Monitor over multiple runs to confirm.",
             severity="info",
@@ -305,7 +323,10 @@ def _check_partial_extraction(inp: ClassificationInput) -> Classification | None
             confidence=0.65,
             evidence=[f"{vr.failed_count}/{vr.total_items} items failed validation"],
             affected_fields=[],
-            recommended_action="Partial extraction failure. Some items are extracting correctly. Check specific failure patterns.",
+            recommended_action=(
+                "Partial extraction failure."
+                " Some items are extracting correctly. Check specific failure patterns."
+            ),
             severity="warning",
         )
     return None
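The `str`-plus-`Enum` classes become `StrEnum` here and in differ.py below, likely ruff's UP042, which applies on Python 3.11+. Members still compare equal to plain strings; the one observable difference is `str()`, which now returns the value instead of the qualified member name. A small sketch:

```python
from enum import StrEnum


class FailureType(StrEnum):
    SELECTOR_BREAK = "selector_break"


# Members are real str instances, as with the old `class FailureType(str, Enum)`.
assert FailureType.SELECTOR_BREAK == "selector_break"
assert isinstance(FailureType.SELECTOR_BREAK, str)

# Difference: str() yields the value, not "FailureType.SELECTOR_BREAK".
assert str(FailureType.SELECTOR_BREAK) == "selector_break"
assert f"{FailureType.SELECTOR_BREAK}" == "selector_break"
```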

src/scraperguard/core/dom_diff/differ.py

Lines changed: 6 additions & 3 deletions

@@ -8,12 +8,12 @@
 
 from collections import Counter
 from dataclasses import dataclass, field
-from enum import Enum
+from enum import StrEnum
 
 from scraperguard.core.dom_diff.parser import DOMNode, find_nodes_by_selector
 
 
-class ChangeType(str, Enum):
+class ChangeType(StrEnum):
     """Categories of structural DOM changes."""
 
     NODE_REMOVED = "node_removed"
@@ -202,7 +202,10 @@ def _diff_children(before: DOMNode, after: DOMNode, changes: list[DOMChange]) ->
             severity="medium",
             details={
                 "before_order": [before_children[bi].tag for bi, _ in matches],
-                "after_order": [after_children[ai].tag for _, ai in sorted(matches, key=lambda m: m[1])],
+                "after_order": [
+                    after_children[ai].tag
+                    for _, ai in sorted(matches, key=lambda m: m[1])
+                ],
             },
             message=f"Children reordered at {before.path}",
         ))

src/scraperguard/core/snapshot/normalizer.py

Lines changed: 0 additions & 1 deletion

@@ -13,7 +13,6 @@
 import lxml.html
 from lxml.html import HtmlElement, tostring
 
-
 # ---------------------------------------------------------------------------
 # Configurable attribute removal / retention patterns
 # ---------------------------------------------------------------------------

src/scraperguard/health.py

Lines changed: 6 additions & 3 deletions

@@ -12,7 +12,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 
 from scraperguard.core.classify.classifier import Classification
 from scraperguard.core.dom_diff.differ import DOMChange
@@ -54,7 +54,7 @@ class HealthReport:
     components: list[HealthComponent]
     classifications: list[Classification]
    drift_events: list[DriftEvent]
-    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
     run_id: str = ""
     url: str = ""
 
@@ -186,7 +186,10 @@ def compute_structural_stability(
         name="Structural Stability",
         score=score,
         weight=weight,
-        details=f"{len(dom_changes)} structural changes detected ({high} high, {medium} medium, {low} low severity)",
+        details=(
+            f"{len(dom_changes)} structural changes detected"
+            f" ({high} high, {medium} medium, {low} low severity)"
+        ),
     )
 
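A side note on the `timestamp` line being touched: `default_factory` runs at instantiation, so each `HealthReport` gets a fresh `datetime.now(UTC)`; a plain default would be evaluated once at class-definition time and frozen. A minimal sketch of that pattern:

```python
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime


@dataclass
class Report:  # stand-in for HealthReport
    # Evaluated per instance, not once at class definition.
    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))


first = Report()
time.sleep(0.01)
assert Report().timestamp > first.timestamp  # each instance is stamped on creation
```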

src/scraperguard/integrations/playwright/observer.py

Lines changed: 4 additions & 3 deletions

@@ -12,9 +12,10 @@
 
 import logging
 import time
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from datetime import datetime, timezone
-from typing import TYPE_CHECKING, AsyncGenerator
+from datetime import UTC, datetime
+from typing import TYPE_CHECKING
 
 from scraperguard.core.classify.classifier import ClassificationInput, classify_failure
 from scraperguard.core.dom_diff.differ import diff_trees
@@ -130,7 +131,7 @@ async def _run_pipeline(self) -> None:
         metadata = SnapshotMetadata(
             http_status=200,
             latency_ms=latency_ms,
-            timestamp=datetime.now(timezone.utc),
+            timestamp=datetime.now(UTC),
             headers={},
             response_size_bytes=len(self._raw_html.encode("utf-8")),
         )
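`typing.AsyncGenerator` is a deprecated alias; the `collections.abc` version has been subscriptable since Python 3.9, which is what ruff's deprecated-import check (presumably UP035) enforces. A runnable sketch of the annotation style, using a hypothetical `session` resource:

```python
import asyncio
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager


@asynccontextmanager
async def session() -> AsyncGenerator[str, None]:  # hypothetical resource
    # The undecorated function is an async generator; the decorator turns
    # it into an async context manager.
    yield "connection"


async def main() -> None:
    async with session() as conn:
        print(conn)  # -> connection


asyncio.run(main())
```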

src/scraperguard/integrations/scrapy/middleware.py

Lines changed: 6 additions & 3 deletions

@@ -9,7 +9,7 @@
 
 import logging
 import time
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from typing import TYPE_CHECKING
 
 from scraperguard.config import ScraperGuardConfig, get_storage_backend, load_config
@@ -89,7 +89,10 @@ def from_crawler(cls, crawler: Crawler) -> ScraperGuardObserverMiddleware:
 
             return instance
         except Exception:
-            logger.exception("ScraperGuard: Failed to initialize middleware, creating no-op instance")
+            logger.exception(
+                "ScraperGuard: Failed to initialize middleware,"
+                " creating no-op instance",
+            )
             # Return a minimal instance that will pass-through everything
             instance = cls.__new__(cls)
             instance.config = ScraperGuardConfig()
@@ -133,7 +136,7 @@ def process_response(self, request: Request, response: Response, spider: Spider)
         metadata = SnapshotMetadata(
             http_status=response.status,
             latency_ms=latency_ms,
-            timestamp=datetime.now(timezone.utc),
+            timestamp=datetime.now(UTC),
             headers=headers_dict,
             response_size_bytes=len(response.body),
         )
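When wrapping the `logger.exception` message, adjacent string literals are the right tool; a comma instead would pass the second string as a lazy %-format argument rather than extending the message. A short sketch of the distinction:

```python
import logging

logging.basicConfig()
logger = logging.getLogger("scraperguard")

try:
    raise RuntimeError("boom")
except Exception:
    # One argument: the two literals fuse into a single message string.
    logger.exception(
        "ScraperGuard: Failed to initialize middleware,"
        " creating no-op instance",
    )
    # By contrast, logger.exception("msg,", " no-op") would treat the second
    # string as a %-format argument, not as part of the message.
```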
