Leoglme · Leoglme · May 15, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
diff --git a/.gitignore b/.gitignore
@@ -7,4 +7,7 @@ api/.env
 web/.env
 web/goupixdex.key
 web/goupixdex.key.pub
-integrations
+integrations
+
+# Local Cursor / agent guidelines — not versioned
+CLAUDE.md
diff --git a/api/.env.example b/api/.env.example
@@ -27,3 +27,7 @@ CORS_ORIGINS=*
 # EBAY_CLIENT_SECRET=
 # EBAY_REDIRECT_URI=https://votre-frontend/settings/marketplaces
 # EBAY_USE_SANDBOX=true
+# Optional HTTP proxy for the « vendus » (sold) HTML scrape (use it if eBay returns 403 to the server's IP).
+# EBAY_SOLD_SCRAPE_PROXY=http://127.0.0.1:8888
+# Seconds between two « sold-scrape » requests per user (eBay anti-bot / burst traffic).
+# EBAY_SOLD_SCRAPE_MIN_INTERVAL_SECONDS=60
diff --git a/api/config.py b/api/config.py
@@ -79,6 +79,10 @@ class AppSettings(BaseSettings):
     ebay_redirect_uri: str | None = None
     #: Use sandbox API hosts (``auth.sandbox.ebay.com``, ``api.sandbox.ebay.com``).
     ebay_use_sandbox: bool = True
+    #: Optional HTTP(S) proxy for fetching eBay « vendus » HTML (datacenter IPs are often blocked).
+    ebay_sold_scrape_proxy: str | None = None
+    #: Min seconds between two « sold-scrape » calls **per user** (limits burst traffic to eBay).
+    ebay_sold_scrape_min_interval_seconds: float = Field(default=60.0, ge=0, le=3600)
 
 
 @lru_cache

diff --git a/api/requirements.txt b/api/requirements.txt
@@ -19,6 +19,9 @@ python-jose[cryptography]>=3.3.0
 nodriver>=0.48.0
 python-dotenv>=1.0.0
 httpx>=0.27.0
+# curl_cffi: HTTP client with TLS/JA3 fingerprint impersonation (Chrome/Firefox).
+# Used by ebay_sold_scrape_service to bypass datacenter-IP 403s on ebay.fr.
+curl_cffi>=0.7.0
 beautifulsoup4>=4.12.0
 Pillow>=10.0.0
 supabase>=2.0.0

diff --git a/api/routes/ebay_market_route.py b/api/routes/ebay_market_route.py
@@ -3,9 +3,10 @@
 from __future__ import annotations
 
 import logging
-from typing import Annotated
+from typing import Annotated, Any
 
 from fastapi import APIRouter, Depends, HTTPException, Query, status
+from pydantic import BaseModel, Field
 
 from app_types.ebay_browse import (
     ConditionFilter,
@@ -19,6 +20,10 @@
 from services.ebay_app_oauth_service import ebay_app_oauth_configured
 from services.ebay_browse_service import DEFAULT_LIMIT, MAX_LIMIT, browse_search
 from services.ebay_price_aggregator_service import aggregate_prices, partition_outliers
+from services.ebay_sold_scrape_rate_limit import acquire_sold_scrape_slot
+from services.ebay_sold_scrape_service import ebay_fr_sold_search_url, scrape_sold_listings
+from services.ebay_sold_top_service import aggregate_top_sold
+from services.ebay_sold_top_worker import get_job, peek_items_sample, submit_job
 
 logger = logging.getLogger(__name__)
 
@@ -159,3 +164,149 @@ async def search_market(
         "total_matches": total,
         "warnings": warnings,
     }
+
+
+@router.get("/sold-scrape", response_model=None)
+async def sold_scrape_html(
+    user: Annotated[User, Depends(get_current_user)],
+    q: Annotated[str, Query(min_length=2, max_length=256)],
+    window_hours: Annotated[float, Query(ge=1, le=720)] = 168,
+    limit: Annotated[int, Query(ge=1, le=60)] = 50,
+) -> dict[str, Any]:
+    """
+    Sold (completed) listings scraped from the **public eBay HTML search** —
+    no Marketplace Insights OAuth involved. Window: up to ``720`` hours (30 days).
+
+    When ``peek_items_sample`` has a result for the same ``(q, window_hours)``,
+    its items are returned directly and no rate-limit slot is consumed.
+    Otherwise each user gets one scrape per ``EBAY_SOLD_SCRAPE_MIN_INTERVAL_SECONDS``
+    (``429`` with ``Retry-After`` when exceeded). The scrape itself may fail with
+    bot protection (403); optional ``EBAY_SOLD_SCRAPE_PROXY`` in server env.
+    """
+    app = get_settings()
+    query = q.strip()
+    page_size = min(60, max(limit, 10))
+
+    def _payload(listings: Any, error: Any, source: str, cached: bool) -> dict[str, Any]:
+        # Both exits share one response envelope; only these four values differ.
+        return {
+            "query": query,
+            "window_hours": window_hours,
+            "items": listings,
+            "error": error,
+            "ebay_sold_search_url": ebay_fr_sold_search_url(q=query, page_size=page_size),
+            "source": source,
+            "cached": cached,
+        }
+
+    # A cached Top-mode result doubles as the List-mode answer, so switching
+    # modes costs neither an eBay roundtrip nor the user's rate-limit slot.
+    cached_sample = peek_items_sample(q=query, window_hours=window_hours)
+    if cached_sample is not None:
+        return _payload(cached_sample[:limit], None, "ebay_html_scrape_cached_from_top", True)
+
+    iv = app.ebay_sold_scrape_min_interval_seconds
+    retry_after = await acquire_sold_scrape_slot(user.id, iv)
+    if retry_after > 0:
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail=(
+                f"Rate limit: at most one eBay sold-search every {iv:g} s "
+                f"(retry in {retry_after} s)."
+            ),
+            headers={"Retry-After": str(retry_after)},
+        )
+
+    items, err = await scrape_sold_listings(
+        q=query, window_hours=window_hours, limit=limit, app=app,
+    )
+    return _payload(items, err, "ebay_html_scrape", False)
+
+
+class SoldTopSubmitBody(BaseModel):
+    """Body for ``POST /ebay/market/sold-top`` — schedules a background scrape."""
+
+    q: str = Field(min_length=2, max_length=256)  # search query; length is validated before the route strips whitespace
+    window_hours: float = Field(default=168, ge=1, le=720)  # look-back window in hours: default 168 (7 days), max 720 (30 days)
+    pages: int = Field(default=10, ge=1, le=20)  # passed to submit_job(pages=...); presumably result pages to fetch — confirm in worker
+    scrape_limit: int = Field(default=600, ge=10, le=1000)  # passed to submit_job(scrape_limit=...); presumably max listings scraped — confirm
+    top_limit: int = Field(default=20, ge=1, le=100)  # passed to submit_job(top_limit=...); presumably size of the returned top list — confirm
+    min_count: int = Field(default=1, ge=1, le=20)  # passed to submit_job(min_count=...); presumably min sales for an entry to rank — confirm
+
+
+@router.post("/sold-top", response_model=None, status_code=status.HTTP_202_ACCEPTED)
+async def sold_top_submit(
+    user: Annotated[User, Depends(get_current_user)],
+    body: SoldTopSubmitBody,
+) -> dict[str, Any]:
+    """
+    Queue a background top-sold scrape and return the job's public state
+    (poll it via ``GET /ebay/market/sold-top/{job_id}``).
+
+    A job that comes back ``status="completed"`` with a non-``None``
+    ``result`` is treated as a cache hit and skips the per-user rate limit;
+    any other job must obtain a rate-limit slot, otherwise the request is
+    answered with ``429`` and a ``Retry-After`` header.
+    """
+    app = get_settings()
+    query = body.q.strip()
+    job = submit_job(
+        user_id=user.id,
+        q=query,
+        window_hours=body.window_hours,
+        pages=body.pages,
+        scrape_limit=body.scrape_limit,
+        top_limit=body.top_limit,
+        min_count=body.min_count,
+        app=app,
+    )
+
+    cache_hit = job.status == "completed" and job.result is not None
+    iv = app.ebay_sold_scrape_min_interval_seconds
+    # NOTE(review): submit_job() has already run at this point. If it starts the
+    # scrape eagerly, the 429 below rejects the caller without stopping that
+    # scrape — confirm against the worker implementation.
+    retry_after = 0 if cache_hit else await acquire_sold_scrape_slot(user.id, iv)
+    if retry_after > 0:
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail=(
+                f"Rate limit: at most one eBay sold-search every {iv:g} s "
+                f"(retry in {retry_after} s)."
+            ),
+            headers={"Retry-After": str(retry_after)},
+        )
+
+    response = dict(job.to_public())
+    response["ebay_sold_search_url"] = ebay_fr_sold_search_url(q=query, page_size=60)
+    response["cached"] = cache_hit
+    return response
+
+
+@router.get("/sold-top/{job_id}", response_model=None)
+async def sold_top_status(
+    user: Annotated[User, Depends(get_current_user)],
+    job_id: str,
+) -> dict[str, Any]:
+    """
+    Poll a ``sold-top`` job: public job state plus the matching eBay search URL.
+
+    Clients call this repeatedly while ``status`` is ``pending`` or ``running``
+    and stop once it is ``completed`` or ``failed``.
+
+    Raises ``404`` when ``get_job`` finds nothing for ``job_id`` and ``403``
+    when the job was created by a different user.
+    """
+    found = get_job(job_id)
+    # Resolve the failure (if any) first: a missing job wins over an ownership mismatch.
+    if found is None:
+        failure = (status.HTTP_404_NOT_FOUND, "Unknown or expired job.")
+    elif found.user_id != user.id:
+        failure = (status.HTTP_403_FORBIDDEN, "This job does not belong to you.")
+    else:
+        failure = None
+    if failure is not None:
+        raise HTTPException(status_code=failure[0], detail=failure[1])
+    payload = dict(found.to_public())
+    payload["ebay_sold_search_url"] = ebay_fr_sold_search_url(q=found.q, page_size=60)
+    return payload
diff --git a/api/scripts/debug_ebay_scrape.py b/api/scripts/debug_ebay_scrape.py
@@ -0,0 +1,138 @@
+"""
+One-shot diagnostic for ``ebay_sold_scrape_service``.
+
+Fetches the same URL the production service uses, dumps the HTML to
+``/tmp/ebay-sold.html``, and reports how many elements match candidate
+selectors so we can pick the right one when eBay rotates its SRP layout.
+
+Run from the ``api/`` directory:
+
+    python -m scripts.debug_ebay_scrape "carte pokemon"
+"""
+
+from __future__ import annotations
+
+import asyncio
+import sys
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+from services.ebay_sold_scrape_service import (
+    _parse_sold_rows,
+    fetch_sold_listings_html,
+)
+
+OUT = Path("/tmp/ebay-sold.html")  # where main() writes the fetched HTML for offline inspection
+
+#: Candidate row selectors to probe. Order is informational; main() prints a match count for every entry.
+_CANDIDATE_SELECTORS = (
+    "li.s-item",
+    "ul.srp-results > li",
+    ".srp-results .s-item",
+    ".srp-results .s-item__wrapper",
+    "li.s-item__pl-on-bottom",
+    "[data-testid='srp-results'] li",
+    "[data-view*='mi:1686'] li",
+    "ul.b-list__items_nofooter li",
+    "li[data-viewport]",
+    "div.s-card",
+)
+
+#: Selectors that often hold the relative/absolute « sold » caption; main() tries them in order per row, first match wins.
+_CAPTION_SELECTORS = (
+    ".s-item__caption--signal",
+    ".s-item__title--tagblock",
+    ".s-item__subtitle",
+    ".s-card__caption",
+    "[class*='caption']",
+)
+
+
+async def main(q: str) -> None:  # fetch the sold-search HTML for ``q``, dump it to OUT, print selector diagnostics
+    html = await fetch_sold_listings_html(q=q, page_size=50)
+    OUT.write_text(html, encoding="utf-8")
+    print(f"saved html ({len(html)} bytes) → {OUT}")
+
+    soup = BeautifulSoup(html, "html.parser")
+
+    title = soup.select_one("title")
+    print(f"<title>: {title.get_text(strip=True) if title else '(none)'}")
+
+    h1 = soup.select_one("h1")
+    print(f"<h1>: {h1.get_text(' ', strip=True)[:120] if h1 else '(none)'}")
+
+    # Quick consent-page heuristic: only the first 4000 characters are scanned, case-insensitively
+    consent_markers = ("consent", "consentement", "accepter", "vos choix")
+    head_excerpt = html[:4000].lower()
+    if any(tok in head_excerpt for tok in consent_markers):
+        print("⚠️  consent-related token found in first 4 KB — possible CMP page")
+
+    print("\n-- selector probe --")
+    for sel in _CANDIDATE_SELECTORS:
+        try:
+            n = len(soup.select(sel))
+        except Exception as exc:  # invalid selector etc. — report it in place of the count, keep probing
+            n = f"ERR({exc})"
+        print(f"  {sel:55s} → {n}")
+
+    print("\n-- existing parser --")
+    rows = _parse_sold_rows(html)
+    print(f"  _parse_sold_rows: {len(rows)} rows")
+    for r in rows[:3]:
+        print(f"    title={r.title[:60]!r}  caption={r.sold_caption!r}  hours_ago={r.approx_hours_ago}")
+
+    # Sample captions from the first 5 rows of whichever row selector below matches first
+    print("\n-- sample captions from first li.s-item or fallback --")
+    sample_lis = soup.select("li.s-item") or soup.select("li.s-item__pl-on-bottom") or soup.select("ul.srp-results > li")
+    for i, li in enumerate(sample_lis[:5]):
+        for csel in _CAPTION_SELECTORS:
+            cap = li.select_one(csel)
+            if cap:
+                print(f"  li#{i} via {csel}: {cap.get_text(' ', strip=True)[:120]!r}")
+                break
+        else:
+            print(f"  li#{i} (no caption matched any selector)")
+
+    # Field probes on the first 2 LIs so we can pin down the new s-card selectors
+    field_probes: dict[str, tuple[str, ...]] = {
+        "title": (
+            ".s-card__title", ".s-card__title-link",
+            "[role='heading']", "[role=heading]",
+            "a .su-styled-text", ".s-item__title", ".s-item__title span",
+        ),
+        "price": (".s-card__price", ".s-item__price", "[class*='price']"),
+        "link": ("a.su-link", "a[href*='/itm/']", "a.s-item__link"),
+        "image": (
+            ".s-card__image img", ".s-card__image-wrapper img",
+            "img.s-item__image-img", ".image-treatment img", "img",
+        ),
+        "caption": _CAPTION_SELECTORS,
+    }
+
+    print("\n-- field selector probe (first 2 LIs) --")
+    for i, li in enumerate(sample_lis[:2]):
+        print(f"\n[li #{i}]")
+        for field, sels in field_probes.items():
+            for s in sels:
+                el = li.select_one(s)
+                if not el:
+                    continue
+                if field == "link":
+                    snippet = (el.get("href") or "")[:120]
+                elif field == "image":
+                    snippet = (el.get("src") or el.get("data-src") or "")[:120]
+                else:
+                    snippet = el.get_text(" ", strip=True)[:120]
+                print(f"  {field:7s} via {s:35s} → {snippet!r}")
+                break
+            else:
+                print(f"  {field:7s} no match")
+        # Also dump the start of the LI's outer HTML (first 300 chars, newlines flattened) so we can see attributes
+        outer = str(li)[:300].replace("\n", " ")
+        print(f"  outer[:300]: {outer}")
+
+
+if __name__ == "__main__":
+    cli_args = sys.argv[1:]  # first CLI argument is the search query; the default applies when absent
+    asyncio.run(main(cli_args[0] if cli_args else "carte pokemon"))
diff --git a/api/services/ebay_app_oauth_service.py b/api/services/ebay_app_oauth_service.py
@@ -55,11 +55,12 @@ async def _request_app_token(app: AppSettings) -> dict[str, Any]:
     async with httpx.AsyncClient(timeout=30.0) as client:
         resp = await client.post(_token_url(app), data=data, headers=headers)
     if resp.status_code >= 400:
-        logger.warning(
-            "eBay app token request failed: %s %s",
-            resp.status_code,
-            resp.text[:500],
-        )
+        if resp.status_code >= 500:
+            logger.warning(
+                "eBay app token request failed: %s %s",
+                resp.status_code,
+                resp.text[:500],
+            )
         resp.raise_for_status()
     return resp.json()
 

diff --git a/api/services/ebay_sold_scrape_rate_limit.py b/api/services/ebay_sold_scrape_rate_limit.py
@@ -0,0 +1,27 @@
+"""In-memory per-user rate limit for eBay « vendus » HTML scrape (reduces bot flags on eBay)."""
+
+from __future__ import annotations
+
+import asyncio
+import time
+
+_lock = asyncio.Lock()  # held by acquire_sold_scrape_slot() around every read/write of ``_last``
+_last: dict[int, float] = {}  # user_id -> time.monotonic() value recorded when that user's last slot was granted
+
+
+async def acquire_sold_scrape_slot(user_id: int, min_interval_sec: float) -> int:
+    """
+    Allow at most one request per ``min_interval_sec`` per user (monotonic clock); ``<= 0`` disables the limit.
+
+    :returns: ``0`` if the caller may proceed; else the remaining wait rounded up to whole seconds (for ``Retry-After``).
+    """
+    import math  # function-scope import: keeps this fix independent of the module's import block
+    if min_interval_sec <= 0:
+        return 0
+    async with _lock:
+        now = time.monotonic()  # sampled under the lock so the recorded timestamp is current
+        last = _last.get(user_id)
+        if last is not None and now - last < min_interval_sec:
+            return max(1, math.ceil(min_interval_sec - (now - last)))  # true ceiling: never under-reports the wait
+        _last[user_id] = now
+        return 0