Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,7 @@ api/.env
web/.env
web/goupixdex.key
web/goupixdex.key.pub
integrations
integrations

# Local Cursor / agent guidelines — not versioned
CLAUDE.md
4 changes: 4 additions & 0 deletions api/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,7 @@ CORS_ORIGINS=*
# EBAY_CLIENT_SECRET=
# EBAY_REDIRECT_URI=https://votre-frontend/settings/marketplaces
# EBAY_USE_SANDBOX=true
# Optional HTTP proxy for the « vendus » (sold) HTML scrape (use it if eBay returns 403 to requests coming from the server).
# EBAY_SOLD_SCRAPE_PROXY=http://127.0.0.1:8888
# Minimum number of seconds between two « sold-scrape » requests per user (limits burst traffic that triggers eBay anti-bot protection).
# EBAY_SOLD_SCRAPE_MIN_INTERVAL_SECONDS=60
4 changes: 4 additions & 0 deletions api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ class AppSettings(BaseSettings):
ebay_redirect_uri: str | None = None
#: Use sandbox API hosts (``auth.sandbox.ebay.com``, ``api.sandbox.ebay.com``).
ebay_use_sandbox: bool = True
#: Optional HTTP(S) proxy for fetching eBay « vendus » HTML (datacenter IPs are often blocked).
ebay_sold_scrape_proxy: str | None = None
#: Min seconds between two « sold-scrape » calls **per user** (limits burst traffic to eBay).
ebay_sold_scrape_min_interval_seconds: float = Field(default=60.0, ge=0, le=3600)


@lru_cache
Expand Down
3 changes: 3 additions & 0 deletions api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ python-jose[cryptography]>=3.3.0
nodriver>=0.48.0
python-dotenv>=1.0.0
httpx>=0.27.0
# curl_cffi: HTTP client with TLS/JA3 fingerprint impersonation (Chrome/Firefox).
# Used by ebay_sold_scrape_service to bypass datacenter-IP 403s on ebay.fr.
curl_cffi>=0.7.0
beautifulsoup4>=4.12.0
Pillow>=10.0.0
supabase>=2.0.0
Expand Down
153 changes: 152 additions & 1 deletion api/routes/ebay_market_route.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
from __future__ import annotations

import logging
from typing import Annotated
from typing import Annotated, Any

from fastapi import APIRouter, Depends, HTTPException, Query, status
from pydantic import BaseModel, Field

from app_types.ebay_browse import (
ConditionFilter,
Expand All @@ -19,6 +20,10 @@
from services.ebay_app_oauth_service import ebay_app_oauth_configured
from services.ebay_browse_service import DEFAULT_LIMIT, MAX_LIMIT, browse_search
from services.ebay_price_aggregator_service import aggregate_prices, partition_outliers
from services.ebay_sold_scrape_rate_limit import acquire_sold_scrape_slot
from services.ebay_sold_scrape_service import ebay_fr_sold_search_url, scrape_sold_listings
from services.ebay_sold_top_service import aggregate_top_sold
from services.ebay_sold_top_worker import get_job, peek_items_sample, submit_job

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -159,3 +164,149 @@ async def search_market(
"total_matches": total,
"warnings": warnings,
}


@router.get("/sold-scrape", response_model=None)
async def sold_scrape_html(
    user: Annotated[User, Depends(get_current_user)],
    q: Annotated[str, Query(min_length=2, max_length=256)],
    window_hours: Annotated[float, Query(ge=1, le=720)] = 168,
    limit: Annotated[int, Query(ge=1, le=60)] = 50,
) -> dict[str, Any]:
    """
    **Completed listings** (sold) fetched from the **public eBay HTML search** —
    no Marketplace Insights OAuth required.

    The scrape may be rejected by bot protection (403); an optional
    ``EBAY_SOLD_SCRAPE_PROXY`` can be set in the server environment.
    Calls are rate-limited per user (default: one call every
    ``EBAY_SOLD_SCRAPE_MIN_INTERVAL_SECONDS``); a ``429`` with a
    ``Retry-After`` header is returned when the limit is hit.
    ``window_hours`` accepts up to ``720`` hours (30 days).
    """
    settings = get_settings()
    query = q.strip()
    # Page size used for the eBay search URL echoed back to the client: clamped to [10, 60].
    page_size = min(60, max(limit, 10))

    # When the top-sold worker already holds a fresh result for the same
    # (query, window), serve its items sample instead: this avoids an eBay
    # round-trip and does not consume the caller's rate-limit slot (useful
    # when the user switches from Top mode to List mode right after a search).
    sample = peek_items_sample(q=query, window_hours=window_hours)
    if sample is not None:
        return {
            "query": query,
            "window_hours": window_hours,
            "items": sample[:limit],
            "error": None,
            "ebay_sold_search_url": ebay_fr_sold_search_url(q=query, page_size=page_size),
            "source": "ebay_html_scrape_cached_from_top",
            "cached": True,
        }

    interval = settings.ebay_sold_scrape_min_interval_seconds
    retry_after = await acquire_sold_scrape_slot(user.id, interval)
    if retry_after > 0:
        message = (
            f"Rate limit: at most one eBay sold-search every {interval:g} s "
            f"(retry in {retry_after} s)."
        )
        raise HTTPException(
            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
            detail=message,
            headers={"Retry-After": str(retry_after)},
        )

    items, err = await scrape_sold_listings(
        q=query,
        window_hours=window_hours,
        limit=limit,
        app=settings,
    )
    return {
        "query": query,
        "window_hours": window_hours,
        "items": items,
        "error": err,
        "ebay_sold_search_url": ebay_fr_sold_search_url(q=query, page_size=page_size),
        "source": "ebay_html_scrape",
        "cached": False,
    }


class SoldTopSubmitBody(BaseModel):
    """Request body of ``POST /ebay/market/sold-top``, which schedules a background scrape."""

    # Search query, 2 to 256 characters as submitted.
    q: str = Field(min_length=2, max_length=256)
    # Look-back window in hours, from 1 up to 720 (30 days).
    window_hours: float = Field(ge=1, le=720, default=168)
    # The remaining fields are bounded here and forwarded unchanged to the worker's ``submit_job``.
    pages: int = Field(ge=1, le=20, default=10)
    scrape_limit: int = Field(ge=10, le=1000, default=600)
    top_limit: int = Field(ge=1, le=100, default=20)
    min_count: int = Field(ge=1, le=20, default=1)


@router.post("/sold-top", response_model=None, status_code=status.HTTP_202_ACCEPTED)
async def sold_top_submit(
    user: Annotated[User, Depends(get_current_user)],
    body: SoldTopSubmitBody,
) -> dict[str, Any]:
    """
    Submit a background top-sold scrape job and return its ``job_id``
    (to be polled via ``GET /ebay/market/sold-top/{job_id}``).

    If a fresh cached result (TTL 15 min) exists for the same parameters,
    the returned job is already in ``status="completed"`` with ``result``
    populated and no eBay scrape is triggered. The per-user rate limit is
    only checked when the job is not such a cache hit; a ``429`` with a
    ``Retry-After`` header is raised when the caller must wait.
    """
    settings = get_settings()
    query = body.q.strip()

    # NOTE(review): the job is submitted *before* the rate-limit slot is
    # checked, so a 429 response leaves the job created by ``submit_job`` in
    # place — confirm whether the worker starts scraping immediately.
    job = submit_job(
        user_id=user.id,
        q=query,
        window_hours=body.window_hours,
        pages=body.pages,
        scrape_limit=body.scrape_limit,
        top_limit=body.top_limit,
        min_count=body.min_count,
        app=settings,
    )

    # A job that comes back completed with a result was served from cache.
    cache_hit = job.status == "completed" and job.result is not None
    if not cache_hit:
        interval = settings.ebay_sold_scrape_min_interval_seconds
        retry_after = await acquire_sold_scrape_slot(user.id, interval)
        if retry_after > 0:
            message = (
                f"Rate limit: at most one eBay sold-search every {interval:g} s "
                f"(retry in {retry_after} s)."
            )
            raise HTTPException(
                status_code=status.HTTP_429_TOO_MANY_REQUESTS,
                detail=message,
                headers={"Retry-After": str(retry_after)},
            )

    payload: dict[str, Any] = dict(job.to_public())
    payload["ebay_sold_search_url"] = ebay_fr_sold_search_url(q=query, page_size=60)
    payload["cached"] = cache_hit
    return payload


@router.get("/sold-top/{job_id}", response_model=None)
async def sold_top_status(
    user: Annotated[User, Depends(get_current_user)],
    job_id: str,
) -> dict[str, Any]:
    """
    Report the current state of a ``sold-top`` job.

    Clients poll this endpoint while ``status`` is ``pending`` or ``running``
    and stop once it is ``completed`` (or ``failed``), at which point
    ``result`` is populated. Only the user who created a job may read it:
    an unknown or expired id yields ``404``, someone else's job yields ``403``.
    """
    job = get_job(job_id)

    if job is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Unknown or expired job.",
        )

    owned_by_caller = job.user_id == user.id
    if not owned_by_caller:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="This job does not belong to you.",
        )

    payload: dict[str, Any] = dict(job.to_public())
    payload["ebay_sold_search_url"] = ebay_fr_sold_search_url(q=job.q, page_size=60)
    return payload
138 changes: 138 additions & 0 deletions api/scripts/debug_ebay_scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""
One-shot diagnostic for ``ebay_sold_scrape_service``.

Fetches the same URL the production service uses, dumps the HTML to
``/tmp/ebay-sold.html``, and reports how many elements match candidate
selectors so we can pick the right one when eBay rotates its SRP layout.

Run from the ``api/`` directory:

python -m scripts.debug_ebay_scrape "carte pokemon"
"""

from __future__ import annotations

import asyncio
import sys
from pathlib import Path

from bs4 import BeautifulSoup

from services.ebay_sold_scrape_service import (
_parse_sold_rows,
fetch_sold_listings_html,
)

OUT = Path("/tmp/ebay-sold.html")

#: Candidate CSS selectors for search-result rows. ``main`` prints the match
#: count for every entry, so the order is informational only.
_CANDIDATE_SELECTORS = (
    "li.s-item",
    "ul.srp-results > li",
    ".srp-results .s-item",
    ".srp-results .s-item__wrapper",
    "li.s-item__pl-on-bottom",
    "[data-testid='srp-results'] li",
    "[data-view*='mi:1686'] li",
    "ul.b-list__items_nofooter li",
    "li[data-viewport]",
    "div.s-card",
)

#: Selectors that often hold the relative/absolute « sold » caption.
#: ``main`` tries them in order on each sampled row and reports the first match.
_CAPTION_SELECTORS = (
    ".s-item__caption--signal",
    ".s-item__title--tagblock",
    ".s-item__subtitle",
    ".s-card__caption",
    "[class*='caption']",
)


def _first_match(node, selectors):
    """Return ``(selector, element)`` for the first selector matching under *node*, else ``None``."""
    for candidate in selectors:
        found = node.select_one(candidate)
        if found:
            return candidate, found
    return None


async def main(q: str) -> None:
    """
    Fetch the sold-listings HTML for *q*, save it to ``OUT`` and print diagnostics.

    The report covers: page ``<title>``/``<h1>``, a consent-page heuristic,
    match counts for every candidate row selector, the output of the
    production parser, sample captions, and per-field selector probes on the
    first rows.
    """
    html = await fetch_sold_listings_html(q=q, page_size=50)
    OUT.write_text(html, encoding="utf-8")
    print(f"saved html ({len(html)} bytes) → {OUT}")

    soup = BeautifulSoup(html, "html.parser")

    title_tag = soup.select_one("title")
    title_text = title_tag.get_text(strip=True) if title_tag else "(none)"
    print(f"<title>: {title_text}")

    heading = soup.select_one("h1")
    heading_text = heading.get_text(" ", strip=True)[:120] if heading else "(none)"
    print(f"<h1>: {heading_text}")

    # Quick consent-page heuristic: look for CMP wording near the top of the document.
    consent_markers = ("consent", "consentement", "accepter", "vos choix")
    head_excerpt = html[:4000].lower()
    if any(marker in head_excerpt for marker in consent_markers):
        print("⚠️ consent-related token found in first 4 KB — possible CMP page")

    print("\n-- selector probe --")
    for sel in _CANDIDATE_SELECTORS:
        try:
            count = len(soup.select(sel))
        except Exception as exc:  # invalid selector etc.
            count = f"ERR({exc})"
        print(f" {sel:55s} → {count}")

    print("\n-- existing parser --")
    rows = _parse_sold_rows(html)
    print(f" _parse_sold_rows: {len(rows)} rows")
    for row in rows[:3]:
        print(f" title={row.title[:60]!r} caption={row.sold_caption!r} hours_ago={row.approx_hours_ago}")

    # Sample captions from whichever row selector yields results first.
    print("\n-- sample captions from first li.s-item or fallback --")
    sample_lis = (
        soup.select("li.s-item")
        or soup.select("li.s-item__pl-on-bottom")
        or soup.select("ul.srp-results > li")
    )
    for i, li in enumerate(sample_lis[:5]):
        hit = _first_match(li, _CAPTION_SELECTORS)
        if hit is None:
            print(f" li#{i} (no caption matched any selector)")
            continue
        csel, cap = hit
        print(f" li#{i} via {csel}: {cap.get_text(' ', strip=True)[:120]!r}")

    # Field probes on the first 2 LIs so we can pin down the new s-card selectors.
    field_probes: dict[str, tuple[str, ...]] = {
        "title": (
            ".s-card__title",
            ".s-card__title-link",
            "[role='heading']",
            "[role=heading]",
            "a .su-styled-text",
            ".s-item__title",
            ".s-item__title span",
        ),
        "price": (".s-card__price", ".s-item__price", "[class*='price']"),
        "link": ("a.su-link", "a[href*='/itm/']", "a.s-item__link"),
        "image": (
            ".s-card__image img",
            ".s-card__image-wrapper img",
            "img.s-item__image-img",
            ".image-treatment img",
            "img",
        ),
        "caption": _CAPTION_SELECTORS,
    }

    print("\n-- field selector probe (first 2 LIs) --")
    for i, li in enumerate(sample_lis[:2]):
        print(f"\n[li #{i}]")
        for field, sels in field_probes.items():
            hit = _first_match(li, sels)
            if hit is None:
                print(f" {field:7s} no match")
                continue
            s, el = hit
            if field == "link":
                snippet = (el.get("href") or "")[:120]
            elif field == "image":
                snippet = (el.get("src") or el.get("data-src") or "")[:120]
            else:
                snippet = el.get_text(" ", strip=True)[:120]
            print(f" {field:7s} via {s:35s} → {snippet!r}")
        # Also dump the start of the LI's outer HTML (300 chars) so its attributes are visible.
        outer = str(li)[:300].replace("\n", " ")
        print(f" outer[:300]: {outer}")


if __name__ == "__main__":
    # The first CLI argument is the search query; fall back to a sample query when absent.
    query = sys.argv[1] if sys.argv[1:] else "carte pokemon"
    asyncio.run(main(query))
11 changes: 6 additions & 5 deletions api/services/ebay_app_oauth_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,12 @@ async def _request_app_token(app: AppSettings) -> dict[str, Any]:
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.post(_token_url(app), data=data, headers=headers)
if resp.status_code >= 400:
logger.warning(
"eBay app token request failed: %s %s",
resp.status_code,
resp.text[:500],
)
if resp.status_code >= 500:
logger.warning(
"eBay app token request failed: %s %s",
resp.status_code,
resp.text[:500],
)
resp.raise_for_status()
return resp.json()

Expand Down
27 changes: 27 additions & 0 deletions api/services/ebay_sold_scrape_rate_limit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""In-memory per-user rate limit for eBay « vendus » HTML scrape (reduces bot flags on eBay)."""

from __future__ import annotations

import asyncio
import math
import time

# Serializes the check-and-update of ``_last`` across concurrent requests.
_lock = asyncio.Lock()
# Monotonic timestamp of the last *accepted* request, keyed by user id.
_last: dict[int, float] = {}


async def acquire_sold_scrape_slot(user_id: int, min_interval_sec: float) -> int:
    """
    Enforce at most one allowed request per ``min_interval_sec`` per user (monotonic clock).

    A rejected call does not move the user's timestamp, so retrying early
    never extends the wait.

    :param user_id: Identifier of the user making the request.
    :param min_interval_sec: Minimum spacing in seconds; ``<= 0`` disables the limit.
    :returns: ``0`` if the caller may proceed; else whole seconds to wait (for ``Retry-After``),
        rounded up so the advertised delay is never shorter than the real one.
    """
    if min_interval_sec <= 0:
        return 0
    async with _lock:
        # Read the clock only once the lock is held, so time spent waiting
        # for the lock cannot make the comparison use a stale timestamp.
        now = time.monotonic()
        last = _last.get(user_id)
        if last is not None:
            remaining = min_interval_sec - (now - last)
            if remaining > 0:
                # ``remaining`` is strictly positive here, so the ceiling is at least 1.
                return math.ceil(remaining)
        _last[user_id] = now
    return 0
Loading
Loading