Skip to content

Commit 1a9bb02

Browse files
committed
Merge remote-tracking branch 'origin/feature/ebay-item-sales-trends-test'
2 parents 7b2b94d + e5855e7 commit 1a9bb02

23 files changed

Lines changed: 2963 additions & 10 deletions

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,7 @@ api/.env
77
web/.env
88
web/goupixdex.key
99
web/goupixdex.key.pub
10-
integrations
10+
integrations
11+
12+
# Local Cursor / agent guidelines — not versioned
13+
CLAUDE.md

api/.env.example

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,7 @@ CORS_ORIGINS=*
2727
# EBAY_CLIENT_SECRET=
2828
# EBAY_REDIRECT_URI=https://votre-frontend/settings/marketplaces
2929
# EBAY_USE_SANDBOX=true
30+
# Optional HTTP proxy for the « vendus » (sold listings) HTML scrape, in case eBay returns 403 to requests coming from the server.
31+
# EBAY_SOLD_SCRAPE_PROXY=http://127.0.0.1:8888
32+
# Seconds between two « sold-scrape » requests per user (eBay anti-bot / burst traffic).
33+
# EBAY_SOLD_SCRAPE_MIN_INTERVAL_SECONDS=60

api/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@ class AppSettings(BaseSettings):
7979
ebay_redirect_uri: str | None = None
8080
#: Use sandbox API hosts (``auth.sandbox.ebay.com``, ``api.sandbox.ebay.com``).
8181
ebay_use_sandbox: bool = True
82+
#: Optional HTTP(S) proxy for fetching eBay « vendus » HTML (datacenter IPs are often blocked).
83+
ebay_sold_scrape_proxy: str | None = None
84+
#: Min seconds between two « sold-scrape » calls **per user** (limits burst traffic to eBay).
85+
ebay_sold_scrape_min_interval_seconds: float = Field(default=60.0, ge=0, le=3600)
8286

8387

8488
@lru_cache

api/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ python-jose[cryptography]>=3.3.0
1919
nodriver>=0.48.0
2020
python-dotenv>=1.0.0
2121
httpx>=0.27.0
22+
# curl_cffi: HTTP client with TLS/JA3 fingerprint impersonation (Chrome/Firefox).
23+
# Used by ebay_sold_scrape_service to bypass datacenter-IP 403s on ebay.fr.
24+
curl_cffi>=0.7.0
2225
beautifulsoup4>=4.12.0
2326
Pillow>=10.0.0
2427
supabase>=2.0.0

api/routes/ebay_market_route.py

Lines changed: 152 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
from __future__ import annotations
44

55
import logging
6-
from typing import Annotated
6+
from typing import Annotated, Any
77

88
from fastapi import APIRouter, Depends, HTTPException, Query, status
9+
from pydantic import BaseModel, Field
910

1011
from app_types.ebay_browse import (
1112
ConditionFilter,
@@ -19,6 +20,10 @@
1920
from services.ebay_app_oauth_service import ebay_app_oauth_configured
2021
from services.ebay_browse_service import DEFAULT_LIMIT, MAX_LIMIT, browse_search
2122
from services.ebay_price_aggregator_service import aggregate_prices, partition_outliers
23+
from services.ebay_sold_scrape_rate_limit import acquire_sold_scrape_slot
24+
from services.ebay_sold_scrape_service import ebay_fr_sold_search_url, scrape_sold_listings
25+
from services.ebay_sold_top_service import aggregate_top_sold
26+
from services.ebay_sold_top_worker import get_job, peek_items_sample, submit_job
2227

2328
logger = logging.getLogger(__name__)
2429

@@ -159,3 +164,149 @@ async def search_market(
159164
"total_matches": total,
160165
"warnings": warnings,
161166
}
167+
168+
169+
@router.get("/sold-scrape", response_model=None)
async def sold_scrape_html(
    user: Annotated[User, Depends(get_current_user)],
    q: Annotated[str, Query(min_length=2, max_length=256)],
    window_hours: Annotated[float, Query(ge=1, le=720)] = 168,
    limit: Annotated[int, Query(ge=1, le=60)] = 50,
) -> dict[str, Any]:
    """
    **Completed listings** (sold) via **public eBay HTML search** — no Marketplace Insights OAuth.

    May fail with bot protection (403); optional ``EBAY_SOLD_SCRAPE_PROXY`` in server env.
    Rate-limited per user (default: one call every ``EBAY_SOLD_SCRAPE_MIN_INTERVAL_SECONDS``).
    Window goes up to ``720`` hours (30 days).

    The query is trimmed before use; a query shorter than 2 characters once
    trimmed is rejected with ``422``. When the rate limit is hit the response
    is ``429`` with a ``Retry-After`` header (whole seconds).
    """
    query = q.strip()
    # ``min_length=2`` is enforced on the raw value, so a whitespace-padded
    # input (e.g. ``"  "``) would otherwise pass validation, consume the
    # caller's rate-limit slot and hit eBay with an empty keyword.
    if len(query) < 2:
        raise HTTPException(
            status_code=422,
            detail="Query must be at least 2 characters long once surrounding whitespace is removed.",
        )

    app = get_settings()
    page_size = min(60, max(limit, 10))

    def _payload(items: Any, error: Any, *, source: str, cached: bool) -> dict[str, Any]:
        # Single place that shapes the response so both paths stay in sync.
        return {
            "query": query,
            "window_hours": window_hours,
            "items": items,
            "error": error,
            "ebay_sold_search_url": ebay_fr_sold_search_url(q=query, page_size=page_size),
            "source": source,
            "cached": cached,
        }

    # If the worker has a fresh cached top result for the same (q, window),
    # reuse its items_sample — saves an eBay roundtrip *and* the rate-limit
    # slot, which matters when the user just searched in Top mode and
    # switches to List mode.
    cached_sample = peek_items_sample(q=query, window_hours=window_hours)
    if cached_sample is not None:
        return _payload(
            cached_sample[:limit],
            None,
            source="ebay_html_scrape_cached_from_top",
            cached=True,
        )

    # The slot is only requested once we know a real scrape is needed.
    retry_after = await acquire_sold_scrape_slot(user.id, app.ebay_sold_scrape_min_interval_seconds)
    if retry_after > 0:
        iv = app.ebay_sold_scrape_min_interval_seconds
        raise HTTPException(
            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
            detail=(
                f"Rate limit: at most one eBay sold-search every {iv:g} s "
                f"(retry in {retry_after} s)."
            ),
            headers={"Retry-After": str(retry_after)},
        )

    items, err = await scrape_sold_listings(q=query, window_hours=window_hours, limit=limit, app=app)
    return _payload(items, err, source="ebay_html_scrape", cached=False)
224+
225+
226+
class SoldTopSubmitBody(BaseModel):
    """Body for ``POST /ebay/market/sold-top`` — schedules a background scrape."""

    # Search keywords, 2–256 characters. The length is validated on the raw
    # value; the route strips surrounding whitespace afterwards.
    q: str = Field(min_length=2, max_length=256)
    # Look-back window in hours: 1–720, default 168.
    window_hours: float = Field(default=168, ge=1, le=720)
    # 1–20, default 10. Presumably the number of eBay result pages to fetch —
    # TODO confirm against the worker.
    pages: int = Field(default=10, ge=1, le=20)
    # 10–1000, default 600. Presumably the cap on scraped listings before
    # aggregation — TODO confirm against the worker.
    scrape_limit: int = Field(default=600, ge=10, le=1000)
    # 1–100, default 20. Presumably the number of top entries returned —
    # TODO confirm against the worker.
    top_limit: int = Field(default=20, ge=1, le=100)
    # 1–20, default 1. Presumably the minimum number of sales for an entry to
    # be listed — TODO confirm against the worker.
    min_count: int = Field(default=1, ge=1, le=20)
235+
236+
237+
@router.post("/sold-top", response_model=None, status_code=status.HTTP_202_ACCEPTED)
async def sold_top_submit(
    user: Annotated[User, Depends(get_current_user)],
    body: SoldTopSubmitBody,
) -> dict[str, Any]:
    """
    Queue a background top-sold scrape and return the job description,
    including the ``job_id`` to poll via ``GET /ebay/market/sold-top/{job_id}``.

    If a fresh cached result (TTL 15 min) already exists for the same
    parameters, the returned job is already ``status="completed"`` with
    ``result`` filled in and no eBay scrape is triggered. The per-user
    rate limit is only consulted when the job is not such a cache hit.
    """
    settings = get_settings()
    query = body.q.strip()

    # NOTE(review): the job is submitted *before* the rate-limit check below.
    # If ``submit_job`` already starts the background scrape, a 429 response
    # would not stop it — confirm against ``ebay_sold_top_worker``.
    job = submit_job(
        user_id=user.id,
        q=query,
        window_hours=body.window_hours,
        pages=body.pages,
        scrape_limit=body.scrape_limit,
        top_limit=body.top_limit,
        min_count=body.min_count,
        app=settings,
    )

    served_from_cache = job.status == "completed" and job.result is not None
    if not served_from_cache:
        min_interval = settings.ebay_sold_scrape_min_interval_seconds
        wait_seconds = await acquire_sold_scrape_slot(user.id, min_interval)
        if wait_seconds > 0:
            message = (
                f"Rate limit: at most one eBay sold-search every {min_interval:g} s "
                f"(retry in {wait_seconds} s)."
            )
            raise HTTPException(
                status_code=status.HTTP_429_TOO_MANY_REQUESTS,
                detail=message,
                headers={"Retry-After": str(wait_seconds)},
            )

    # Start from the public job view, then attach the route-level extras.
    response = dict(job.to_public())
    response["ebay_sold_search_url"] = ebay_fr_sold_search_url(q=query, page_size=60)
    response["cached"] = served_from_cache
    return response
284+
285+
286+
@router.get("/sold-top/{job_id}", response_model=None)
async def sold_top_status(
    user: Annotated[User, Depends(get_current_user)],
    job_id: str,
) -> dict[str, Any]:
    """
    Report the current state of a ``sold-top`` job.

    Clients poll this endpoint while ``status`` is ``pending`` or
    ``running``; once it is ``completed`` (or ``failed``), ``result`` is
    populated and polling can stop. Only the user who created the job may
    read it: unknown or expired ids yield ``404``, someone else's job ``403``.
    """
    job = get_job(job_id)

    # Work out the refusal (if any) first, then raise from a single place.
    if job is None:
        refusal = (status.HTTP_404_NOT_FOUND, "Unknown or expired job.")
    elif job.user_id != user.id:
        refusal = (status.HTTP_403_FORBIDDEN, "This job does not belong to you.")
    else:
        refusal = None

    if refusal is not None:
        code, message = refusal
        raise HTTPException(status_code=code, detail=message)

    response = dict(job.to_public())
    response["ebay_sold_search_url"] = ebay_fr_sold_search_url(q=job.q, page_size=60)
    return response

api/scripts/debug_ebay_scrape.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
"""
2+
One-shot diagnostic for ``ebay_sold_scrape_service``.
3+
4+
Fetches the same URL the production service uses, dumps the HTML to
5+
``/tmp/ebay-sold.html``, and reports how many elements match candidate
6+
selectors so we can pick the right one when eBay rotates its SRP layout.
7+
8+
Run from the ``api/`` directory:
9+
10+
python -m scripts.debug_ebay_scrape "carte pokemon"
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import asyncio
16+
import sys
17+
from pathlib import Path
18+
19+
from bs4 import BeautifulSoup
20+
21+
from services.ebay_sold_scrape_service import (
22+
_parse_sold_rows,
23+
fetch_sold_listings_html,
24+
)
25+
26+
#: File the fetched HTML is written to, so it can be inspected by hand.
OUT = Path("/tmp/ebay-sold.html")

#: Candidate selectors to probe. Order is informational; we report counts for all.
_CANDIDATE_SELECTORS = (
    "li.s-item",
    "ul.srp-results > li",
    ".srp-results .s-item",
    ".srp-results .s-item__wrapper",
    "li.s-item__pl-on-bottom",
    "[data-testid='srp-results'] li",
    "[data-view*='mi:1686'] li",
    "ul.b-list__items_nofooter li",
    "li[data-viewport]",
    "div.s-card",
)

#: Selectors that often hold the relative/absolute « sold » caption.
#: They are tried in order; the first one that matches is reported.
_CAPTION_SELECTORS = (
    ".s-item__caption--signal",
    ".s-item__title--tagblock",
    ".s-item__subtitle",
    ".s-card__caption",
    "[class*='caption']",
)
50+
51+
52+
def _first_match(node, selectors):
    """Return ``(selector, element)`` for the first selector that matches under *node*, else ``None``."""
    for selector in selectors:
        element = node.select_one(selector)
        if element:
            return selector, element
    return None


def _print_page_summary(soup: "BeautifulSoup", html: str) -> None:
    """Print the page ``<title>``/``<h1>`` and warn when the page looks like a consent wall."""
    title = soup.select_one("title")
    print(f"<title>: {title.get_text(strip=True) if title else '(none)'}")

    h1 = soup.select_one("h1")
    print(f"<h1>: {h1.get_text(' ', strip=True)[:120] if h1 else '(none)'}")

    # Quick consent-page heuristic: only the first 4000 characters are scanned.
    consent_markers = ("consent", "consentement", "accepter", "vos choix")
    head_excerpt = html[:4000].lower()
    if any(tok in head_excerpt for tok in consent_markers):
        print("⚠️ consent-related token found in first 4 KB — possible CMP page")


def _print_selector_counts(soup: "BeautifulSoup") -> None:
    """Print how many elements each candidate result-row selector matches."""
    print("\n-- selector probe --")
    for sel in _CANDIDATE_SELECTORS:
        try:
            n = len(soup.select(sel))
        except Exception as exc:  # invalid selector etc. — report it and keep probing
            n = f"ERR({exc})"
        print(f" {sel:55s}{n}")


def _print_parser_rows(html: str) -> None:
    """Run the production row parser on *html* and print the row count plus the first 3 rows."""
    print("\n-- existing parser --")
    rows = _parse_sold_rows(html)
    print(f" _parse_sold_rows: {len(rows)} rows")
    for r in rows[:3]:
        print(f" title={r.title[:60]!r} caption={r.sold_caption!r} hours_ago={r.approx_hours_ago}")


def _print_sample_captions(sample_lis) -> None:
    """Print the first matching caption text for each of the first 5 result rows."""
    for i, li in enumerate(sample_lis[:5]):
        hit = _first_match(li, _CAPTION_SELECTORS)
        if hit is None:
            print(f" li#{i} (no caption matched any selector)")
            continue
        csel, cap = hit
        print(f" li#{i} via {csel}: {cap.get_text(' ', strip=True)[:120]!r}")


def _print_field_probes(sample_lis) -> None:
    """Print, for the first 2 result rows, which selector yields each field and a snippet of it."""
    # Field probes on the first 2 LIs so we can pin down the new s-card selectors
    field_probes: dict[str, tuple[str, ...]] = {
        "title": (
            ".s-card__title", ".s-card__title-link",
            "[role='heading']", "[role=heading]",
            "a .su-styled-text", ".s-item__title", ".s-item__title span",
        ),
        "price": (".s-card__price", ".s-item__price", "[class*='price']"),
        "link": ("a.su-link", "a[href*='/itm/']", "a.s-item__link"),
        "image": (
            ".s-card__image img", ".s-card__image-wrapper img",
            "img.s-item__image-img", ".image-treatment img", "img",
        ),
        "caption": _CAPTION_SELECTORS,
    }

    print("\n-- field selector probe (first 2 LIs) --")
    for i, li in enumerate(sample_lis[:2]):
        print(f"\n[li #{i}]")
        for field, sels in field_probes.items():
            hit = _first_match(li, sels)
            if hit is None:
                print(f" {field:7s} no match")
                continue
            s, el = hit
            if field == "link":
                snippet = (el.get("href") or "")[:120]
            elif field == "image":
                snippet = (el.get("src") or el.get("data-src") or "")[:120]
            else:
                snippet = el.get_text(" ", strip=True)[:120]
            print(f" {field:7s} via {s:35s}{snippet!r}")
        # Also dump the head of the LI's outer HTML (300 chars) so we can see attributes
        outer = str(li)[:300].replace("\n", " ")
        print(f" outer[:300]: {outer}")


async def main(q: str) -> None:
    """
    Fetch the sold-listings page for *q*, save it to ``OUT`` and print a selector report.

    The report covers, in order: page title/heading and a consent-page
    heuristic, match counts for every candidate row selector, the output of
    the production parser, sample captions, and per-field selector probes.

    :param q: search keywords passed to ``fetch_sold_listings_html``.
    """
    html = await fetch_sold_listings_html(q=q, page_size=50)
    OUT.write_text(html, encoding="utf-8")
    print(f"saved html ({len(html)} bytes) → {OUT}")

    soup = BeautifulSoup(html, "html.parser")

    _print_page_summary(soup, html)
    _print_selector_counts(soup)
    _print_parser_rows(html)

    # If selector probe found something useful, sample captions
    print("\n-- sample captions from first li.s-item or fallback --")
    sample_lis = (
        soup.select("li.s-item")
        or soup.select("li.s-item__pl-on-bottom")
        or soup.select("ul.srp-results > li")
    )
    _print_sample_captions(sample_lis)
    _print_field_probes(sample_lis)
134+
135+
136+
if __name__ == "__main__":
    # The first CLI argument is the search query; without one, a sample query is used.
    query = sys.argv[1] if len(sys.argv) > 1 else "carte pokemon"
    asyncio.run(main(query))

api/services/ebay_app_oauth_service.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,12 @@ async def _request_app_token(app: AppSettings) -> dict[str, Any]:
5555
async with httpx.AsyncClient(timeout=30.0) as client:
5656
resp = await client.post(_token_url(app), data=data, headers=headers)
5757
if resp.status_code >= 400:
58-
logger.warning(
59-
"eBay app token request failed: %s %s",
60-
resp.status_code,
61-
resp.text[:500],
62-
)
58+
if resp.status_code >= 500:
59+
logger.warning(
60+
"eBay app token request failed: %s %s",
61+
resp.status_code,
62+
resp.text[:500],
63+
)
6364
resp.raise_for_status()
6465
return resp.json()
6566

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
"""In-memory per-user rate limit for eBay « vendus » HTML scrape (reduces bot flags on eBay)."""
2+
3+
from __future__ import annotations
4+
5+
import asyncio
6+
import time
7+
8+
#: Serialises the read-check-write on ``_last``.
_lock = asyncio.Lock()
#: ``time.monotonic()`` reading of the last *accepted* call, keyed by user id.
#: Lives in this module only and entries are never evicted.
_last: dict[int, float] = {}


async def acquire_sold_scrape_slot(user_id: int, min_interval_sec: float) -> int:
    """
    Enforce at most one allowed request per ``min_interval_sec`` per user (monotonic clock).

    A rejected call does not refresh the stored timestamp, so retrying early
    never pushes the next allowed time further away.

    :param user_id: key identifying the caller.
    :param min_interval_sec: minimum spacing in seconds; ``<= 0`` disables the limit.
    :returns: ``0`` if the caller may proceed; else whole seconds to wait (for ``Retry-After``),
        always rounded **up** so that waiting exactly that long is sufficient.
    """
    import math  # function-scope import: this module does not import ``math`` at the top

    if min_interval_sec <= 0:
        return 0
    async with _lock:
        # Read the clock only once the lock is held, so time spent waiting
        # for the lock cannot produce a stale timestamp.
        now = time.monotonic()
        last = _last.get(user_id)
        if last is not None and (now - last) < min_interval_sec:
            wait = min_interval_sec - (now - last)
            # ``math.ceil`` instead of ``int(wait + 0.999)``: the latter rounds
            # *down* when the fractional part is below 1 ms (e.g. 2.0004 -> 2),
            # telling the client to retry slightly too early.
            return max(1, math.ceil(wait))
        _last[user_id] = now
        return 0

0 commit comments

Comments
 (0)