Skip to content

Commit 78e18fb

Browse files
feat(ebay): async sold-top jobs with polling, shared sold-scrape cache, and UI refresh
1 parent 3843aa7 commit 78e18fb

12 files changed

Lines changed: 990 additions & 167 deletions

api/routes/ebay_market_route.py

Lines changed: 101 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Annotated, Any
77

88
from fastapi import APIRouter, Depends, HTTPException, Query, status
9+
from pydantic import BaseModel, Field
910

1011
from app_types.ebay_browse import (
1112
ConditionFilter,
@@ -22,6 +23,7 @@
2223
from services.ebay_sold_scrape_rate_limit import acquire_sold_scrape_slot
2324
from services.ebay_sold_scrape_service import ebay_fr_sold_search_url, scrape_sold_listings
2425
from services.ebay_sold_top_service import aggregate_top_sold
26+
from services.ebay_sold_top_worker import get_job, peek_items_sample, submit_job
2527

2628
logger = logging.getLogger(__name__)
2729

@@ -179,14 +181,33 @@ async def sold_scrape_html(
179181
Window goes up to ``720`` hours (30 days).
180182
"""
181183
app = get_settings()
184+
185+
# If the worker has a fresh cached top result for the same (q, window),
186+
# reuse its items_sample — saves an eBay roundtrip *and* the rate-limit
187+
# slot, which matters when the user just searched in Top mode and
188+
# switches to List mode.
189+
cached_sample = peek_items_sample(q=q.strip(), window_hours=window_hours)
190+
if cached_sample is not None:
191+
return {
192+
"query": q.strip(),
193+
"window_hours": window_hours,
194+
"items": cached_sample[:limit],
195+
"error": None,
196+
"ebay_sold_search_url": ebay_fr_sold_search_url(
197+
q=q.strip(), page_size=min(60, max(limit, 10)),
198+
),
199+
"source": "ebay_html_scrape_cached_from_top",
200+
"cached": True,
201+
}
202+
182203
retry_after = await acquire_sold_scrape_slot(user.id, app.ebay_sold_scrape_min_interval_seconds)
183204
if retry_after > 0:
184205
iv = app.ebay_sold_scrape_min_interval_seconds
185206
raise HTTPException(
186207
status_code=status.HTTP_429_TOO_MANY_REQUESTS,
187208
detail=(
188-
f"Limite : une recherche « vendus eBay » toutes les {iv:g} s "
189-
f"(réessayez dans {retry_after} s)."
209+
f"Rate limit: at most one eBay sold-search every {iv:g} s "
210+
f"(retry in {retry_after} s)."
190211
),
191212
headers={"Retry-After": str(retry_after)},
192213
)
@@ -198,64 +219,94 @@ async def sold_scrape_html(
198219
"error": err,
199220
"ebay_sold_search_url": ebay_fr_sold_search_url(q=q.strip(), page_size=min(60, max(limit, 10))),
200221
"source": "ebay_html_scrape",
222+
"cached": False,
201223
}
202224

203225

204-
@router.get("/sold-top", response_model=None)
205-
async def sold_top(
226+
class SoldTopSubmitBody(BaseModel):
    """Body for ``POST /ebay/market/sold-top`` — schedules a background scrape."""

    # Search text, 2 to 256 characters as received (length is validated on
    # the raw value, before any stripping done by the route).
    q: str = Field(min_length=2, max_length=256)
    # Look-back window in hours: 1 h up to 720 h, defaulting to 168 h.
    window_hours: float = Field(default=168, ge=1, le=720)
    # Number of result pages requested, 1 to 20 (default 10).
    pages: int = Field(default=10, ge=1, le=20)
    # Upper bound on scraped listings, 10 to 1000 (default 600).
    scrape_limit: int = Field(default=600, ge=10, le=1000)
    # Forwarded to the job as ``top_limit`` — presumably the maximum number
    # of aggregated entries returned per category; confirm against the worker.
    top_limit: int = Field(default=20, ge=1, le=100)
    # Forwarded to the job as ``min_count`` — presumably the minimum number
    # of sales for a group to be listed; confirm against the worker.
    min_count: int = Field(default=1, ge=1, le=20)
235+
236+
237+
@router.post("/sold-top", response_model=None, status_code=status.HTTP_202_ACCEPTED)
async def sold_top_submit(
    user: Annotated[User, Depends(get_current_user)],
    body: SoldTopSubmitBody,
) -> dict[str, Any]:
    """
    Queue a background top-sold scrape and return the job's public state.

    The response carries the job fields from ``job.to_public()`` (to be
    polled via ``GET /ebay/market/sold-top/{job_id}``), the matching eBay
    sold-search URL and a ``cached`` flag.

    ``cached`` is ``True`` when ``submit_job`` hands back a job that is
    already ``completed`` with a non-``None`` ``result``; in that case the
    per-user rate-limit slot is not requested at all.

    Raises:
        HTTPException: 429 with a ``Retry-After`` header when the job is not
            a cache hit and the user's scrape slot is not available yet.
    """
    settings = get_settings()
    query = body.q.strip()

    job = submit_job(
        user_id=user.id,
        q=query,
        window_hours=body.window_hours,
        pages=body.pages,
        scrape_limit=body.scrape_limit,
        top_limit=body.top_limit,
        min_count=body.min_count,
        app=settings,
    )

    # NOTE(review): the job is submitted *before* the rate-limit check below.
    # If ``submit_job`` starts the scrape right away, a throttled caller gets
    # a 429 while the scrape may still run in the background — confirm
    # against the worker implementation.
    served_from_cache = job.status == "completed" and job.result is not None
    if not served_from_cache:
        min_interval = settings.ebay_sold_scrape_min_interval_seconds
        retry_after = await acquire_sold_scrape_slot(user.id, min_interval)
        if retry_after > 0:
            message = (
                f"Rate limit: at most one eBay sold-search every {min_interval:g} s "
                f"(retry in {retry_after} s)."
            )
            raise HTTPException(
                status_code=status.HTTP_429_TOO_MANY_REQUESTS,
                detail=message,
                headers={"Retry-After": str(retry_after)},
            )

    return {
        **job.to_public(),
        "ebay_sold_search_url": ebay_fr_sold_search_url(q=query, page_size=60),
        "cached": served_from_cache,
    }
284+
285+
286+
@router.get("/sold-top/{job_id}", response_model=None)
async def sold_top_status(
    user: Annotated[User, Depends(get_current_user)],
    job_id: str,
) -> dict[str, Any]:
    """
    Report the current state of a previously submitted ``sold-top`` job.

    Intended to be polled by the client until the job reaches a terminal
    status. The payload is ``job.to_public()`` plus the eBay sold-search URL
    rebuilt from the job's own query.

    Raises:
        HTTPException: 404 when ``get_job`` knows no job under ``job_id``;
            403 when the job's ``user_id`` differs from the caller's.
    """
    job = get_job(job_id)

    if job is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Unknown or expired job.",
        )

    # Ownership check: only the submitting user may read the job.
    if job.user_id != user.id:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="This job does not belong to you.",
        )

    public_state = job.to_public()
    search_url = ebay_fr_sold_search_url(q=job.q, page_size=60)
    return {**public_state, "ebay_sold_search_url": search_url}

api/services/ebay_sold_scrape_service.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import time
2020
from dataclasses import dataclass
2121
from datetime import datetime, timezone
22-
from typing import Any
22+
from typing import Any, Awaitable, Callable
2323
from urllib.parse import urlencode
2424

2525
from bs4 import BeautifulSoup
@@ -477,6 +477,7 @@ async def scrape_sold_listings(
477477
limit: int = 50,
478478
pages: int = 1,
479479
app: AppSettings | None = None,
480+
on_page_done: Callable[[int, int], Awaitable[None]] | None = None,
480481
) -> tuple[list[dict[str, Any]], str | None]:
481482
"""
482483
Return ``(items_as_dicts, error_message_or_none)``.
@@ -487,9 +488,13 @@ async def scrape_sold_listings(
487488
along with the matching error message. A small politeness pause sits
488489
between pages.
489490
491+
``on_page_done`` is awaited after each successfully parsed page with
492+
``(page_num, total_unique_observed_so_far)`` — useful for surfacing
493+
progress to a long-running job consumer.
494+
490495
On success ``error_message_or_none`` is ``None``.
491496
"""
492-
pages_total = max(1, min(int(pages), 5))
497+
pages_total = max(1, min(int(pages), 20))
493498
page_size = min(60, max(limit, 10))
494499

495500
raw_rows: list[Any] = []
@@ -537,6 +542,12 @@ async def scrape_sold_listings(
537542
raw_rows.append(r)
538543
new_in_page += 1
539544

545+
if on_page_done is not None:
546+
try:
547+
await on_page_done(page_num, len(raw_rows))
548+
except Exception:
549+
logger.exception("on_page_done callback raised — ignoring")
550+
540551
# eBay quietly stops returning new listings past the available pages —
541552
# break early once a page yields nothing new.
542553
if new_in_page == 0 and page_num > 1:

api/services/ebay_sold_top_service.py

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,7 @@
5454

5555
#: Strong sealed-product signals (matched on the diacritic-stripped lowercased
5656
#: title). Order does not matter — first hit classifies the listing.
57-
_SEALED_SIGNALS: tuple[str, ...] = (
58-
"scelle",
59-
"sealed",
57+
_SEALED_STRONG_SIGNALS: tuple[str, ...] = (
6058
"etb",
6159
"elite trainer",
6260
"trainer box",
@@ -83,6 +81,20 @@
8381
"pokebox",
8482
)
8583

84+
#: Weak sealed hints often used for single cards in blister/sleeve.
#: We only use these when no card-level hint is detected.
_SEALED_WEAK_SIGNALS: tuple[str, ...] = (
    "scelle",
    "sealed",
)

#: Single-card set/promo codes commonly seen in French listings
#: (e.g. "SWSH291", "SVP 052", "MEP031", "TG07").
#: A prefix is followed by optional whitespace, an optional hyphen and one to
#: three digits, all delimited by word boundaries. Each prefix appears exactly
#: once; ``swsh`` is listed before its shorter sibling ``sw`` for readability
#: (the digits requirement makes the order irrelevant to the match result).
_CARD_CODE_RX = re.compile(
    r"\b(?:svp|swsh|tg|gg|mep|sm|xy|bw|dp|hgss|sw)\s*-?\s*\d{1,3}\b",
    re.IGNORECASE,
)
97+
8698
_STOPWORDS: frozenset[str] = frozenset(
8799
{
88100
"pokemon",
@@ -175,28 +187,43 @@ def _significant_tokens(title_norm: str) -> list[str]:
175187
return out
176188

177189

178-
def _classify(
    title_norm: str,
    *,
    has_grade: bool,
    has_card_number: bool,
    has_card_code: bool,
) -> Category:
    """
    Bucket a listing as ``"graded"``, ``"sealed"`` or ``"cards"``.

    Rules are evaluated in order and the first match wins:

    1. ``has_grade`` — always ``"graded"``, even when a sealed-product
       keyword is also present.
    2. ``has_card_number`` or ``has_card_code`` — single-card hints
       (``12/102``, ``SWSH291``, ``SVP052``, ``TG07``, …) give ``"cards"``;
       a « scellé(e) » mention on such a listing is read as *the card is
       sealed in plastic*, not as an unopened product.
    3. Any ``_SEALED_STRONG_SIGNALS`` substring in ``title_norm`` —
       ``"sealed"``.
    4. Any ``_SEALED_WEAK_SIGNALS`` substring — ``"cards"`` when the title
       also contains ``"promo"`` or matches ``_CARD_CODE_RX``, otherwise
       ``"sealed"``.
    5. Nothing matched — ``"cards"``.

    ``title_norm`` is matched by plain substring search, so it is expected
    to be already normalized the same way as the signal tuples.
    """
    if has_grade:
        return "graded"

    if has_card_number or has_card_code:
        return "cards"

    if any(signal in title_norm for signal in _SEALED_STRONG_SIGNALS):
        return "sealed"

    if not any(signal in title_norm for signal in _SEALED_WEAK_SIGNALS):
        return "cards"

    # Only a weak hint ("scelle"/"sealed") is present: keep the listing as a
    # card when something still points at a single card.
    # NOTE(review): the regex re-scan looks redundant when the caller derived
    # ``has_card_code`` from this same title — kept to preserve behavior.
    looks_like_single_card = (
        "promo" in title_norm or _CARD_CODE_RX.search(title_norm) is not None
    )
    return "cards" if looks_like_single_card else "sealed"
201228

202229

@@ -213,10 +240,12 @@ def _build_fingerprint(title: str) -> tuple[str, str | None, Category]:
213240
norm = _GRADE_RX.sub(" ", norm)
214241

215242
card_match = _CARD_NUMBER_RX.search(norm)
243+
has_card_code = _CARD_CODE_RX.search(norm) is not None
216244
category = _classify(
217245
norm,
218246
has_grade=grade is not None,
219247
has_card_number=card_match is not None,
248+
has_card_code=has_card_code,
220249
)
221250
tokens = _significant_tokens(norm)
222251

0 commit comments

Comments
 (0)