Skip to content

Commit d4e5053

Browse files
authored
restart notify, probe_success_rate, diff sort (#44)
* seed/restart notify, probe_success_rate, diff sort * fixed lint errors * addressed ai review
1 parent dc319cd commit d4e5053

4 files changed

Lines changed: 148 additions & 18 deletions

File tree

src/paperscout/__main__.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,13 +143,15 @@ def _pool_status(p) -> dict:
143143
def _extra_health_fields() -> dict:
144144
lsp = scheduler._last_successful_poll
145145
s = scheduler._last_probe_stats
146-
total = sum(s.get(k, 0) for k in ("hit_recent", "hit_old", "hit_no_lm", "miss", "error"))
147-
hit_rate = (s.get("hit_recent", 0) + s.get("hit_old", 0)) / total if total > 0 else None
146+
# HTTP 200 outcomes / non-skipped probe attempts (excludes skipped_discovered, skipped_in_index).
147+
hits = s.get("hit_recent", 0) + s.get("hit_old", 0) + s.get("hit_no_lm", 0)
148+
attempted = hits + s.get("miss", 0) + s.get("error", 0)
149+
probe_success_rate = hits / attempted if attempted > 0 else None
148150
return {
149151
"last_successful_poll": (
150152
datetime.fromtimestamp(lsp, tz=timezone.utc).isoformat() if lsp else None
151153
),
152-
"probe_hit_rate": hit_rate,
154+
"probe_success_rate": probe_success_rate,
153155
"mq_depth": mq.depth(),
154156
"db_pool": _pool_status(pool),
155157
}

src/paperscout/monitor.py

Lines changed: 66 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,11 @@ def diff_snapshots(
5454
):
5555
updated_papers.append(paper)
5656

57-
new_papers.sort(key=lambda p: p.date or "", reverse=True)
57+
def _paper_sort_key(p: Paper) -> tuple[str, str]:
58+
return (p.date or "", p.id)
59+
60+
new_papers.sort(key=_paper_sort_key, reverse=True)
61+
updated_papers.sort(key=_paper_sort_key, reverse=True)
5862
return DiffResult(new_papers=new_papers, updated_papers=updated_papers)
5963

6064

@@ -71,6 +75,14 @@ class DPTransition:
7175
discovered_at: float
7276

7377

78+
@dataclass(slots=True)
79+
class SeedResult:
80+
"""Outcome of ``seed()``: probe hits from the seed cycle and whether DB had prior state."""
81+
82+
probe_hits: list[ProbeHit]
83+
had_prior_state: bool
84+
85+
7486
class PollResult:
7587
"""Outcome of one poll: index diff, probe hits, D→P transitions, per-user matches."""
7688

@@ -117,8 +129,13 @@ def __init__(
117129
self._last_probe_stats: dict[str, int] = {}
118130
self._last_ops_alert: float | None = None
119131

120-
async def seed(self) -> None:
121-
"""First-run: gather all current papers from all sources without notifying."""
132+
async def seed(self) -> SeedResult:
133+
"""Gather current index and probe state.
134+
135+
Cold first deploy: no notifications from seed. On restart (prior poll or
136+
discovered URLs), ``poll_once`` may notify for recent probe hits from this seed cycle.
137+
"""
138+
had_prior_state = self.state.last_poll > 0 or len(self.state.get_all_discovered()) > 0
122139
t0 = time.monotonic()
123140
log.info("SEED-START seeding local database from all sources")
124141

@@ -128,19 +145,20 @@ async def seed(self) -> None:
128145

129146
self._previous_papers = dict(self.index.papers)
130147

148+
hits: list[ProbeHit] = []
131149
if self.cfg.enable_iso_probe:
132150
hits = await self.prober.run_cycle()
133-
for hit in hits:
134-
self.state.mark_discovered(hit.url)
135151
log.info("SEED isocpp.org probe existing=%d", len(hits))
136152

137153
self._seeded = True
138154
log.info(
139-
"SEED-DONE elapsed=%.1fs papers=%d discovered=%d",
155+
"SEED-DONE elapsed=%.1fs papers=%d discovered=%d had_prior_state=%s",
140156
time.monotonic() - t0,
141157
len(self._previous_papers),
142158
len(self.state.get_all_discovered()),
159+
had_prior_state,
143160
)
161+
return SeedResult(probe_hits=hits, had_prior_state=had_prior_state)
144162

145163
async def poll_once(self) -> PollResult:
146164
"""Refresh index (if enabled), diff, probe isocpp, compute matches, notify."""
@@ -149,13 +167,50 @@ async def poll_once(self) -> PollResult:
149167
log.info("POLL-START poll=%d", self._poll_count)
150168

151169
if not self._seeded:
152-
await self.seed()
153-
self._last_successful_poll = time.time()
154-
self._last_probe_stats = self.prober.snapshot_stats()
155-
return PollResult(
170+
seed_result = await self.seed()
171+
if not seed_result.had_prior_state:
172+
self._last_successful_poll = time.time()
173+
self._last_probe_stats = self.prober.snapshot_stats()
174+
return PollResult(
175+
diff=DiffResult(new_papers=[], updated_papers=[]),
176+
probe_hits=[],
177+
)
178+
179+
probe_hits = seed_result.probe_hits
180+
recent_hits = [h for h in probe_hits if h.is_recent]
181+
old_hits = [h for h in probe_hits if not h.is_recent]
182+
if old_hits:
183+
log.info(
184+
"PROBE-OLD %d hits with Last-Modified outside %dh window "
185+
"(recorded to discovered, no alert)",
186+
len(old_hits),
187+
self.cfg.alert_modified_hours,
188+
)
189+
190+
per_user_matches = await run_blocking_io(
191+
self.user_watchlist.matches_for_users,
192+
[],
193+
recent_hits,
194+
)
195+
for uid, m in per_user_matches.items():
196+
log.info(
197+
"WATCHLIST-MATCH user=%s papers=%d probe_hits=%d",
198+
uid,
199+
len(m.papers),
200+
len(m.probe_hits),
201+
)
202+
203+
result = PollResult(
156204
diff=DiffResult(new_papers=[], updated_papers=[]),
157-
probe_hits=[],
205+
probe_hits=recent_hits,
206+
dp_transitions=[],
207+
per_user_matches=per_user_matches,
158208
)
209+
if self.notify_callback:
210+
self.notify_callback(result)
211+
self._last_successful_poll = time.time()
212+
self._last_probe_stats = self.prober.snapshot_stats()
213+
return result
159214

160215
previous = dict(self._previous_papers)
161216

tests/test_health.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def health_url_with_extras():
4040
lambda: 42,
4141
extra_fields_fn=lambda: {
4242
"last_successful_poll": "2026-03-16T12:00:00+00:00",
43-
"probe_hit_rate": 0.5,
43+
"probe_success_rate": 0.5,
4444
"mq_depth": 3,
4545
"db_pool": {"max": 10, "in_use": 1, "available": 9},
4646
},
@@ -108,6 +108,6 @@ def test_health_extra_fields_merged(self, health_url_with_extras):
108108
assert "version" in data
109109
assert "last_successful_poll" in data
110110
assert data["last_successful_poll"] == "2026-03-16T12:00:00+00:00"
111-
assert data["probe_hit_rate"] == 0.5
111+
assert data["probe_success_rate"] == 0.5
112112
assert data["mq_depth"] == 3
113113
assert data["db_pool"] == {"max": 10, "in_use": 1, "available": 9}

tests/test_monitor.py

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,21 @@ def test_new_papers_sorted_by_date_descending(self):
108108
dates = [p.date for p in result.new_papers]
109109
assert dates == sorted(dates, reverse=True)
110110

111+
def test_updated_papers_sorted_by_date_descending(self):
112+
prev = {
113+
"P2300R10": self._paper("P2300R10", title="Old A", date="2024-01-01"),
114+
"P2301R0": self._paper("P2301R0", title="Old B", date="2024-03-01"),
115+
"P2302R0": self._paper("P2302R0", title="Old C", date="2024-06-01"),
116+
}
117+
curr = {
118+
"P2300R10": self._paper("P2300R10", title="New A", date="2024-01-01"),
119+
"P2301R0": self._paper("P2301R0", title="New B", date="2024-06-01"),
120+
"P2302R0": self._paper("P2302R0", title="New C", date="2024-03-01"),
121+
}
122+
result = diff_snapshots(prev, curr)
123+
dates = [p.date for p in result.updated_papers]
124+
assert dates == sorted(dates, reverse=True)
125+
111126
def test_empty_to_empty(self):
112127
result = diff_snapshots({}, {})
113128
assert result.new_papers == [] and result.updated_papers == []
@@ -168,6 +183,7 @@ def _make_scheduler(fake_pool, **cfg_overrides):
168183
index.papers = {}
169184
prober = MagicMock(spec=ISOProber)
170185
prober.run_cycle = AsyncMock(return_value=[])
186+
prober.snapshot_stats = MagicMock(return_value={})
171187
prober._stats = {}
172188
user_watchlist = MagicMock(spec=UserWatchlist)
173189
user_watchlist.matches_for_users.return_value = {}
@@ -351,6 +367,57 @@ async def test_poll_once_calls_notify_callback(self, fake_pool):
351367
await scheduler.poll_once() # real poll
352368
assert len(notified) == 1
353369

370+
async def test_cold_start_first_poll_does_not_notify(self, fake_pool):
371+
notified = []
372+
scheduler, _, _, _, _ = _make_scheduler(fake_pool)
373+
scheduler.notify_callback = notified.append
374+
result = await scheduler.poll_once()
375+
assert notified == []
376+
assert result.probe_hits == []
377+
378+
async def test_restart_with_prior_poll_notifies_seed_hits(self, fake_pool):
379+
notified = []
380+
scheduler, _, prober, user_watchlist, state = _make_scheduler(fake_pool)
381+
scheduler.notify_callback = notified.append
382+
state.touch_poll()
383+
hit = _recent_hit()
384+
prober.run_cycle = AsyncMock(return_value=[hit])
385+
user_watchlist.matches_for_users.return_value = {
386+
"U123": PerUserMatches(papers=[], probe_hits=[(hit, "author")])
387+
}
388+
result = await scheduler.poll_once()
389+
assert len(notified) == 1
390+
assert len(result.probe_hits) == 1
391+
assert result.probe_hits[0].is_recent is True
392+
393+
async def test_restart_with_discovered_urls_notifies(self, fake_pool):
394+
notified = []
395+
scheduler, _, prober, user_watchlist, state = _make_scheduler(fake_pool)
396+
scheduler.notify_callback = notified.append
397+
state.mark_discovered("https://isocpp.org/files/papers/D1111R0.pdf")
398+
hit = _recent_hit()
399+
prober.run_cycle = AsyncMock(return_value=[hit])
400+
user_watchlist.matches_for_users.return_value = {
401+
"U123": PerUserMatches(papers=[], probe_hits=[(hit, "author")])
402+
}
403+
result = await scheduler.poll_once()
404+
assert len(notified) == 1
405+
assert len(result.probe_hits) == 1
406+
407+
async def test_restart_seed_old_hits_not_in_result(self, fake_pool, caplog):
408+
import logging
409+
410+
notified = []
411+
scheduler, _, prober, _, state = _make_scheduler(fake_pool)
412+
scheduler.notify_callback = notified.append
413+
state.touch_poll()
414+
old = _old_hit()
415+
prober.run_cycle = AsyncMock(return_value=[old])
416+
with caplog.at_level(logging.INFO):
417+
result = await scheduler.poll_once()
418+
assert result.probe_hits == []
419+
assert "PROBE-OLD" in caplog.text
420+
354421
async def test_poll_once_skips_refresh_when_disabled(self, fake_pool):
355422
scheduler, index, _, _, _ = _make_scheduler(fake_pool, enable_bulk_wg21=False)
356423
scheduler._seeded = True
@@ -368,8 +435,14 @@ async def test_poll_once_skips_probe_when_disabled(self, fake_pool):
368435
async def test_seed_marks_discovered(self, fake_pool):
369436
scheduler, _, prober, _, state = _make_scheduler(fake_pool)
370437
hit = _recent_hit()
371-
prober.run_cycle = AsyncMock(return_value=[hit])
372-
await scheduler.seed()
438+
439+
async def fake_run_cycle():
440+
state.mark_discovered(hit.url)
441+
return [hit]
442+
443+
prober.run_cycle = AsyncMock(side_effect=fake_run_cycle)
444+
seed_result = await scheduler.seed()
445+
assert seed_result.probe_hits == [hit]
373446
assert state.is_discovered(hit.url)
374447

375448
async def test_run_forever_calls_poll_and_breaks_on_cancel(self, fake_pool):

0 commit comments

Comments
 (0)