|
41 | 41 | MAX_RETRIES = 3 # retries per API call on 429 |
42 | 42 | DOWNLOAD_DELAY = 2.0 # seconds between PDF downloads |
43 | 43 | SCIHUB_DELAY = 5.0 # seconds between Sci-Hub requests (be polite) |
44 | | -SCIHUB_DOMAINS = ["sci-hub.se", "sci-hub.st", "sci-hub.ru"] |
| 44 | +SCIHUB_DOMAINS = ["sci-hub.ru", "sci-hub.do", "sci-hub.it.nf", "sci-hub.es.ht", "sci-hub.se", "sci-hub.st"] |
45 | 45 |
|
46 | 46 |
|
47 | 47 | def parse_bib(path: Path) -> list[dict]: |
@@ -310,7 +310,14 @@ def download_pdfs(manifest_entries: list[dict]): |
310 | 310 |
|
311 | 311 | print(f"[{downloaded+1}] {key}: {url[:70]}...") |
312 | 312 | try: |
313 | | - req = urllib.request.Request(url, headers={"User-Agent": "problemreductions/1.0"}) |
| 313 | + # Use browser-like headers to avoid 403 from publisher sites |
| 314 | + headers = { |
| 315 | + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " |
| 316 | + "AppleWebKit/537.36 (KHTML, like Gecko) " |
| 317 | + "Chrome/120.0.0.0 Safari/537.36", |
| 318 | + "Accept": "application/pdf,*/*", |
| 319 | + } |
| 320 | + req = urllib.request.Request(url, headers=headers) |
314 | 321 | with urllib.request.urlopen(req, timeout=60) as resp: |
315 | 322 | data = resp.read() |
316 | 323 |
|
@@ -354,32 +361,63 @@ def _try_scihub_download(doi: str, dest: Path) -> bool: |
354 | 361 | dest.write_bytes(content) |
355 | 362 | return True |
356 | 363 |
|
357 | | - # Parse page for embedded PDF iframe/link |
| 364 | + # Parse page for embedded PDF link |
358 | 365 | html = content.decode("utf-8", errors="ignore") |
359 | | - # Look for iframe src or direct PDF link |
360 | | - pdf_match = re.search( |
361 | | - r'(?:iframe|embed)[^>]+src\s*=\s*["\']([^"\']*\.pdf[^"\']*)["\']', |
362 | | - html, re.IGNORECASE |
| 366 | + pdf_path = None |
| 367 | + |
| 368 | + # Strategy A: citation_pdf_url meta tag (sci-hub.ru pattern) |
| 369 | + m = re.search( |
| 370 | + r'citation_pdf_url["\']?\s+content\s*=\s*["\']([^"\']+)', |
| 371 | + html, re.IGNORECASE, |
363 | 372 | ) |
364 | | - if not pdf_match: |
365 | | - pdf_match = re.search( |
| 373 | + if m: |
| 374 | + pdf_path = m.group(1) |
| 375 | + |
| 376 | + # Strategy B: /storage/ path in page |
| 377 | + if not pdf_path: |
| 378 | + m = re.search(r'(/storage/[^\s"\'<>,]+\.pdf)', html) |
| 379 | + if m: |
| 380 | + pdf_path = m.group(1) |
| 381 | + |
| 382 | + # Strategy C: iframe/embed src with .pdf |
| 383 | + if not pdf_path: |
| 384 | + m = re.search( |
| 385 | + r'(?:iframe|embed)[^>]+src\s*=\s*["\']([^"\']*\.pdf[^"\']*)["\']', |
| 386 | + html, re.IGNORECASE, |
| 387 | + ) |
| 388 | + if m: |
| 389 | + pdf_path = m.group(1) |
| 390 | + |
| 391 | + # Strategy D: any absolute PDF URL |
| 392 | + if not pdf_path: |
| 393 | + m = re.search( |
366 | 394 | r'(https?://[^\s"\'<>]+\.pdf(?:\?[^\s"\'<>]*)?)', |
367 | | - html, re.IGNORECASE |
| 395 | + html, re.IGNORECASE, |
368 | 396 | ) |
369 | | - if not pdf_match: |
370 | | - # Try //domain/path pattern (protocol-relative) |
371 | | - pdf_match = re.search( |
| 397 | + if m: |
| 398 | + pdf_path = m.group(1) |
| 399 | + |
| 400 | + # Strategy E: protocol-relative PDF URL |
| 401 | + if not pdf_path: |
| 402 | + m = re.search( |
372 | 403 | r'src\s*=\s*["\']?(//[^\s"\'<>]+\.pdf[^\s"\'<>]*)', |
373 | | - html, re.IGNORECASE |
| 404 | + html, re.IGNORECASE, |
374 | 405 | ) |
| 406 | + if m: |
| 407 | + pdf_path = m.group(1) |
| 408 | + |
| 409 | + pdf_match = pdf_path # unify variable name |
375 | 410 |
|
376 | 411 | if pdf_match: |
377 | | - pdf_url = pdf_match.group(1) |
| 412 | + pdf_url = pdf_match |
378 | 413 | if pdf_url.startswith("//"): |
379 | 414 | pdf_url = "https:" + pdf_url |
| 415 | + elif pdf_url.startswith("/"): |
| 416 | + pdf_url = f"https://{domain}{pdf_url}" |
380 | 417 |
|
381 | 418 | pdf_req = urllib.request.Request(pdf_url, headers={ |
382 | | - "User-Agent": "Mozilla/5.0", |
| 419 | + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " |
| 420 | + "AppleWebKit/537.36", |
383 | 421 | "Referer": url, |
384 | 422 | }) |
385 | 423 | with urllib.request.urlopen(pdf_req, timeout=60) as pdf_resp: |
@@ -448,31 +486,54 @@ def show_status(): |
448 | 486 | return |
449 | 487 |
|
450 | 488 | all_entries = list(manifest.values()) |
451 | | - found = [r for r in all_entries if r.get("pdf_url")] |
452 | | - arxiv = [r for r in all_entries if r.get("arxiv_id")] |
453 | | - oa_only = [r for r in found if not r.get("arxiv_id")] |
454 | | - missing = [r for r in all_entries if not r.get("pdf_url")] |
455 | 489 |
|
| 490 | + # What's actually on disk (the ground truth) |
456 | 491 | pdfs = list(OUTPUT_DIR.glob("*.pdf")) if OUTPUT_DIR.exists() else [] |
457 | | - pdf_keys = {p.stem for p in pdfs} |
458 | | - total_size = sum(p.stat().st_size for p in pdfs) |
459 | | - |
460 | | - print(f"=== MANIFEST ===") |
461 | | - print(f"Total entries: {len(all_entries)}") |
462 | | - print(f"Arxiv: {len(arxiv)}") |
463 | | - print(f"OA (non-arxiv): {len(oa_only)}") |
464 | | - print(f"Not found: {len(missing)}") |
465 | | - print() |
466 | | - print(f"=== DOWNLOADS ===") |
467 | | - print(f"PDFs on disk: {len(pdfs)}") |
468 | | - print(f"Total size: {total_size / 1024 / 1024:.1f} MB") |
469 | | - print(f"Pending download: {len(found) - len(pdf_keys & {e['key'] for e in found})}") |
470 | | - |
471 | | - if missing: |
472 | | - print(f"\n=== NOT FOUND ({len(missing)}) ===") |
473 | | - for r in sorted(missing, key=lambda r: r.get("year", "0")): |
474 | | - doi_str = f" doi:{r.get('doi','')}" if r.get("doi") else "" |
475 | | - print(f" {r['key']} ({r.get('year','?')}): {r.get('title','')[:55]}{doi_str}") |
| 492 | + pdf_keys = {p.stem for p in pdfs if p.stat().st_size > 1000} |
| 493 | + total_size = sum(p.stat().st_size for p in pdfs if p.stat().st_size > 1000) |
| 494 | + all_keys = {e["key"] for e in all_entries} |
| 495 | + |
| 496 | + # Truly missing = in manifest but no PDF on disk |
| 497 | + truly_missing = [] |
| 498 | + for e in all_entries: |
| 499 | + if e["key"] not in pdf_keys: |
| 500 | + truly_missing.append(e) |
| 501 | + |
| 502 | + # Categorize missing |
| 503 | + textbooks = {"garey1979", "sipser2012", "cormen2022", "conway1967"} |
| 504 | + missing_with_doi = [e for e in truly_missing if e.get("doi") and e["key"] not in textbooks] |
| 505 | + missing_no_doi = [e for e in truly_missing if not e.get("doi") and e["key"] not in textbooks] |
| 506 | + missing_textbooks = [e for e in truly_missing if e["key"] in textbooks] |
| 507 | + |
| 508 | + print(f"=== COLLECTION ===") |
| 509 | + print(f"Total in references.bib: {len(all_entries)}") |
| 510 | + print(f"PDFs on disk: {len(pdf_keys)} ({total_size / 1024 / 1024:.1f} MB)") |
| 511 | + print(f"Truly missing: {len(truly_missing)}") |
| 512 | + print(f" With DOI (retry): {len(missing_with_doi)}") |
| 513 | + print(f" No DOI (manual): {len(missing_no_doi)}") |
| 514 | + print(f" Textbooks: {len(missing_textbooks)}") |
| 515 | + |
| 516 | + # Remote status |
| 517 | + if PAPERS_REMOTE: |
| 518 | + print(f"\nRemote: {PAPERS_REMOTE}") |
| 519 | + else: |
| 520 | + print(f"\nRemote: not configured (set PAPERS_REMOTE)") |
| 521 | + |
| 522 | + if truly_missing: |
| 523 | + if missing_with_doi: |
| 524 | + print(f"\n=== MISSING WITH DOI — retry with 'make papers-scihub' ({len(missing_with_doi)}) ===") |
| 525 | + for r in sorted(missing_with_doi, key=lambda r: r.get("year", "0")): |
| 526 | + print(f" {r['key']} ({r.get('year','?')}): {r.get('title','')[:55]} doi:{r['doi']}") |
| 527 | + |
| 528 | + if missing_no_doi: |
| 529 | + print(f"\n=== MISSING WITHOUT DOI — manual web search needed ({len(missing_no_doi)}) ===") |
| 530 | + for r in sorted(missing_no_doi, key=lambda r: r.get("year", "0")): |
| 531 | + print(f" {r['key']} ({r.get('year','?')}): {r.get('title','')[:60]}") |
| 532 | + |
| 533 | + if missing_textbooks: |
| 534 | + print(f"\n=== TEXTBOOKS — not downloadable as PDF ({len(missing_textbooks)}) ===") |
| 535 | + for r in sorted(missing_textbooks, key=lambda r: r.get("year", "0")): |
| 536 | + print(f" {r['key']} ({r.get('year','?')}): {r.get('title','')[:60]}") |
476 | 537 |
|
477 | 538 |
|
478 | 539 | def _require_remote() -> str: |
|
0 commit comments