|
| 1 | +import { |
| 2 | + loadConfig, |
| 3 | + hasCloudflareCredentials, |
| 4 | + listFilteredCrawls, |
| 5 | + getAllCrawlIds, |
| 6 | + invokeCdxFilter, |
| 7 | +} from "@docx-corpus/scraper"; |
| 8 | +import { header, section, keyValue, blank } from "@docx-corpus/shared"; |
| 9 | + |
// Usage/help text for the `cdx-filter` command; runCdxFilter prints it and exits when --help or -h is passed.
| 10 | +const HELP = ` |
| 11 | +corpus cdx-filter - Filter Common Crawl indexes for .docx URLs via Lambda |
| 12 | +
|
| 13 | +Usage |
| 14 | + corpus cdx-filter [options] |
| 15 | +
|
| 16 | +With no flags, shows available vs already-filtered crawls. |
| 17 | +Pass --crawl or --all to invoke the Lambda and process new crawls. |
| 18 | +
|
| 19 | +Options |
| 20 | + --crawl <id> Filter a specific crawl (e.g. CC-MAIN-2026-08) |
| 21 | + --all Filter all missing crawls |
| 22 | + --latest <n> Filter the latest N missing crawls (default: 1) |
| 23 | + --region <r> AWS region for Lambda (default: us-east-1) |
| 24 | + --concurrency <n> Parallel Lambda invocations per crawl (default: 10) |
| 25 | + --help, -h Show this help |
| 26 | +
|
| 27 | +Environment Variables |
| 28 | + AWS_REGION / AWS_PROFILE AWS credentials for Lambda invocation |
| 29 | + CLOUDFLARE_ACCOUNT_ID Cloudflare account ID (for R2 lookup) |
| 30 | + R2_ACCESS_KEY_ID R2 access key |
| 31 | + R2_SECRET_ACCESS_KEY R2 secret key |
| 32 | +
|
| 33 | +Examples |
| 34 | + corpus cdx-filter # Show missing crawls |
| 35 | + corpus cdx-filter --crawl CC-MAIN-2026-08 # Filter one crawl |
| 36 | + corpus cdx-filter --latest 3 # Filter 3 newest missing |
| 37 | + corpus cdx-filter --all # Filter everything missing |
| 38 | +`; |
| 39 | + |
/**
 * Look up the value supplied for a command-line flag.
 *
 * Both `--flag value` and `--flag=value` are accepted. The first occurrence
 * of the flag (in either form) decides the result.
 *
 * @param args - Raw argument list (without the command name).
 * @param flag - Flag to look for, including its leading dashes (e.g. `--crawl`).
 * @returns The flag's value, or `undefined` when the flag is absent, is the
 *   last argument, has an empty inline value, or is immediately followed by
 *   another long option (so `--crawl --all` does not treat `--all` as an ID).
 */
function parseFlag(args: string[], flag: string): string | undefined {
  const inlinePrefix = `${flag}=`;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === undefined) continue;

    if (arg.startsWith(inlinePrefix)) {
      const inlineValue = arg.slice(inlinePrefix.length);
      return inlineValue === "" ? undefined : inlineValue;
    }

    if (arg === flag) {
      const next = args[i + 1];
      // Only "--" prefixes are rejected so single-dash values such as "-1"
      // are still handed to the caller for validation.
      if (next === undefined || next.startsWith("--")) return undefined;
      return next;
    }
  }

  return undefined;
}
| 44 | + |
/**
 * Parse a flag value as a positive integer, or report the problem and exit.
 * Only plain decimal digits are accepted, so inputs like "3abc" or "-1" fail.
 */
function parsePositiveIntOrExit(raw: string, flag: string): number {
  const trimmed = raw.trim();
  const value = /^\d+$/.test(trimmed) ? Number(trimmed) : NaN;
  if (!Number.isSafeInteger(value) || value < 1) {
    console.error(`Invalid value for ${flag}: ${raw} (expected a positive integer)`);
    process.exit(1);
  }
  return value;
}

/**
 * Entry point for `corpus cdx-filter`.
 *
 * Behaviour by flag:
 * - `--help` / `-h`: print the help text and exit with status 0.
 * - `--crawl <id>`: verify the ID is in the list returned by `getAllCrawlIds`
 *   and invoke the filter for that single crawl.
 * - no action flag: print how many crawls are filtered vs. missing (requires
 *   Cloudflare R2 credentials) and suggest follow-up commands.
 * - `--all` / `--latest [n]`: invoke the filter for every missing crawl, or
 *   for the first `n` entries of the missing list (`n` defaults to 1).
 *
 * Exits with status 1 on an unknown crawl ID, missing R2 credentials, or a
 * non-positive / non-numeric `--latest` or `--concurrency` value.
 *
 * @param args - Command-line arguments following the `cdx-filter` command.
 */
export async function runCdxFilter(args: string[]): Promise<void> {
  if (args.includes("--help") || args.includes("-h")) {
    console.log(HELP);
    process.exit(0);
  }

  const config = loadConfig();
  const crawlId = parseFlag(args, "--crawl");
  const filterAll = args.includes("--all");
  const latestRaw = parseFlag(args, "--latest");
  // A bare `--latest` (no value) still requests the documented default of 1.
  const latestRequested = latestRaw !== undefined || args.includes("--latest");
  const region = parseFlag(args, "--region") || "us-east-1";
  const concurrency = parsePositiveIntOrExit(
    parseFlag(args, "--concurrency") || "10",
    "--concurrency",
  );
  // Validate up front (before any network call), but only when the value will
  // actually be used: --crawl and --all both take precedence over --latest.
  const latestCount =
    latestRequested && !crawlId && !filterAll
      ? parsePositiveIntOrExit(latestRaw || "1", "--latest")
      : 1;
  // Point the monitoring hint at the region the Lambda was actually invoked in.
  const tailCommand = `aws logs tail /aws/lambda/cdx-filter --follow --region ${region}`;

  header("docx-corpus", "cdx-filter");

  // Fetch available crawls from Common Crawl API
  console.log("Fetching crawl index from Common Crawl...");
  const allCrawlIds = await getAllCrawlIds();
  console.log(`Found ${allCrawlIds.length} crawls available\n`);

  // If filtering a specific crawl, just do it
  if (crawlId) {
    if (!allCrawlIds.includes(crawlId)) {
      console.error(`Unknown crawl ID: ${crawlId}`);
      console.error("Use 'corpus cdx-filter' to see available crawls");
      process.exit(1);
    }

    section(`Filtering ${crawlId}`);
    const result = await invokeCdxFilter(crawlId, { region, concurrency });
    console.log(`Queued ${result.invoked} Lambda invocations for ${crawlId}`);
    blank();
    console.log(`Monitor progress: ${tailCommand}`);
    return;
  }

  // Compare against what's already filtered in R2
  if (!hasCloudflareCredentials(config)) {
    console.error("Cloudflare R2 credentials required to check filtered crawls");
    console.error("Set CLOUDFLARE_ACCOUNT_ID, R2_ACCESS_KEY_ID, R2_SECRET_ACCESS_KEY");
    process.exit(1);
  }

  const filtered = await listFilteredCrawls(config);
  const filteredIds = new Set(filtered.map((c) => c.id));
  const missing = allCrawlIds.filter((id) => !filteredIds.has(id));

  section(`Crawl status (${allCrawlIds.length} total)`);
  keyValue("Filtered", `${filtered.length} crawls`);
  keyValue("Missing", `${missing.length} crawls`);
  blank();

  if (missing.length > 0) {
    section("Missing crawls (newest first)");
    // Cap the listing at 20 entries so long backlogs stay readable.
    for (const id of missing.slice(0, 20)) {
      console.log(` ${id}`);
    }
    if (missing.length > 20) {
      console.log(` ... and ${missing.length - 20} more`);
    }
    blank();
  }

  // Status-only mode: neither --all nor --latest was requested.
  if (!filterAll && !latestRequested) {
    if (missing.length > 0) {
      console.log("To filter missing crawls:");
      console.log(` corpus cdx-filter --latest 1 # newest missing`);
      console.log(` corpus cdx-filter --all # all ${missing.length} missing`);
      console.log(` corpus cdx-filter --crawl ${missing[0]} # specific crawl`);
    }
    return;
  }

  const toProcess = filterAll ? missing : missing.slice(0, latestCount);

  section(`Filtering ${toProcess.length} crawl${toProcess.length === 1 ? "" : "s"}`);
  // Crawls are submitted one after another; parallelism within a crawl is
  // controlled by the `concurrency` option passed to invokeCdxFilter.
  for (const id of toProcess) {
    console.log(`\nProcessing ${id}...`);
    const result = await invokeCdxFilter(id, { region, concurrency });
    console.log(` Queued ${result.invoked} Lambda invocations`);
  }

  blank();
  console.log(`Done! ${toProcess.length} crawl(s) queued for filtering.`);
  console.log(`Monitor: ${tailCommand}`);
}
0 commit comments