Skip to content

Commit 3ea378f

Browse files
committed
feat: add corpus cdx-filter command for Lambda-based CDX filtering
Auto-discovers missing crawls by comparing Common Crawl's index API against what's already filtered in R2. Supports --crawl, --latest, --all.
1 parent e4fa91e commit 3ea378f

6 files changed

Lines changed: 418 additions & 2 deletions

File tree

apps/cli/commands/cdx-filter.ts

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import {
2+
loadConfig,
3+
hasCloudflareCredentials,
4+
listFilteredCrawls,
5+
getAllCrawlIds,
6+
invokeCdxFilter,
7+
} from "@docx-corpus/scraper";
8+
import { header, section, keyValue, blank } from "@docx-corpus/shared";
9+
10+
/**
 * Usage text printed by `corpus cdx-filter --help`.
 *
 * Kept as a single template literal so the column alignment seen in the
 * terminal matches what is written here.
 */
const HELP = `
corpus cdx-filter - Filter Common Crawl indexes for .docx URLs via Lambda

Usage
  corpus cdx-filter [options]

  With no flags, shows available vs already-filtered crawls.
  Pass --crawl, --latest, or --all to invoke the Lambda and process new crawls.

Options
  --crawl <id>        Filter a specific crawl (e.g. CC-MAIN-2026-08)
  --all               Filter all missing crawls
  --latest <n>        Filter the latest N missing crawls (default: 1)
  --region <r>        AWS region for Lambda (default: us-east-1)
  --concurrency <n>   Parallel Lambda invocations per crawl (default: 10)
  --help, -h          Show this help

Environment Variables
  AWS_REGION / AWS_PROFILE   AWS credentials for Lambda invocation
  CLOUDFLARE_ACCOUNT_ID      Cloudflare account ID (for R2 lookup)
  R2_ACCESS_KEY_ID           R2 access key
  R2_SECRET_ACCESS_KEY       R2 secret key

Examples
  corpus cdx-filter                            # Show missing crawls
  corpus cdx-filter --crawl CC-MAIN-2026-08    # Filter one crawl
  corpus cdx-filter --latest 3                 # Filter 3 newest missing
  corpus cdx-filter --all                      # Filter everything missing
`;
39+
40+
/**
 * Looks up the value that follows a command-line flag.
 *
 * @param args - Raw argument list for the command.
 * @param flag - Flag to search for, e.g. `"--crawl"`.
 * @returns The token immediately after the first occurrence of `flag`, or
 *   `undefined` when the flag is absent, is the last argument, or is followed
 *   by another `--` flag.
 */
function parseFlag(args: string[], flag: string): string | undefined {
  const idx = args.indexOf(flag);
  if (idx === -1) {
    return undefined;
  }

  const value = args[idx + 1];
  // A following "--something" token is the next flag, not this flag's value;
  // without this check `--crawl --all` would yield the crawl ID "--all".
  if (value === undefined || value.startsWith("--")) {
    return undefined;
  }
  return value;
}
44+
45+
/**
 * Entry point for `corpus cdx-filter`.
 *
 * With no action flag it prints which Common Crawl crawls are available and
 * which are not yet present in the filtered set, then exits. With `--crawl`,
 * `--latest`, or `--all` it calls `invokeCdxFilter` for the selected crawl(s)
 * and reports how many Lambda invocations were queued.
 *
 * Invalid flag values and missing Cloudflare credentials terminate the
 * process with exit code 1; `--help` / `-h` prints usage and exits with 0.
 *
 * @param args - Arguments following the `cdx-filter` command name.
 */
export async function runCdxFilter(args: string[]): Promise<void> {
  if (args.includes("--help") || args.includes("-h")) {
    console.log(HELP);
    process.exit(0);
  }

  const config = loadConfig();
  const crawlId = parseFlag(args, "--crawl");
  const filterAll = args.includes("--all");
  const region = parseFlag(args, "--region") || "us-east-1";

  // Validate flag values before any network call so typos fail fast instead
  // of silently becoming NaN further down.
  if (args.includes("--crawl") && !crawlId) {
    console.error("--crawl requires a crawl ID (e.g. CC-MAIN-2026-08)");
    process.exit(1);
  }

  const concurrency = parsePositiveInt(parseFlag(args, "--concurrency"), 10);
  if (concurrency === undefined) {
    console.error("--concurrency must be a positive integer");
    process.exit(1);
  }

  // A bare `--latest` means "the newest one", matching the documented default.
  const latestCount = args.includes("--latest")
    ? parsePositiveInt(parseFlag(args, "--latest"), 1)
    : 0;
  if (latestCount === undefined) {
    console.error("--latest must be a positive integer");
    process.exit(1);
  }

  // Point the monitoring hint at the region that was actually used.
  const monitorCommand = `aws logs tail /aws/lambda/cdx-filter --follow --region ${region}`;

  header("docx-corpus", "cdx-filter");

  // Fetch available crawls from Common Crawl API
  console.log("Fetching crawl index from Common Crawl...");
  const allCrawlIds = await getAllCrawlIds();
  console.log(`Found ${allCrawlIds.length} crawls available\n`);

  // If filtering a specific crawl, just do it
  if (crawlId) {
    if (!allCrawlIds.includes(crawlId)) {
      console.error(`Unknown crawl ID: ${crawlId}`);
      console.error("Use 'corpus cdx-filter' to see available crawls");
      process.exit(1);
    }

    section(`Filtering ${crawlId}`);
    const result = await invokeCdxFilter(crawlId, { region, concurrency });
    console.log(`Queued ${result.invoked} Lambda invocations for ${crawlId}`);
    blank();
    console.log(`Monitor progress: ${monitorCommand}`);
    return;
  }

  // Compare against what's already filtered in R2
  if (!hasCloudflareCredentials(config)) {
    console.error("Cloudflare R2 credentials required to check filtered crawls");
    console.error("Set CLOUDFLARE_ACCOUNT_ID, R2_ACCESS_KEY_ID, R2_SECRET_ACCESS_KEY");
    process.exit(1);
  }

  const filtered = await listFilteredCrawls(config);
  const filteredIds = new Set(filtered.map((c) => c.id));
  const missing = allCrawlIds.filter((id) => !filteredIds.has(id));

  section(`Crawl status (${allCrawlIds.length} total)`);
  keyValue("Filtered", `${filtered.length} crawls`);
  keyValue("Missing", `${missing.length} crawls`);
  blank();

  if (missing.length > 0) {
    section("Missing crawls (newest first)");
    // Cap the listing so a fresh setup does not print every crawl.
    for (const id of missing.slice(0, 20)) {
      console.log(`  ${id}`);
    }
    if (missing.length > 20) {
      console.log(`  ... and ${missing.length - 20} more`);
    }
    blank();
  }

  // Without --all or --latest this is a status-only run: show how to proceed.
  if (!filterAll && latestCount === 0) {
    if (missing.length > 0) {
      console.log("To filter missing crawls:");
      console.log(`  corpus cdx-filter --latest 1          # newest missing`);
      console.log(`  corpus cdx-filter --all               # all ${missing.length} missing`);
      console.log(`  corpus cdx-filter --crawl ${missing[0]}  # specific crawl`);
    }
    return;
  }

  const toProcess = filterAll ? missing : missing.slice(0, latestCount);
  if (toProcess.length === 0) {
    console.log("Nothing to filter: all available crawls are already filtered.");
    return;
  }

  section(`Filtering ${toProcess.length} crawl${toProcess.length === 1 ? "" : "s"}`);
  // Crawls are queued one after another; parallelism within a crawl is
  // delegated to invokeCdxFilter through the `concurrency` option.
  for (const id of toProcess) {
    console.log(`\nProcessing ${id}...`);
    const result = await invokeCdxFilter(id, { region, concurrency });
    console.log(`  Queued ${result.invoked} Lambda invocations`);
  }

  blank();
  console.log(`Done! ${toProcess.length} crawl(s) queued for filtering.`);
  console.log(`Monitor: ${monitorCommand}`);
}

/**
 * Parses an optional flag value as a strictly positive integer.
 *
 * @param raw - Text supplied for the flag, if any.
 * @param fallback - Value used when no usable text was supplied.
 * @returns `fallback` when `raw` is missing or is itself a `--` flag, the
 *   parsed number when `raw` is a positive decimal integer, and `undefined`
 *   when `raw` is present but invalid.
 */
function parsePositiveInt(raw: string | undefined, fallback: number): number | undefined {
  if (raw === undefined || raw.startsWith("--")) {
    return fallback;
  }
  // Digits only: parseInt alone would accept inputs such as "3abc".
  if (!/^\d+$/.test(raw)) {
    return undefined;
  }
  const parsed = Number.parseInt(raw, 10);
  return parsed > 0 ? parsed : undefined;
}

apps/cli/index.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { runExtract } from "./commands/extract";
55
import { runEmbed } from "./commands/embed";
66
import { runClassify } from "./commands/classify";
77
import { runCrawls } from "./commands/crawls";
8+
import { runCdxFilter } from "./commands/cdx-filter";
89
import { runExport } from "./commands/export";
910
import { runStatus } from "./commands/status";
1011

@@ -17,6 +18,7 @@ Usage
1718
corpus <command> [options]
1819
1920
Commands
21+
cdx-filter Filter Common Crawl indexes for .docx URLs (Lambda)
2022
scrape Download .docx files from Common Crawl
2123
extract Extract text from DOCX files using Docling
2224
embed Generate embeddings for extracted documents
@@ -55,6 +57,9 @@ async function main() {
5557
const commandArgs = args.slice(1);
5658

5759
switch (command) {
60+
case "cdx-filter":
61+
await runCdxFilter(commandArgs);
62+
break;
5863
case "scrape":
5964
await runScrape(commandArgs);
6065
break;

0 commit comments

Comments
 (0)