Skip to content

Commit 1fb083a

Browse files
committed
chore: replace husky with lefthook, fix crawls typecheck
- Replace husky with lefthook for git hooks
  - pre-commit: lint, format, typecheck (parallel)
  - pre-push: test, build (sequential)
- Move AWS SDK usage from CLI to scraper package (listFilteredCrawls)
- Fix TS2307 error in crawls.ts
1 parent c50489b commit 1fb083a

8 files changed

Lines changed: 109 additions & 157 deletions

File tree

.husky/pre-commit

Lines changed: 0 additions & 4 deletions
This file was deleted.

.husky/pre-push

Lines changed: 0 additions & 2 deletions
This file was deleted.

apps/cli/commands/crawls.ts

Lines changed: 6 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
1-
import { loadConfig, hasCloudflareCredentials } from "@docx-corpus/scraper";
2-
import {
3-
S3Client,
4-
ListObjectsV2Command,
5-
} from "@aws-sdk/client-s3";
1+
import { loadConfig, hasCloudflareCredentials, listFilteredCrawls } from "@docx-corpus/scraper";
62
import { header, section, keyValue, blank } from "@docx-corpus/shared";
73

84
const HELP = `
@@ -27,12 +23,6 @@ Examples
2723
corpus crawls
2824
`;
2925

30-
interface CrawlInfo {
31-
id: string;
32-
files: number;
33-
totalSize: number;
34-
}
35-
3626
export async function runCrawls(args: string[]) {
3727
if (args.includes("--help") || args.includes("-h")) {
3828
console.log(HELP);
@@ -49,61 +39,20 @@ export async function runCrawls(args: string[]) {
4939

5040
header("docx-corpus", "crawls");
5141

52-
const client = new S3Client({
53-
region: "auto",
54-
endpoint: `https://${config.cloudflare.accountId}.r2.cloudflarestorage.com`,
55-
credentials: {
56-
accessKeyId: config.cloudflare.r2AccessKeyId,
57-
secretAccessKey: config.cloudflare.r2SecretAccessKey,
58-
},
59-
});
60-
61-
// List all objects under cdx-filtered/
62-
const crawls = new Map<string, CrawlInfo>();
63-
let continuationToken: string | undefined;
64-
65-
do {
66-
const response = await client.send(
67-
new ListObjectsV2Command({
68-
Bucket: config.cloudflare.r2BucketName,
69-
Prefix: "cdx-filtered/",
70-
ContinuationToken: continuationToken,
71-
})
72-
);
42+
const crawls = await listFilteredCrawls(config);
7343

74-
for (const obj of response.Contents || []) {
75-
if (!obj.Key) continue;
76-
// Key format: cdx-filtered/CC-MAIN-2025-51/part-00001.jsonl
77-
const parts = obj.Key.split("/");
78-
if (parts.length < 3) continue;
79-
80-
const crawlId = parts[1];
81-
if (!crawls.has(crawlId)) {
82-
crawls.set(crawlId, { id: crawlId, files: 0, totalSize: 0 });
83-
}
84-
const info = crawls.get(crawlId)!;
85-
info.files++;
86-
info.totalSize += obj.Size || 0;
87-
}
88-
89-
continuationToken = response.NextContinuationToken;
90-
} while (continuationToken);
91-
92-
if (crawls.size === 0) {
44+
if (crawls.length === 0) {
9345
console.log("No filtered crawls found in R2.");
9446
console.log("Run the cdx-filter Lambda first: cd apps/cdx-filter && ./invoke-all.sh CC-MAIN-2025-51");
9547
process.exit(0);
9648
}
9749

98-
// Sort by crawl ID (newest first)
99-
const sorted = [...crawls.values()].sort((a, b) => b.id.localeCompare(a.id));
100-
101-
section(`Available crawls (${sorted.length})`);
102-
for (const crawl of sorted) {
50+
section(`Available crawls (${crawls.length})`);
51+
for (const crawl of crawls) {
10352
const sizeMb = (crawl.totalSize / (1024 * 1024)).toFixed(1);
10453
keyValue(crawl.id, `${crawl.files} files, ${sizeMb} MB`);
10554
}
10655

10756
blank();
108-
console.log(`Scrape the latest: corpus scrape --crawl ${sorted[0].id}`);
57+
console.log(`Scrape the latest: corpus scrape --crawl ${crawls[0].id}`);
10958
}

bun.lock

Lines changed: 27 additions & 92 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lefthook.yml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
pre-commit:
2+
parallel: true
3+
jobs:
4+
- name: lint
5+
run: bun run lint
6+
- name: format
7+
run: bun run format
8+
- name: typecheck
9+
run: bun run typecheck
10+
11+
pre-push:
12+
jobs:
13+
- name: test
14+
run: bun run test
15+
- name: build
16+
run: bun run build

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
"build": "bun run --cwd packages/scraper build && bun run --cwd apps/cli build",
1616
"release:cli": "bun run --cwd apps/cli release",
1717
"setup:extractor": "bun run --cwd packages/extractor setup",
18-
"prepare": "husky"
18+
"prepare": "lefthook install"
1919
},
2020
"devDependencies": {
2121
"@biomejs/biome": "^2.4.6",
22-
"husky": "^9.1.7"
22+
"lefthook": "^1.11.13"
2323
}
2424
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import { S3Client, ListObjectsV2Command } from "@aws-sdk/client-s3";
2+
import type { Config } from "../config";
3+
4+
export interface FilteredCrawl {
5+
id: string;
6+
files: number;
7+
totalSize: number;
8+
}
9+
10+
/**
11+
* List all CDX-filtered crawls available in R2.
12+
* Returns crawls sorted newest-first.
13+
*/
14+
export async function listFilteredCrawls(config: Config): Promise<FilteredCrawl[]> {
15+
const { cloudflare } = config;
16+
17+
const client = new S3Client({
18+
region: "auto",
19+
endpoint: `https://${cloudflare.accountId}.r2.cloudflarestorage.com`,
20+
credentials: {
21+
accessKeyId: cloudflare.r2AccessKeyId,
22+
secretAccessKey: cloudflare.r2SecretAccessKey,
23+
},
24+
});
25+
26+
const crawls = new Map<string, FilteredCrawl>();
27+
let continuationToken: string | undefined;
28+
29+
do {
30+
const response = await client.send(
31+
new ListObjectsV2Command({
32+
Bucket: cloudflare.r2BucketName,
33+
Prefix: "cdx-filtered/",
34+
ContinuationToken: continuationToken,
35+
})
36+
);
37+
38+
for (const obj of response.Contents || []) {
39+
if (!obj.Key) continue;
40+
// Key format: cdx-filtered/CC-MAIN-2025-51/part-00001.jsonl
41+
const parts = obj.Key.split("/");
42+
if (parts.length < 3) continue;
43+
44+
const crawlId = parts[1];
45+
if (!crawls.has(crawlId)) {
46+
crawls.set(crawlId, { id: crawlId, files: 0, totalSize: 0 });
47+
}
48+
const info = crawls.get(crawlId)!;
49+
info.files++;
50+
info.totalSize += obj.Size || 0;
51+
}
52+
53+
continuationToken = response.NextContinuationToken;
54+
} while (continuationToken);
55+
56+
return [...crawls.values()].sort((a, b) => b.id.localeCompare(a.id));
57+
}

packages/scraper/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ export {
2323
// Common Crawl utilities
2424
export { getLatestCrawlId, getCrawlIds } from "./commoncrawl/index";
2525
export { streamCdxFromR2, type CdxRecord } from "./commoncrawl/cdx-r2";
26+
export { listFilteredCrawls, type FilteredCrawl } from "./commoncrawl/crawls";
2627
export { fetchWarcRecord, parseWarcRecord, findPattern, type WarcResult, type FetchOptions } from "./commoncrawl/warc";
2728

2829
// Validation utilities

0 commit comments

Comments
 (0)