Skip to content

Commit 1fb083a

Browse files
committed
chore: replace husky with lefthook, fix crawls typecheck
- Replace husky with lefthook for git hooks
  - pre-commit: lint, format, typecheck (parallel)
  - pre-push: test, build (sequential)
- Move AWS SDK usage from CLI to scraper package (listFilteredCrawls)
- Fix TS2307 error in crawls.ts
1 parent c50489b commit 1fb083a

8 files changed

Lines changed: 109 additions & 157 deletions

File tree

.husky/pre-commit

Lines changed: 0 additions & 4 deletions
This file was deleted.

.husky/pre-push

Lines changed: 0 additions & 2 deletions
This file was deleted.

apps/cli/commands/crawls.ts

Lines changed: 6 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
1-
import { loadConfig, hasCloudflareCredentials } from "@docx-corpus/scraper";
2-
import {
3-
S3Client,
4-
ListObjectsV2Command,
5-
} from "@aws-sdk/client-s3";
1+
import { loadConfig, hasCloudflareCredentials, listFilteredCrawls } from "@docx-corpus/scraper";
62
import { header, section, keyValue, blank } from "@docx-corpus/shared";
73

84
const HELP = `
@@ -27,12 +23,6 @@ Examples
2723
corpus crawls
2824
`;
2925

30-
interface CrawlInfo {
31-
id: string;
32-
files: number;
33-
totalSize: number;
34-
}
35-
3626
export async function runCrawls(args: string[]) {
3727
if (args.includes("--help") || args.includes("-h")) {
3828
console.log(HELP);
@@ -49,61 +39,20 @@ export async function runCrawls(args: string[]) {
4939

5040
header("docx-corpus", "crawls");
5141

52-
const client = new S3Client({
53-
region: "auto",
54-
endpoint: `https://${config.cloudflare.accountId}.r2.cloudflarestorage.com`,
55-
credentials: {
56-
accessKeyId: config.cloudflare.r2AccessKeyId,
57-
secretAccessKey: config.cloudflare.r2SecretAccessKey,
58-
},
59-
});
60-
61-
// List all objects under cdx-filtered/
62-
const crawls = new Map<string, CrawlInfo>();
63-
let continuationToken: string | undefined;
64-
65-
do {
66-
const response = await client.send(
67-
new ListObjectsV2Command({
68-
Bucket: config.cloudflare.r2BucketName,
69-
Prefix: "cdx-filtered/",
70-
ContinuationToken: continuationToken,
71-
})
72-
);
42+
const crawls = await listFilteredCrawls(config);
7343

74-
for (const obj of response.Contents || []) {
75-
if (!obj.Key) continue;
76-
// Key format: cdx-filtered/CC-MAIN-2025-51/part-00001.jsonl
77-
const parts = obj.Key.split("/");
78-
if (parts.length < 3) continue;
79-
80-
const crawlId = parts[1];
81-
if (!crawls.has(crawlId)) {
82-
crawls.set(crawlId, { id: crawlId, files: 0, totalSize: 0 });
83-
}
84-
const info = crawls.get(crawlId)!;
85-
info.files++;
86-
info.totalSize += obj.Size || 0;
87-
}
88-
89-
continuationToken = response.NextContinuationToken;
90-
} while (continuationToken);
91-
92-
if (crawls.size === 0) {
44+
if (crawls.length === 0) {
9345
console.log("No filtered crawls found in R2.");
9446
console.log("Run the cdx-filter Lambda first: cd apps/cdx-filter && ./invoke-all.sh CC-MAIN-2025-51");
9547
process.exit(0);
9648
}
9749

98-
// Sort by crawl ID (newest first)
99-
const sorted = [...crawls.values()].sort((a, b) => b.id.localeCompare(a.id));
100-
101-
section(`Available crawls (${sorted.length})`);
102-
for (const crawl of sorted) {
50+
section(`Available crawls (${crawls.length})`);
51+
for (const crawl of crawls) {
10352
const sizeMb = (crawl.totalSize / (1024 * 1024)).toFixed(1);
10453
keyValue(crawl.id, `${crawl.files} files, ${sizeMb} MB`);
10554
}
10655

10756
blank();
108-
console.log(`Scrape the latest: corpus scrape --crawl ${sorted[0].id}`);
57+
console.log(`Scrape the latest: corpus scrape --crawl ${crawls[0].id}`);
10958
}

bun.lock

Lines changed: 27 additions & 92 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lefthook.yml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
pre-commit:
2+
parallel: true
3+
jobs:
4+
- name: lint
5+
run: bun run lint
6+
- name: format
7+
run: bun run format
8+
- name: typecheck
9+
run: bun run typecheck
10+
11+
pre-push:
12+
jobs:
13+
- name: test
14+
run: bun run test
15+
- name: build
16+
run: bun run build

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
"build": "bun run --cwd packages/scraper build && bun run --cwd apps/cli build",
1616
"release:cli": "bun run --cwd apps/cli release",
1717
"setup:extractor": "bun run --cwd packages/extractor setup",
18-
"prepare": "husky"
18+
"prepare": "lefthook install"
1919
},
2020
"devDependencies": {
2121
"@biomejs/biome": "^2.4.6",
22-
"husky": "^9.1.7"
22+
"lefthook": "^1.11.13"
2323
}
2424
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import { S3Client, ListObjectsV2Command } from "@aws-sdk/client-s3";
2+
import type { Config } from "../config";
3+
4+
export interface FilteredCrawl {
5+
id: string;
6+
files: number;
7+
totalSize: number;
8+
}
9+
10+
/**
11+
* List all CDX-filtered crawls available in R2.
12+
* Returns crawls sorted newest-first.
13+
*/
14+
export async function listFilteredCrawls(config: Config): Promise<FilteredCrawl[]> {
15+
const { cloudflare } = config;
16+
17+
const client = new S3Client({
18+
region: "auto",
19+
endpoint: `https://${cloudflare.accountId}.r2.cloudflarestorage.com`,
20+
credentials: {
21+
accessKeyId: cloudflare.r2AccessKeyId,
22+
secretAccessKey: cloudflare.r2SecretAccessKey,
23+
},
24+
});
25+
26+
const crawls = new Map<string, FilteredCrawl>();
27+
let continuationToken: string | undefined;
28+
29+
do {
30+
const response = await client.send(
31+
new ListObjectsV2Command({
32+
Bucket: cloudflare.r2BucketName,
33+
Prefix: "cdx-filtered/",
34+
ContinuationToken: continuationToken,
35+
})
36+
);
37+
38+
for (const obj of response.Contents || []) {
39+
if (!obj.Key) continue;
40+
// Key format: cdx-filtered/CC-MAIN-2025-51/part-00001.jsonl
41+
const parts = obj.Key.split("/");
42+
if (parts.length < 3) continue;
43+
44+
const crawlId = parts[1];
45+
if (!crawls.has(crawlId)) {
46+
crawls.set(crawlId, { id: crawlId, files: 0, totalSize: 0 });
47+
}
48+
const info = crawls.get(crawlId)!;
49+
info.files++;
50+
info.totalSize += obj.Size || 0;
51+
}
52+
53+
continuationToken = response.NextContinuationToken;
54+
} while (continuationToken);
55+
56+
return [...crawls.values()].sort((a, b) => b.id.localeCompare(a.id));
57+
}

packages/scraper/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ export {
2323
// Common Crawl utilities
2424
export { getLatestCrawlId, getCrawlIds } from "./commoncrawl/index";
2525
export { streamCdxFromR2, type CdxRecord } from "./commoncrawl/cdx-r2";
26+
export { listFilteredCrawls, type FilteredCrawl } from "./commoncrawl/crawls";
2627
export { fetchWarcRecord, parseWarcRecord, findPattern, type WarcResult, type FetchOptions } from "./commoncrawl/warc";
2728

2829
// Validation utilities

0 commit comments

Comments
 (0)