Skip to content

Commit d1929f1

Browse files
committed
feat: embedder package
1 parent cc73d7e commit d1929f1

15 files changed

Lines changed: 3326 additions & 2 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Dependencies
22
node_modules/
3+
.venv/
34

45
# Build output
56
dist/

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,22 @@ bun run corpus scrape --crawl CC-MAIN-2025-51 --force
118118
bun run corpus status
119119
```
120120

121+
### 3. Extract text from documents
122+
123+
```bash
124+
# Extract all documents
125+
bun run corpus extract
126+
127+
# Extract with batch limit
128+
bun run corpus extract --batch 100
129+
130+
# Extract with custom workers
131+
bun run corpus extract --batch 50 --workers 8
132+
133+
# Verbose output
134+
bun run corpus extract --verbose
135+
```
136+
121137
### Docker
122138

123139
Run the CLI in a container:

apps/cli/commands/embed.ts

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
import {
2+
processEmbeddings,
3+
loadEmbedderConfig,
4+
hasCloudflareCredentials,
5+
hasVoyageCredentials,
6+
type EmbedConfig,
7+
type EmbeddingModel,
8+
} from "@docx-corpus/embedder";
9+
import { createLocalStorage, createR2Storage } from "@docx-corpus/shared";
10+
11+
/** Options recognised by `corpus embed`, as parsed from the command line. */
interface ParsedFlags {
  /** Embedding model selected with `--model` / `-m`; unset when the flag is absent. */
  model?: EmbeddingModel;
  /** Document limit from `--batch` / `-b`; unset when the flag is absent. */
  batchSize?: number;
  /** Worker count from `--workers` / `-w`; unset when the flag is absent. */
  workers?: number;
  /** True when `--verbose` / `-v` was passed. */
  verbose: boolean;
}

/**
 * Parse the arguments of `corpus embed` into a {@link ParsedFlags} object.
 *
 * Unrecognised arguments are ignored. Flags that take a value consume the
 * following argument.
 *
 * @param args Raw command-line arguments following the `embed` command.
 * @returns The parsed flags; optional fields are left unset when not supplied.
 * @throws Error when a value-taking flag has no value, or when `--batch` /
 *   `--workers` is not a positive integer. Previously such input produced
 *   `NaN` (which is not nullish and so bypassed the caller's `??` defaults) or
 *   silently swallowed the next flag as the value.
 */
function parseFlags(args: string[]): ParsedFlags {
  const flags: ParsedFlags = { verbose: false };

  // A string-valued flag must be followed by something that is not itself a flag.
  const requireValue = (flag: string, value: string | undefined): string => {
    if (value === undefined || value.startsWith("-")) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  // Numeric flags keep parseInt's lenient prefix parsing but reject NaN, zero and negatives.
  const requirePositiveInt = (flag: string, value: string | undefined): number => {
    if (value === undefined) {
      throw new Error(`Missing value for ${flag}`);
    }
    const parsed = Number.parseInt(value, 10);
    if (!Number.isInteger(parsed) || parsed < 1) {
      throw new Error(`Invalid value for ${flag}: "${value}" (expected a positive integer)`);
    }
    return parsed;
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    switch (arg) {
      case "--model":
      case "-m":
        // The model name itself is not validated here; only its presence is.
        flags.model = requireValue(arg, args[++i]) as EmbeddingModel;
        break;
      case "--batch":
      case "-b":
        flags.batchSize = requirePositiveInt(arg, args[++i]);
        break;
      case "--workers":
      case "-w":
        flags.workers = requirePositiveInt(arg, args[++i]);
        break;
      case "--verbose":
      case "-v":
        flags.verbose = true;
        break;
    }
  }

  return flags;
}
52+
53+
/**
 * Usage text for `corpus embed`; runEmbed prints it and exits when
 * `--help` or `-h` is among the arguments.
 */
const HELP = `
54+
corpus embed - Generate embeddings for extracted documents
55+
56+
Usage
57+
corpus embed [options]
58+
59+
Storage is auto-selected based on environment:
60+
- With R2 credentials: reads from r2://extracted/, writes to r2://embeddings/
61+
- Without R2 credentials: reads from ./corpus/extracted/, writes to ./corpus/embeddings/
62+
63+
Already-embedded files are automatically skipped (tracked in index.jsonl).
64+
65+
Options
66+
--model, -m <name> Embedding model (default: minilm)
67+
minilm - all-MiniLM-L6-v2 (fast, 384 dims)
68+
bge-m3 - BAAI/bge-m3 (better quality, 1024 dims)
69+
voyage-lite - Voyage 3.5 lite (best, requires API key)
70+
--batch, -b <n> Limit to n documents (default: all)
71+
--workers, -w <n> Number of parallel workers (default: 4)
72+
--verbose, -v Show detailed progress
73+
--help, -h Show this help
74+
75+
Environment Variables
76+
STORAGE_PATH Local storage path (default: ./corpus)
77+
CLOUDFLARE_ACCOUNT_ID Cloudflare account ID (enables R2)
78+
R2_ACCESS_KEY_ID R2 access key
79+
R2_SECRET_ACCESS_KEY R2 secret key
80+
R2_BUCKET_NAME R2 bucket (default: docx-corpus)
81+
EMBED_INPUT_PREFIX Input prefix (default: extracted)
82+
EMBED_OUTPUT_PREFIX Output prefix (default: embeddings)
83+
EMBED_MODEL Default model (default: minilm)
84+
EMBED_WORKERS Worker count (default: 4)
85+
VOYAGE_API_KEY Voyage AI API key (required for voyage-lite)
86+
87+
Examples
88+
corpus embed # Embed all documents with minilm
89+
corpus embed -m bge-m3 # Use BGE-M3 model
90+
corpus embed -m voyage-lite # Use Voyage API (requires VOYAGE_API_KEY)
91+
corpus embed -b 100 -v # Limit to 100, verbose output
92+
`;
93+
94+
/**
 * Entry point for the `corpus embed` command.
 *
 * Prints the help text and exits when `--help` / `-h` is present. Otherwise it
 * merges command-line flags with the environment configuration, picks R2 or
 * local storage depending on whether Cloudflare credentials are configured,
 * prints a summary of the run and hands off to `processEmbeddings`.
 *
 * The process exits with status 1 on any usage error (malformed flags, missing
 * Voyage API key for `voyage-lite`, non-positive or non-numeric batch/worker
 * counts) and when `processEmbeddings` rejects.
 *
 * @param args Raw command-line arguments following the `embed` command.
 */
export async function runEmbed(args: string[]): Promise<void> {
  if (args.includes("--help") || args.includes("-h")) {
    console.log(HELP);
    process.exit(0);
  }

  // Report any flag-parsing failure as a usage error rather than an unhandled rejection.
  let flags: ParsedFlags;
  try {
    flags = parseFlags(args);
  } catch (err) {
    console.error(`Error: ${err instanceof Error ? err.message : String(err)}`);
    process.exit(1);
  }

  const envConfig = loadEmbedderConfig();
  const useCloud = hasCloudflareCredentials(envConfig);
  const model = flags.model ?? envConfig.embed.model;

  // Validate Voyage API key if needed
  if (model === "voyage-lite" && !hasVoyageCredentials(envConfig)) {
    console.error("Error: VOYAGE_API_KEY environment variable required for voyage-lite model");
    process.exit(1);
  }

  // `??` only replaces null/undefined, so a NaN from a malformed flag or
  // environment variable would otherwise flow straight into the processor.
  const batchSize = flags.batchSize ?? Infinity;
  const workers = flags.workers ?? envConfig.embed.workers;

  if (batchSize !== Infinity && !(Number.isInteger(batchSize) && batchSize > 0)) {
    console.error("Error: --batch must be a positive integer");
    process.exit(1);
  }
  if (!(Number.isInteger(workers) && workers > 0)) {
    console.error("Error: worker count (--workers or EMBED_WORKERS) must be a positive integer");
    process.exit(1);
  }

  // Create storage based on credentials
  const storage = useCloud
    ? createR2Storage({
        accountId: envConfig.cloudflare.accountId,
        accessKeyId: envConfig.cloudflare.r2AccessKeyId,
        secretAccessKey: envConfig.cloudflare.r2SecretAccessKey,
        bucket: envConfig.cloudflare.r2BucketName,
      })
    : createLocalStorage(envConfig.storage.localPath);

  const config: EmbedConfig = {
    storage,
    inputPrefix: envConfig.embed.inputPrefix,
    outputPrefix: envConfig.embed.outputPrefix,
    model,
    batchSize,
    workers,
  };

  console.log("Document Embedder");
  console.log("=================");
  console.log(
    `Storage: ${useCloud ? `R2 (${envConfig.cloudflare.r2BucketName})` : `local (${envConfig.storage.localPath})`}`
  );
  console.log(`Input: ${config.inputPrefix}/`);
  console.log(`Output: ${config.outputPrefix}/`);
  console.log(`Model: ${config.model}`);
  console.log(`Workers: ${config.workers}`);
  console.log(`Batch: ${config.batchSize === Infinity ? "all" : config.batchSize}`);
  console.log("");

  try {
    await processEmbeddings(config, flags.verbose);
  } catch (err) {
    console.error("Fatal error:", err);
    process.exit(1);
  }
}

apps/cli/index.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import { runScrape } from "./commands/scrape";
44
import { runExtract } from "./commands/extract";
5+
import { runEmbed } from "./commands/embed";
56
import { runStatus } from "./commands/status";
67

78
const VERSION = "0.1.0";
@@ -15,6 +16,7 @@ Usage
1516
Commands
1617
scrape Download .docx files from Common Crawl
1718
extract Extract text from DOCX files using Docling
19+
embed Generate embeddings for extracted documents
1820
status Show corpus statistics
1921
2022
Options
@@ -50,6 +52,9 @@ async function main() {
5052
case "extract":
5153
await runExtract(commandArgs);
5254
break;
55+
case "embed":
56+
await runEmbed(commandArgs);
57+
break;
5358
case "status":
5459
await runStatus(commandArgs);
5560
break;

apps/cli/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"dependencies": {
1616
"@docx-corpus/scraper": "workspace:*",
1717
"@docx-corpus/extractor": "workspace:*",
18+
"@docx-corpus/embedder": "workspace:*",
1819
"@docx-corpus/shared": "workspace:*"
1920
},
2021
"devDependencies": {

bun.lock

Lines changed: 16 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/embedder/config.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import type { EmbeddingModel } from "./types";
2+
3+
/** Settings for the embedder, as assembled from environment variables. */
export interface EmbedderConfig {
  storage: {
    /** Root directory used for local storage. */
    localPath: string;
  };
  cloudflare: {
    accountId: string;
    r2AccessKeyId: string;
    r2SecretAccessKey: string;
    r2BucketName: string;
  };
  embed: {
    inputPrefix: string;
    outputPrefix: string;
    model: EmbeddingModel;
    batchSize: number;
    workers: number;
  };
  voyage: {
    apiKey: string;
  };
}

/**
 * Read a positive integer from an environment value.
 * Returns `fallback` when the value is unset, empty, not numeric, or < 1.
 */
function positiveIntFromEnv(raw: string | undefined, fallback: number): number {
  const parsed = Number.parseInt(raw ?? "", 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

/**
 * Build the embedder configuration from `process.env`.
 *
 * Every field has a default, so this never throws. String settings use `||`
 * on purpose: an environment variable set to the empty string is treated the
 * same as an unset one. Numeric settings (`EMBED_BATCH_SIZE`, `EMBED_WORKERS`)
 * fall back to their defaults (100 and 4) when the value is not a positive
 * integer, instead of yielding `NaN`, zero or a negative count.
 *
 * `EMBED_MODEL` is taken as-is and is not checked against the known models.
 *
 * @returns The resolved configuration.
 */
export function loadEmbedderConfig(): EmbedderConfig {
  const env = process.env;

  return {
    storage: {
      localPath: env.STORAGE_PATH || "./corpus",
    },
    cloudflare: {
      accountId: env.CLOUDFLARE_ACCOUNT_ID || "",
      r2AccessKeyId: env.R2_ACCESS_KEY_ID || "",
      r2SecretAccessKey: env.R2_SECRET_ACCESS_KEY || "",
      r2BucketName: env.R2_BUCKET_NAME || "docx-corpus",
    },
    embed: {
      inputPrefix: env.EMBED_INPUT_PREFIX || "extracted",
      outputPrefix: env.EMBED_OUTPUT_PREFIX || "embeddings",
      model: (env.EMBED_MODEL as EmbeddingModel) || "minilm",
      batchSize: positiveIntFromEnv(env.EMBED_BATCH_SIZE, 100),
      workers: positiveIntFromEnv(env.EMBED_WORKERS, 4),
    },
    voyage: {
      apiKey: env.VOYAGE_API_KEY || "",
    },
  };
}
50+
51+
/**
 * Check if Cloudflare credentials are configured.
 *
 * @param config Embedder configuration to inspect.
 * @returns `true` only when the account ID, R2 access key ID and R2 secret
 *   access key are all non-empty. The bucket name is not considered.
 */
export function hasCloudflareCredentials(config: EmbedderConfig): boolean {
  const { accountId, r2AccessKeyId, r2SecretAccessKey } = config.cloudflare;
  return Boolean(accountId && r2AccessKeyId && r2SecretAccessKey);
}
61+
62+
/**
 * Check if Voyage API key is configured.
 *
 * @param config Embedder configuration to inspect.
 * @returns `true` when `config.voyage.apiKey` is a non-empty string.
 */
export function hasVoyageCredentials(config: EmbedderConfig): boolean {
  return Boolean(config.voyage.apiKey);
}

packages/embedder/index.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
export { processEmbeddings } from "./processor";
2+
export type { EmbedConfig, EmbeddedDocument, EmbeddingIndexEntry, EmbeddingModel } from "./types";
3+
export { loadEmbedderConfig, hasCloudflareCredentials, hasVoyageCredentials, type EmbedderConfig } from "./config";

packages/embedder/package.json

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"name": "@docx-corpus/embedder",
3+
"version": "0.1.0",
4+
"type": "module",
5+
"exports": {
6+
".": "./index.ts"
7+
},
8+
"scripts": {
9+
"typecheck": "tsc --noEmit",
10+
"setup": "cd python && uv sync"
11+
},
12+
"dependencies": {
13+
"@docx-corpus/shared": "workspace:*"
14+
},
15+
"devDependencies": {
16+
"@types/bun": "latest",
17+
"typescript": "^5.9.3"
18+
}
19+
}

0 commit comments

Comments
 (0)