|
| 1 | +import { |
| 2 | + processEmbeddings, |
| 3 | + loadEmbedderConfig, |
| 4 | + hasCloudflareCredentials, |
| 5 | + hasVoyageCredentials, |
| 6 | + type EmbedConfig, |
| 7 | + type EmbeddingModel, |
| 8 | +} from "@docx-corpus/embedder"; |
| 9 | +import { createLocalStorage, createR2Storage } from "@docx-corpus/shared"; |
| 10 | + |
/**
 * Command-line options recognised by `corpus embed`.
 *
 * Optional members stay `undefined` when the corresponding flag was not
 * supplied, so callers can fall back to their own defaults.
 */
interface ParsedFlags {
  /** `true` when `--verbose` / `-v` was passed; `false` otherwise. */
  verbose: boolean;
  /** Embedding model named by `--model` / `-m`. */
  model?: EmbeddingModel;
  /** Value of `--batch` / `-b`. */
  batchSize?: number;
  /** Value of `--workers` / `-w`. */
  workers?: number;
}
| 17 | + |
/**
 * Parses the arguments of `corpus embed` into a `ParsedFlags` object.
 *
 * Recognised flags:
 * - `--model` / `-m <name>`: stored as `model`
 * - `--batch` / `-b <n>`: stored as `batchSize`
 * - `--workers` / `-w <n>`: stored as `workers`
 * - `--verbose` / `-v`: sets `verbose` to `true`
 *
 * Arguments that match none of these are ignored. Flags that were not passed
 * are left unset so the caller can apply its own defaults.
 *
 * @param args Raw argument list (without the executable / subcommand name).
 * @returns The parsed flags; `verbose` defaults to `false`.
 * @throws Error when a value-taking flag has no value, or when `--batch` /
 *   `--workers` is not a positive integer. Without this check a missing or
 *   non-numeric value would be stored as `NaN`, which is not nullish and
 *   therefore survives `??` defaulting in callers.
 */
function parseFlags(args: string[]): ParsedFlags {
  const flags: ParsedFlags = {
    verbose: false,
  };

  // Returns the value following `flag`; a following token that looks like
  // another flag is treated as "no value" instead of being swallowed.
  const valueFor = (flag: string, value: string | undefined): string => {
    if (value === undefined || value === "" || value.startsWith("-")) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  // Parses the value following `flag` as a base-10 integer >= 1.
  const positiveIntFor = (flag: string, value: string | undefined): number => {
    const parsed = parseInt(value ?? "", 10);
    if (!Number.isInteger(parsed) || parsed < 1) {
      const shown = value === undefined ? "nothing" : JSON.stringify(value);
      throw new Error(`${flag} requires a positive integer, got ${shown}`);
    }
    return parsed;
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    const next = args[i + 1];

    switch (arg) {
      case "--model":
      case "-m":
        // The name is only checked for presence here; it is not compared
        // against the set of models the embedder supports.
        flags.model = valueFor(arg, next) as EmbeddingModel;
        i++;
        break;
      case "--batch":
      case "-b":
        flags.batchSize = positiveIntFor(arg, next);
        i++;
        break;
      case "--workers":
      case "-w":
        flags.workers = positiveIntFor(arg, next);
        i++;
        break;
      case "--verbose":
      case "-v":
        flags.verbose = true;
        break;
    }
  }

  return flags;
}
| 52 | + |
/**
 * Usage text for `corpus embed`, printed when `--help` or `-h` is passed.
 *
 * Entries are indented two spaces under their section heading and the
 * description columns are aligned so the output stays readable in a terminal.
 */
const HELP = `
corpus embed - Generate embeddings for extracted documents

Usage
  corpus embed [options]

Storage is auto-selected based on environment:
  - With R2 credentials: reads from r2://extracted/, writes to r2://embeddings/
  - Without R2 credentials: reads from ./corpus/extracted/, writes to ./corpus/embeddings/

Already-embedded files are automatically skipped (tracked in index.jsonl).

Options
  --model, -m <name>    Embedding model (default: minilm)
                          minilm      - all-MiniLM-L6-v2 (fast, 384 dims)
                          bge-m3      - BAAI/bge-m3 (better quality, 1024 dims)
                          voyage-lite - Voyage 3.5 lite (best, requires API key)
  --batch, -b <n>       Limit to n documents (default: all)
  --workers, -w <n>     Number of parallel workers (default: 4)
  --verbose, -v         Show detailed progress
  --help, -h            Show this help

Environment Variables
  STORAGE_PATH            Local storage path (default: ./corpus)
  CLOUDFLARE_ACCOUNT_ID   Cloudflare account ID (enables R2)
  R2_ACCESS_KEY_ID        R2 access key
  R2_SECRET_ACCESS_KEY    R2 secret key
  R2_BUCKET_NAME          R2 bucket (default: docx-corpus)
  EMBED_INPUT_PREFIX      Input prefix (default: extracted)
  EMBED_OUTPUT_PREFIX     Output prefix (default: embeddings)
  EMBED_MODEL             Default model (default: minilm)
  EMBED_WORKERS           Worker count (default: 4)
  VOYAGE_API_KEY          Voyage AI API key (required for voyage-lite)

Examples
  corpus embed                   # Embed all documents with minilm
  corpus embed -m bge-m3         # Use BGE-M3 model
  corpus embed -m voyage-lite    # Use Voyage API (requires VOYAGE_API_KEY)
  corpus embed -b 100 -v         # Limit to 100, verbose output
`;
| 93 | + |
/**
 * Entry point for the `corpus embed` command.
 *
 * Steps:
 * 1. Prints the help text and exits with code 0 when `--help` / `-h` is present.
 * 2. Parses and validates the flags, then loads the environment configuration.
 * 3. Picks R2 storage when `hasCloudflareCredentials` reports credentials,
 *    otherwise local storage rooted at the configured local path.
 * 4. Prints a summary of the run and calls `processEmbeddings`.
 *
 * The process exits with code 1 when the arguments are invalid, when the
 * `voyage-lite` model is selected without Voyage credentials, or when
 * `processEmbeddings` rejects.
 *
 * @param args Raw arguments following the `embed` subcommand.
 */
export async function runEmbed(args: string[]): Promise<void> {
  if (args.includes("--help") || args.includes("-h")) {
    console.log(HELP);
    process.exit(0);
  }

  let flags: ParsedFlags;
  try {
    flags = parseFlags(args);
  } catch (err) {
    // Report an argument-parsing failure as a CLI error rather than letting it
    // escape as an unhandled rejection.
    console.error(`Error: ${err instanceof Error ? err.message : String(err)}`);
    process.exit(1);
  }

  // `??` only replaces null/undefined, so a NaN (e.g. from `-b abc`), zero or
  // negative count would otherwise be handed to the embedder unchanged.
  const numericFlags: ReadonlyArray<readonly [string, number | undefined]> = [
    ["--batch", flags.batchSize],
    ["--workers", flags.workers],
  ];
  for (const [label, value] of numericFlags) {
    if (value !== undefined && !(Number.isInteger(value) && value > 0)) {
      console.error(`Error: ${label} requires a positive integer`);
      process.exit(1);
    }
  }

  const envConfig = loadEmbedderConfig();
  const useCloud = hasCloudflareCredentials(envConfig);
  // A model given on the command line wins over the configured default.
  const model = flags.model ?? envConfig.embed.model;

  // Validate Voyage API key if needed
  if (model === "voyage-lite" && !hasVoyageCredentials(envConfig)) {
    console.error("Error: VOYAGE_API_KEY environment variable required for voyage-lite model");
    process.exit(1);
  }

  // Create storage based on credentials
  const storage = useCloud
    ? createR2Storage({
        accountId: envConfig.cloudflare.accountId,
        accessKeyId: envConfig.cloudflare.r2AccessKeyId,
        secretAccessKey: envConfig.cloudflare.r2SecretAccessKey,
        bucket: envConfig.cloudflare.r2BucketName,
      })
    : createLocalStorage(envConfig.storage.localPath);

  const config: EmbedConfig = {
    storage,
    inputPrefix: envConfig.embed.inputPrefix,
    outputPrefix: envConfig.embed.outputPrefix,
    model,
    // No --batch flag means "no limit".
    batchSize: flags.batchSize ?? Infinity,
    workers: flags.workers ?? envConfig.embed.workers,
  };

  console.log("Document Embedder");
  console.log("=================");
  console.log(
    `Storage: ${useCloud ? `R2 (${envConfig.cloudflare.r2BucketName})` : `local (${envConfig.storage.localPath})`}`
  );
  console.log(`Input: ${config.inputPrefix}/`);
  console.log(`Output: ${config.outputPrefix}/`);
  console.log(`Model: ${config.model}`);
  console.log(`Workers: ${config.workers}`);
  console.log(`Batch: ${config.batchSize === Infinity ? "all" : config.batchSize}`);
  console.log("");

  try {
    await processEmbeddings(config, flags.verbose);
  } catch (err) {
    console.error("Fatal error:", err);
    process.exit(1);
  }
}
0 commit comments