Skip to content

Commit 848d7a8

Browse files
committed
feat: move storage to a dedicated package
1 parent 8bf65f9 commit 848d7a8

18 files changed

Lines changed: 798 additions & 237 deletions

File tree

apps/cli/commands/extract.ts

Lines changed: 55 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
1-
import { processDirectory, type ExtractConfig } from "@docx-corpus/extractor";
1+
import {
2+
processDirectory,
3+
loadExtractorConfig,
4+
hasCloudflareCredentials,
5+
type ExtractConfig,
6+
} from "@docx-corpus/extractor";
7+
import { createLocalStorage, createR2Storage } from "@docx-corpus/shared";
28

39
interface ParsedFlags {
4-
inputDir: string;
5-
outputDir: string;
6-
batchSize: number;
7-
workers: number;
10+
batchSize?: number;
11+
workers?: number;
812
resume: boolean;
913
verbose: boolean;
1014
}
1115

1216
function parseFlags(args: string[]): ParsedFlags {
1317
const flags: ParsedFlags = {
14-
inputDir: "",
15-
outputDir: "",
16-
batchSize: 100,
17-
workers: 4,
1818
resume: false,
1919
verbose: false,
2020
};
@@ -24,24 +24,14 @@ function parseFlags(args: string[]): ParsedFlags {
2424
const next = args[i + 1];
2525

2626
switch (arg) {
27-
case "--input":
28-
case "-i":
29-
flags.inputDir = next || "";
30-
i++;
31-
break;
32-
case "--output":
33-
case "-o":
34-
flags.outputDir = next || "";
35-
i++;
36-
break;
3727
case "--batch-size":
3828
case "-b":
39-
flags.batchSize = parseInt(next || "100", 10);
29+
flags.batchSize = parseInt(next || "", 10);
4030
i++;
4131
break;
4232
case "--workers":
4333
case "-w":
44-
flags.workers = parseInt(next || "4", 10);
34+
flags.workers = parseInt(next || "", 10);
4535
i++;
4636
break;
4737
case "--resume":
@@ -58,35 +48,38 @@ function parseFlags(args: string[]): ParsedFlags {
5848
return flags;
5949
}
6050

61-
function validateFlags(flags: ParsedFlags): string | null {
62-
if (!flags.inputDir) return "Error: --input (-i) is required";
63-
if (!flags.outputDir) return "Error: --output (-o) is required";
64-
if (flags.batchSize < 1 || flags.batchSize > 10000) {
65-
return "Error: --batch-size must be between 1 and 10000";
66-
}
67-
if (flags.workers < 1 || flags.workers > 32) {
68-
return "Error: --workers must be between 1 and 32";
69-
}
70-
return null;
71-
}
72-
7351
const HELP = `
7452
corpus extract - Extract text from DOCX files using Docling
7553
7654
Usage
7755
corpus extract [options]
7856
57+
Storage is auto-selected based on environment:
58+
- With R2 credentials: reads from r2://documents/, writes to r2://extracted/
59+
- Without R2 credentials: reads from ./corpus/documents/, writes to ./corpus/extracted/
60+
7961
Options
80-
--input, -i <dir> Input directory containing DOCX files (required)
81-
--output, -o <dir> Output directory for extracted data (required)
82-
--batch-size, -b <n> Number of files per batch (default: 100)
83-
--workers, -w <n> Number of parallel workers (default: 4)
62+
--batch-size, -b <n> Number of files per batch (default: from EXTRACT_BATCH_SIZE or 100)
63+
--workers, -w <n> Number of parallel workers (default: from EXTRACT_WORKERS or 4)
8464
--resume, -r Resume from last checkpoint
8565
--verbose, -v Show detailed progress
66+
--help, -h Show this help
67+
68+
Environment Variables
69+
STORAGE_PATH Local storage path (default: ./corpus)
70+
CLOUDFLARE_ACCOUNT_ID Cloudflare account ID (enables R2)
71+
R2_ACCESS_KEY_ID R2 access key
72+
R2_SECRET_ACCESS_KEY R2 secret key
73+
R2_BUCKET_NAME R2 bucket (default: docx-corpus)
74+
EXTRACT_INPUT_PREFIX Input prefix (default: documents)
75+
EXTRACT_OUTPUT_PREFIX Output prefix (default: extracted)
76+
EXTRACT_BATCH_SIZE Batch size (default: 100)
77+
EXTRACT_WORKERS Worker count (default: 4)
8678
8779
Examples
88-
corpus extract -i ./docs -o ./output
89-
corpus extract -i ./docs -o ./output --resume -v
80+
corpus extract # Use defaults from env
81+
corpus extract --resume -v # Resume with verbose output
82+
corpus extract -b 50 -w 8 # Custom batch/workers
9083
`;
9184

9285
export async function runExtract(args: string[]) {
@@ -96,27 +89,37 @@ export async function runExtract(args: string[]) {
9689
}
9790

9891
const flags = parseFlags(args);
99-
const error = validateFlags(flags);
100-
101-
if (error) {
102-
console.error(error);
103-
console.error("Use 'corpus extract --help' for usage information");
104-
process.exit(1);
105-
}
92+
const envConfig = loadExtractorConfig();
93+
const useCloud = hasCloudflareCredentials(envConfig);
94+
95+
// Create storage based on credentials
96+
const storage = useCloud
97+
? createR2Storage({
98+
accountId: envConfig.cloudflare.accountId,
99+
accessKeyId: envConfig.cloudflare.r2AccessKeyId,
100+
secretAccessKey: envConfig.cloudflare.r2SecretAccessKey,
101+
bucket: envConfig.cloudflare.r2BucketName,
102+
})
103+
: createLocalStorage(envConfig.storage.localPath);
106104

107105
const config: ExtractConfig = {
108-
inputDir: flags.inputDir,
109-
outputDir: flags.outputDir,
110-
batchSize: flags.batchSize,
111-
workers: flags.workers,
106+
storage,
107+
inputPrefix: envConfig.extract.inputPrefix,
108+
outputPrefix: envConfig.extract.outputPrefix,
109+
batchSize: flags.batchSize ?? envConfig.extract.batchSize,
110+
workers: flags.workers ?? envConfig.extract.workers,
112111
resume: flags.resume,
113112
};
114113

115114
console.log("Text Extractor");
116115
console.log("==============");
117-
console.log(`Input: ${config.inputDir}`);
118-
console.log(`Output: ${config.outputDir}`);
116+
console.log(
117+
`Storage: ${useCloud ? `R2 (${envConfig.cloudflare.r2BucketName})` : `local (${envConfig.storage.localPath})`}`
118+
);
119+
console.log(`Input: ${config.inputPrefix}/`);
120+
console.log(`Output: ${config.outputPrefix}/`);
119121
console.log(`Workers: ${config.workers}`);
122+
console.log(`Batch: ${config.batchSize}`);
120123
if (config.resume) console.log("Resume: enabled");
121124
console.log("");
122125

0 commit comments

Comments
 (0)