1- import { processDirectory , type ExtractConfig } from "@docx-corpus/extractor" ;
1+ import {
2+ processDirectory ,
3+ loadExtractorConfig ,
4+ hasCloudflareCredentials ,
5+ type ExtractConfig ,
6+ } from "@docx-corpus/extractor" ;
7+ import { createLocalStorage , createR2Storage } from "@docx-corpus/shared" ;
28
/**
 * Command-line flags for `corpus extract`, as returned by `parseFlags`.
 *
 * In this diff rendering, lines marked `-` are the fields before the change and
 * lines marked `+` are the fields that replace them.
 */
39interface ParsedFlags {
4- inputDir : string ;
5- outputDir : string ;
6- batchSize : number ;
7- workers : number ;
// Optional: `parseFlags` only assigns this when --batch-size / -b is seen.
// `runExtract` falls back to `envConfig.extract.batchSize` via `??` when it is undefined.
10+ batchSize ?: number ;
// Optional: `parseFlags` only assigns this when --workers / -w is seen.
// `runExtract` falls back to `envConfig.extract.workers` via `??` when it is undefined.
11+ workers ?: number ;
// Both booleans start as `false` in the object literal that `parseFlags` builds.
812 resume : boolean ;
913 verbose : boolean ;
1014}
1115
/**
 * Builds a `ParsedFlags` object from the raw argument list.
 *
 * Only part of the body is visible here: the loop that defines `i` and `arg`,
 * and the handling of `--resume` / `--verbose`, fall inside elided diff hunks.
 *
 * @param args - Raw command-line arguments for the `extract` subcommand.
 * @returns The flags object; after this change `batchSize` and `workers` stay
 *   undefined unless their options are present.
 */
1216function parseFlags ( args : string [ ] ) : ParsedFlags {
1317 const flags : ParsedFlags = {
14- inputDir : "" ,
15- outputDir : "" ,
16- batchSize : 100 ,
17- workers : 4 ,
1818 resume : false ,
1919 verbose : false ,
2020 } ;
@@ -24,24 +24,14 @@ function parseFlags(args: string[]): ParsedFlags {
// `next` is the token following the current one; value-taking options read it
// and then advance `i` an extra step to consume it.
2424 const next = args [ i + 1 ] ;
2525

2626 switch ( arg ) {
27- case "--input" :
28- case "-i" :
29- flags . inputDir = next || "" ;
30- i ++ ;
31- break ;
32- case "--output" :
33- case "-o" :
34- flags . outputDir = next || "" ;
35- i ++ ;
36- break ;
3727 case "--batch-size" :
3828 case "-b" :
39- flags . batchSize = parseInt ( next || "100 " , 10 ) ;
// NOTE(review): with a missing or non-numeric value this parseInt call yields NaN.
// NaN is not nullish, so the `??` fallback in `runExtract` will not replace it,
// and the range check formerly done by `validateFlags` is removed in this change.
// Confirm the value is validated elsewhere.
29+ flags . batchSize = parseInt ( next || "" , 10 ) ;
4030 i ++ ;
4131 break ;
4232 case "--workers" :
4333 case "-w" :
44- flags . workers = parseInt ( next || "4 " , 10 ) ;
// NOTE(review): same NaN concern as for --batch-size above.
34+ flags . workers = parseInt ( next || "" , 10 ) ;
4535 i ++ ;
4636 break ;
4737 case "--resume" :
@@ -58,35 +48,38 @@ function parseFlags(args: string[]): ParsedFlags {
5848 return flags ;
5949}
6050
/**
 * Checks parsed flags and reports the first problem found.
 *
 * Every line of this function carries the diff removal marker, so it is deleted
 * by this change, as is its call site in `runExtract`.
 *
 * @param flags - Flags produced by `parseFlags`.
 * @returns An error message for the first failing check, or `null` when all pass.
 */
61- function validateFlags ( flags : ParsedFlags ) : string | null {
// Empty strings (the old defaults) count as "not provided".
62- if ( ! flags . inputDir ) return "Error: --input (-i) is required" ;
63- if ( ! flags . outputDir ) return "Error: --output (-o) is required" ;
// Range checks. A NaN value makes both comparisons false, so NaN is not rejected here.
64- if ( flags . batchSize < 1 || flags . batchSize > 10000 ) {
65- return "Error: --batch-size must be between 1 and 10000" ;
66- }
67- if ( flags . workers < 1 || flags . workers > 32 ) {
68- return "Error: --workers must be between 1 and 32" ;
69- }
70- return null ;
71- }
72-
/**
 * Usage text for the `corpus extract` subcommand.
 *
 * NOTE(review): the code that prints this text is not visible in this view;
 * presumably `runExtract` prints it for --help / -h. The defaults quoted in the
 * text are not set in this file, so confirm them against the extractor config loader.
 */
7351const HELP = `
7452corpus extract - Extract text from DOCX files using Docling
7553
7654Usage
7755 corpus extract [options]
7856
57+ Storage is auto-selected based on environment:
58+ - With R2 credentials: reads from r2://documents/, writes to r2://extracted/
59+ - Without R2 credentials: reads from ./corpus/documents/, writes to ./corpus/extracted/
60+
7961Options
80- --input, -i <dir> Input directory containing DOCX files (required)
81- --output, -o <dir> Output directory for extracted data (required)
82- --batch-size, -b <n> Number of files per batch (default: 100)
83- --workers, -w <n> Number of parallel workers (default: 4)
62+ --batch-size, -b <n> Number of files per batch (default: from EXTRACT_BATCH_SIZE or 100)
63+ --workers, -w <n> Number of parallel workers (default: from EXTRACT_WORKERS or 4)
8464 --resume, -r Resume from last checkpoint
8565 --verbose, -v Show detailed progress
66+ --help, -h Show this help
67+
68+ Environment Variables
69+ STORAGE_PATH Local storage path (default: ./corpus)
70+ CLOUDFLARE_ACCOUNT_ID Cloudflare account ID (enables R2)
71+ R2_ACCESS_KEY_ID R2 access key
72+ R2_SECRET_ACCESS_KEY R2 secret key
73+ R2_BUCKET_NAME R2 bucket (default: docx-corpus)
74+ EXTRACT_INPUT_PREFIX Input prefix (default: documents)
75+ EXTRACT_OUTPUT_PREFIX Output prefix (default: extracted)
76+ EXTRACT_BATCH_SIZE Batch size (default: 100)
77+ EXTRACT_WORKERS Worker count (default: 4)
8678
8779Examples
88- corpus extract -i ./docs -o ./output
89- corpus extract -i ./docs -o ./output --resume -v
80+ corpus extract # Use defaults from env
81+ corpus extract --resume -v # Resume with verbose output
82+ corpus extract -b 50 -w 8 # Custom batch/workers
9083` ;
9184
9285export async function runExtract ( args : string [ ] ) {
@@ -96,27 +89,37 @@ export async function runExtract(args: string[]) {
9689 }
9790
9891 const flags = parseFlags ( args ) ;
99- const error = validateFlags ( flags ) ;
100-
101- if ( error ) {
102- console . error ( error ) ;
103- console . error ( "Use 'corpus extract --help' for usage information" ) ;
104- process . exit ( 1 ) ;
105- }
92+ const envConfig = loadExtractorConfig ( ) ;
93+ const useCloud = hasCloudflareCredentials ( envConfig ) ;
94+
95+ // Create storage based on credentials
96+ const storage = useCloud
97+ ? createR2Storage ( {
98+ accountId : envConfig . cloudflare . accountId ,
99+ accessKeyId : envConfig . cloudflare . r2AccessKeyId ,
100+ secretAccessKey : envConfig . cloudflare . r2SecretAccessKey ,
101+ bucket : envConfig . cloudflare . r2BucketName ,
102+ } )
103+ : createLocalStorage ( envConfig . storage . localPath ) ;
106104
107105 const config : ExtractConfig = {
108- inputDir : flags . inputDir ,
109- outputDir : flags . outputDir ,
110- batchSize : flags . batchSize ,
111- workers : flags . workers ,
106+ storage,
107+ inputPrefix : envConfig . extract . inputPrefix ,
108+ outputPrefix : envConfig . extract . outputPrefix ,
109+ batchSize : flags . batchSize ?? envConfig . extract . batchSize ,
110+ workers : flags . workers ?? envConfig . extract . workers ,
112111 resume : flags . resume ,
113112 } ;
114113
115114 console . log ( "Text Extractor" ) ;
116115 console . log ( "==============" ) ;
117- console . log ( `Input: ${ config . inputDir } ` ) ;
118- console . log ( `Output: ${ config . outputDir } ` ) ;
116+ console . log (
117+ `Storage: ${ useCloud ? `R2 (${ envConfig . cloudflare . r2BucketName } )` : `local (${ envConfig . storage . localPath } )` } `
118+ ) ;
119+ console . log ( `Input: ${ config . inputPrefix } /` ) ;
120+ console . log ( `Output: ${ config . outputPrefix } /` ) ;
119121 console . log ( `Workers: ${ config . workers } ` ) ;
122+ console . log ( `Batch: ${ config . batchSize } ` ) ;
120123 if ( config . resume ) console . log ( "Resume: enabled" ) ;
121124 console . log ( "" ) ;
122125
0 commit comments