11import { mkdir , rm } from "node:fs/promises" ;
22import { join , basename , dirname } from "node:path" ;
33import { tmpdir } from "node:os" ;
4- import type { ExtractConfig , ExtractedDocument , ExtractionProgress } from "./types" ;
4+ import type { ExtractConfig , ExtractedDocument } from "./types" ;
55
66const PYTHON_DIR = join ( dirname ( import . meta. path ) , "python" ) ;
77const PYTHON_PATH = join ( PYTHON_DIR , ".venv" , "bin" , "python" ) ;
88const SCRIPT_PATH = join ( PYTHON_DIR , "extract.py" ) ;
99
10- const PROGRESS_FILE = "progress.json " ;
10+ const INDEX_FILE = "index.jsonl " ;
1111const ERRORS_FILE = "errors.jsonl" ;
12- const OUTPUT_FILE = "documents.jsonl" ;
1312
1413export async function processDirectory (
1514 config : ExtractConfig ,
1615 verbose : boolean = false
1716) : Promise < void > {
18- const { storage, inputPrefix, outputPrefix } = config ;
17+ const { storage, inputPrefix, outputPrefix, batchSize } = config ;
1918
20- // List all .docx files in input prefix
19+ // Load index to get already-processed IDs
20+ const processedIds = await loadProcessedIds ( storage , outputPrefix ) ;
21+
22+ if ( processedIds . size > 0 ) {
23+ console . log ( `Already extracted: ${ processedIds . size } documents` ) ;
24+ }
25+
26+ // List .docx files, filtering out already-processed ones
2127 const files : string [ ] = [ ] ;
28+
29+ console . log ( `Scanning ${ inputPrefix } /...` ) ;
2230 for await ( const key of storage . list ( inputPrefix ) ) {
2331 if ( key . toLowerCase ( ) . endsWith ( ".docx" ) && ! basename ( key ) . startsWith ( "~$" ) ) {
24- files . push ( key ) ;
32+ const id = extractIdFromKey ( key ) ;
33+ if ( ! processedIds . has ( id ) ) {
34+ files . push ( key ) ;
35+ if ( files . length >= batchSize ) {
36+ console . log ( `Listed ${ files . length } files to process (batch limit)` ) ;
37+ break ;
38+ }
39+ }
2540 }
2641 }
2742 files . sort ( ) ;
2843
2944 if ( files . length === 0 ) {
30- console . log ( `No DOCX files found in ${ inputPrefix } ` ) ;
45+ console . log ( `No new DOCX files to process in ${ inputPrefix } ` ) ;
3146 return ;
3247 }
3348
34- console . log ( `Found ${ files . length } DOCX files` ) ;
35-
36- // Load progress
37- const progress = await loadProgress ( storage , outputPrefix ) ;
38- const startIndex = config . resume ? progress . processedFiles : 0 ;
39-
40- if ( config . resume && startIndex > 0 ) {
41- console . log ( `Resuming from file ${ startIndex + 1 } ` ) ;
42- }
43-
44- progress . totalFiles = files . length ;
45- progress . startedAt = progress . startedAt || new Date ( ) . toISOString ( ) ;
49+ console . log ( `Found ${ files . length } DOCX files to extract` ) ;
4650
4751 // Create temp directory for processing
4852 const tempDir = join ( tmpdir ( ) , `docx-extract-${ Date . now ( ) } ` ) ;
4953 await mkdir ( tempDir , { recursive : true } ) ;
5054
51- // Collect output lines in memory, write at end
52- const outputLines : string [ ] = [ ] ;
53-
54- // Load existing output if resuming
55- if ( config . resume && startIndex > 0 ) {
56- const existingOutput = await storage . read ( `${ outputPrefix } /${ OUTPUT_FILE } ` ) ;
57- if ( existingOutput ) {
58- const text = new TextDecoder ( ) . decode ( existingOutput ) ;
59- outputLines . push ( ...text . trim ( ) . split ( "\n" ) . filter ( Boolean ) ) ;
60- }
61- }
62-
63- const batches = chunkArray ( files . slice ( startIndex ) , config . batchSize ) ;
64- let totalProcessed = startIndex ;
55+ let successCount = 0 ;
56+ let errorCount = 0 ;
6557
6658 try {
67- for ( const batch of batches ) {
68- const results = await processBatch ( batch , storage , tempDir , config . workers , verbose ) ;
69-
70- for ( const result of results ) {
71- if ( result . success && result . document ) {
72- outputLines . push ( JSON . stringify ( result . document ) ) ;
73- progress . successCount ++ ;
74- } else {
75- progress . errorCount ++ ;
76- await appendError ( storage , outputPrefix , result . error || "Unknown error" , result . sourceKey ) ;
77- }
59+ const results = await processBatch ( files , storage , tempDir , config . workers , verbose ) ;
60+
61+ for ( const result of results ) {
62+ if ( result . success && result . document ) {
63+ // Write text file
64+ await storage . write (
65+ `${ outputPrefix } /${ result . document . id } .txt` ,
66+ result . document . text
67+ ) ;
68+ // Append to index (metadata)
69+ await appendToIndex ( storage , outputPrefix , result . document ) ;
70+ successCount ++ ;
71+ } else {
72+ errorCount ++ ;
73+ await appendError ( storage , outputPrefix , result . error || "Unknown error" , result . sourceKey ) ;
7874 }
79-
80- totalProcessed += batch . length ;
81- progress . processedFiles = totalProcessed ;
82- progress . lastProcessedKey = batch [ batch . length - 1 ] ;
83- progress . updatedAt = new Date ( ) . toISOString ( ) ;
84-
85- // Save progress and output
86- await saveProgress ( storage , outputPrefix , progress ) ;
87- await storage . write (
88- `${ outputPrefix } /${ OUTPUT_FILE } ` ,
89- outputLines . join ( "\n" ) + "\n"
90- ) ;
91-
92- const percent = ( ( totalProcessed / files . length ) * 100 ) . toFixed ( 1 ) ;
93- console . log (
94- `Progress: ${ totalProcessed } /${ files . length } (${ percent } %) - ` +
95- `Success: ${ progress . successCount } , Errors: ${ progress . errorCount } `
96- ) ;
9775 }
76+
77+ console . log ( `Processed: ${ successCount } success, ${ errorCount } errors` ) ;
9878 } finally {
9979 // Cleanup temp directory
10080 await rm ( tempDir , { recursive : true , force : true } ) ;
10181 }
10282
10383 console . log ( "\nExtraction complete!" ) ;
104- console . log ( ` Total: ${ files . length } ` ) ;
105- console . log ( ` Success: ${ progress . successCount } ` ) ;
106- console . log ( ` Errors: ${ progress . errorCount } ` ) ;
107- console . log ( ` Output: ${ outputPrefix } /${ OUTPUT_FILE } ` ) ;
84+ console . log ( ` Output: ${ outputPrefix } /{hash}.txt` ) ;
85+ console . log ( ` Index: ${ outputPrefix } /${ INDEX_FILE } ` ) ;
10886}
10987
11088interface ProcessResult {
@@ -133,7 +111,7 @@ async function extractWithPython(
133111 }
134112
135113 const result = JSON . parse ( stdout ) ;
136- const id = generateId ( sourceKey ) ;
114+ const id = extractIdFromKey ( sourceKey ) ;
137115
138116 return {
139117 id,
@@ -169,7 +147,7 @@ async function processBatch(
169147 throw new Error ( `File not found: ${ sourceKey } ` ) ;
170148 }
171149
172- const tempFile = join ( tempDir , `${ generateId ( sourceKey ) } .docx` ) ;
150+ const tempFile = join ( tempDir , `${ extractIdFromKey ( sourceKey ) } .docx` ) ;
173151 await Bun . write ( tempFile , content ) ;
174152
175153 // Extract using Python
@@ -201,44 +179,10 @@ async function processBatch(
201179 return results ;
202180}
203181
204- function generateId ( key : string ) : string {
205- const hasher = new Bun . CryptoHasher ( "sha256" ) ;
206- hasher . update ( key ) ;
207- return hasher . digest ( "hex" ) . slice ( 0 , 16 ) ;
208- }
209-
210- async function loadProgress (
211- storage : ExtractConfig [ "storage" ] ,
212- outputPrefix : string
213- ) : Promise < ExtractionProgress > {
214- try {
215- const content = await storage . read ( `${ outputPrefix } /${ PROGRESS_FILE } ` ) ;
216- if ( content ) {
217- return JSON . parse ( new TextDecoder ( ) . decode ( content ) ) ;
218- }
219- } catch {
220- // Ignore errors, return fresh progress
221- }
222-
223- return {
224- totalFiles : 0 ,
225- processedFiles : 0 ,
226- successCount : 0 ,
227- errorCount : 0 ,
228- startedAt : new Date ( ) . toISOString ( ) ,
229- updatedAt : new Date ( ) . toISOString ( ) ,
230- } ;
231- }
232-
233- async function saveProgress (
234- storage : ExtractConfig [ "storage" ] ,
235- outputPrefix : string ,
236- progress : ExtractionProgress
237- ) : Promise < void > {
238- await storage . write (
239- `${ outputPrefix } /${ PROGRESS_FILE } ` ,
240- JSON . stringify ( progress , null , 2 )
241- ) ;
182+ function extractIdFromKey ( key : string ) : string {
183+ // Extract hash from filename: "documents/abc123.docx" → "abc123"
184+ const filename = basename ( key ) ;
185+ return filename . replace ( / \. d o c x $ / i, "" ) ;
242186}
243187
244188async function appendError (
@@ -256,10 +200,45 @@ async function appendError(
256200 await storage . write ( errorsKey , existingText + line ) ;
257201}
258202
259- function chunkArray < T > ( array : T [ ] , size : number ) : T [ ] [ ] {
260- const chunks : T [ ] [ ] = [ ] ;
261- for ( let i = 0 ; i < array . length ; i += size ) {
262- chunks . push ( array . slice ( i , i + size ) ) ;
203+ async function loadProcessedIds (
204+ storage : ExtractConfig [ "storage" ] ,
205+ outputPrefix : string
206+ ) : Promise < Set < string > > {
207+ const ids = new Set < string > ( ) ;
208+ try {
209+ const content = await storage . read ( `${ outputPrefix } /${ INDEX_FILE } ` ) ;
210+ if ( content ) {
211+ const text = new TextDecoder ( ) . decode ( content ) ;
212+ for ( const line of text . split ( "\n" ) ) {
213+ if ( line . trim ( ) ) {
214+ const entry = JSON . parse ( line ) ;
215+ ids . add ( entry . id ) ;
216+ }
217+ }
218+ }
219+ } catch {
220+ // Index doesn't exist yet, return empty set
263221 }
264- return chunks ;
222+ return ids ;
223+ }
224+
225+ async function appendToIndex (
226+ storage : ExtractConfig [ "storage" ] ,
227+ outputPrefix : string ,
228+ doc : ExtractedDocument
229+ ) : Promise < void > {
230+ const indexKey = `${ outputPrefix } /${ INDEX_FILE } ` ;
231+ const entry = {
232+ id : doc . id ,
233+ extractedAt : doc . extractedAt ,
234+ wordCount : doc . wordCount ,
235+ charCount : doc . charCount ,
236+ tableCount : doc . tableCount ,
237+ imageCount : doc . imageCount ,
238+ } ;
239+
240+ // Read existing index and append
241+ const existing = await storage . read ( indexKey ) ;
242+ const existingText = existing ? new TextDecoder ( ) . decode ( existing ) : "" ;
243+ await storage . write ( indexKey , existingText + JSON . stringify ( entry ) + "\n" ) ;
265244}
0 commit comments