File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -53,7 +53,7 @@ export async function processDirectory(
5353 }
5454
5555 console . log ( "\nExtraction complete!" ) ;
56- console . log ( ` Output: ${ outputPrefix } /{hash}.txt` ) ;
56+ console . log ( ` Output: ${ outputPrefix } /{hash}.txt, ${ outputPrefix } /{hash}.json ` ) ;
5757}
5858
5959const EXTRACTION_TIMEOUT_MS = 30_000 ; // 30 seconds per document
@@ -106,6 +106,7 @@ async function extractWithPython(
106106 charCount : result . charCount ,
107107 tableCount : result . tableCount ,
108108 imageCount : result . imageCount ,
109+ extraction : result . extraction ,
109110 extractedAt : new Date ( ) . toISOString ( ) ,
110111 } ;
111112}
@@ -173,6 +174,12 @@ async function processBatch(
173174 // Write text file to storage
174175 await storage . write ( `${ outputPrefix } /${ doc . id } .txt` , extracted . text ) ;
175176
177+ // Write extraction JSON to storage
178+ await storage . write (
179+ `${ outputPrefix } /${ doc . id } .json` ,
180+ JSON . stringify ( extracted . extraction )
181+ ) ;
182+
176183 // Update database with extraction metadata
177184 await db . updateExtraction ( {
178185 id : doc . id ,
Original file line number Diff line number Diff line change @@ -15,15 +15,16 @@ def extract(file_path: str) -> dict:
1515 # Export as markdown for text extraction
1616 text = result .document .export_to_markdown ()
1717
18- # Get structured data for counts
19- doc_dict = result .document .export_to_dict ()
18+ # Get full structured extraction (raw, no stripping)
19+ extraction = result .document .export_to_dict ()
2020
2121 return {
2222 "text" : text ,
2323 "wordCount" : len (text .split ()),
2424 "charCount" : len (text ),
25- "tableCount" : len (doc_dict .get ("tables" , [])),
26- "imageCount" : len (doc_dict .get ("pictures" , [])),
25+ "tableCount" : len (extraction .get ("tables" , [])),
26+ "imageCount" : len (extraction .get ("pictures" , [])),
27+ "extraction" : extraction ,
2728 }
2829
2930
Original file line number Diff line number Diff line change @@ -11,6 +11,7 @@ export interface ExtractedDocument {
1111 charCount : number ;
1212 tableCount : number ;
1313 imageCount : number ;
14+ extraction : Record < string , unknown > ;
1415 extractedAt : string ;
1516}
1617
You can’t perform that action at this time.
0 commit comments