Skip to content

Commit 87a2b40

Browse files
committed
feat: save raw extraction
1 parent 93c83ed commit 87a2b40

3 files changed

Lines changed: 14 additions & 5 deletions

File tree

packages/extractor/processor.ts

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ export async function processDirectory(
5353
}
5454

5555
console.log("\nExtraction complete!");
56-
console.log(` Output: ${outputPrefix}/{hash}.txt`);
56+
console.log(` Output: ${outputPrefix}/{hash}.txt, ${outputPrefix}/{hash}.json`);
5757
}
5858

5959
const EXTRACTION_TIMEOUT_MS = 30_000; // 30 seconds per document
@@ -106,6 +106,7 @@ async function extractWithPython(
106106
charCount: result.charCount,
107107
tableCount: result.tableCount,
108108
imageCount: result.imageCount,
109+
extraction: result.extraction,
109110
extractedAt: new Date().toISOString(),
110111
};
111112
}
@@ -173,6 +174,12 @@ async function processBatch(
173174
// Write text file to storage
174175
await storage.write(`${outputPrefix}/${doc.id}.txt`, extracted.text);
175176

177+
// Write extraction JSON to storage
178+
await storage.write(
179+
`${outputPrefix}/${doc.id}.json`,
180+
JSON.stringify(extracted.extraction)
181+
);
182+
176183
// Update database with extraction metadata
177184
await db.updateExtraction({
178185
id: doc.id,

packages/extractor/python/extract.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,15 +15,16 @@ def extract(file_path: str) -> dict:
1515
# Export as markdown for text extraction
1616
text = result.document.export_to_markdown()
1717

18-
# Get structured data for counts
19-
doc_dict = result.document.export_to_dict()
18+
# Get full structured extraction (raw, no stripping)
19+
extraction = result.document.export_to_dict()
2020

2121
return {
2222
"text": text,
2323
"wordCount": len(text.split()),
2424
"charCount": len(text),
25-
"tableCount": len(doc_dict.get("tables", [])),
26-
"imageCount": len(doc_dict.get("pictures", [])),
25+
"tableCount": len(extraction.get("tables", [])),
26+
"imageCount": len(extraction.get("pictures", [])),
27+
"extraction": extraction,
2728
}
2829

2930

packages/extractor/types.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ export interface ExtractedDocument {
1111
charCount: number;
1212
tableCount: number;
1313
imageCount: number;
14+
extraction: Record<string, unknown>;
1415
extractedAt: string;
1516
}
1617

0 commit comments

Comments
 (0)