Skip to content

Commit 5585cbd

Browse files
committed
feat: persistent extraction server
1 parent 3996da4 commit 5585cbd

2 files changed

Lines changed: 224 additions & 70 deletions

File tree

packages/extractor/processor.ts

Lines changed: 149 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,113 @@ import { formatProgress, writeMultiLineProgress, type DocumentRecord } from "@do
66

77
const PYTHON_DIR = join(dirname(import.meta.path), "python");
88
const PYTHON_PATH = join(PYTHON_DIR, ".venv", "bin", "python");
9-
const SCRIPT_PATH = join(PYTHON_DIR, "extract.py");
9+
const SCRIPT_PATH = join(PYTHON_DIR, "extract_server.py");
10+
11+
/**
12+
* Persistent Python extraction worker.
13+
* Spawns one Python process and communicates via stdin/stdout JSON lines.
14+
*/
15+
class PersistentExtractor {
16+
private proc: ReturnType<typeof Bun.spawn> | null = null;
17+
private stdin: ReturnType<typeof Bun.spawn>["stdin"] | null = null;
18+
private initialized = false;
19+
private readBuffer = "";
20+
private stdoutReader: ReadableStreamDefaultReader<Uint8Array> | null = null;
21+
private decoder = new TextDecoder();
22+
23+
async start(): Promise<void> {
24+
this.proc = Bun.spawn([PYTHON_PATH, SCRIPT_PATH], {
25+
stdin: "pipe",
26+
stdout: "pipe",
27+
stderr: "inherit",
28+
});
29+
30+
const stdout = this.proc.stdout;
31+
this.stdin = this.proc.stdin;
32+
33+
if (!stdout || typeof stdout === "number") {
34+
throw new Error("Failed to get stdout pipe from Python process");
35+
}
36+
if (!this.stdin || typeof this.stdin === "number") {
37+
throw new Error("Failed to get stdin pipe from Python process");
38+
}
39+
40+
// Get reader for stdout
41+
this.stdoutReader = (stdout as ReadableStream<Uint8Array>).getReader();
42+
43+
// Wait for "ready" signal (imports complete)
44+
const readyLine = await this.readLine();
45+
const ready = JSON.parse(readyLine);
46+
if (!ready.ready) {
47+
throw new Error("Python extractor failed to signal ready");
48+
}
49+
50+
// Wait for "initialized" signal (converter created)
51+
const initLine = await this.readLine();
52+
const init = JSON.parse(initLine);
53+
if (!init.initialized) {
54+
throw new Error("Python extractor failed to initialize converter");
55+
}
56+
57+
this.initialized = true;
58+
}
59+
60+
private async readLine(): Promise<string> {
61+
if (!this.stdoutReader) throw new Error("Reader not initialized");
62+
63+
while (true) {
64+
// Check if we already have a complete line in the buffer
65+
const newlineIndex = this.readBuffer.indexOf("\n");
66+
if (newlineIndex !== -1) {
67+
const line = this.readBuffer.slice(0, newlineIndex);
68+
this.readBuffer = this.readBuffer.slice(newlineIndex + 1);
69+
return line;
70+
}
71+
72+
// Read more data
73+
const { value, done } = await this.stdoutReader.read();
74+
if (done) throw new Error("Python process closed unexpectedly");
75+
76+
this.readBuffer += this.decoder.decode(value, { stream: true });
77+
}
78+
}
79+
80+
async extract(filePath: string): Promise<{
81+
success: boolean;
82+
text?: string;
83+
wordCount?: number;
84+
charCount?: number;
85+
tableCount?: number;
86+
imageCount?: number;
87+
extraction?: any;
88+
error?: string;
89+
}> {
90+
if (!this.initialized || !this.stdin || typeof this.stdin === "number") {
91+
throw new Error("Extractor not initialized");
92+
}
93+
94+
// Send file path to Python using Bun's FileSink
95+
(this.stdin as { write: (data: string) => number }).write(filePath + "\n");
96+
97+
// Read JSON response
98+
const responseLine = await this.readLine();
99+
return JSON.parse(responseLine);
100+
}
101+
102+
async stop(): Promise<void> {
103+
if (this.stdin && typeof this.stdin !== "number") {
104+
(this.stdin as { end: () => void }).end();
105+
this.stdin = null;
106+
}
107+
if (this.proc) {
108+
this.proc.kill();
109+
this.proc = null;
110+
}
111+
this.initialized = false;
112+
this.readBuffer = "";
113+
this.stdoutReader = null;
114+
}
115+
}
10116

11117
export async function processDirectory(
12118
config: ExtractConfig,
@@ -58,59 +164,6 @@ export async function processDirectory(
58164

59165
const EXTRACTION_TIMEOUT_MS = 30_000; // 30 seconds per document
60166

61-
async function extractWithPython(
62-
doc: DocumentRecord,
63-
localFilePath: string
64-
): Promise<ExtractedDocument> {
65-
const proc = Bun.spawn([PYTHON_PATH, SCRIPT_PATH, localFilePath], {
66-
stdout: "pipe",
67-
stderr: "pipe",
68-
});
69-
70-
const extractionPromise = (async () => {
71-
const stdout = await new Response(proc.stdout).text();
72-
const stderr = await new Response(proc.stderr).text();
73-
const exitCode = await proc.exited;
74-
return { stdout, stderr, exitCode };
75-
})();
76-
77-
let timeoutId: Timer;
78-
const timeoutPromise = new Promise<never>((_, reject) => {
79-
timeoutId = setTimeout(() => {
80-
proc.kill();
81-
reject(new Error(`Extraction timed out after ${EXTRACTION_TIMEOUT_MS / 1000}s`));
82-
}, EXTRACTION_TIMEOUT_MS);
83-
});
84-
85-
try {
86-
var { stdout, stderr, exitCode } = await Promise.race([
87-
extractionPromise,
88-
timeoutPromise,
89-
]);
90-
} finally {
91-
clearTimeout(timeoutId!);
92-
}
93-
94-
if (exitCode !== 0) {
95-
const errorData = stderr ? JSON.parse(stderr) : { error: "Unknown error" };
96-
throw new Error(errorData.error || "Python extraction failed");
97-
}
98-
99-
const result = JSON.parse(stdout);
100-
101-
return {
102-
id: doc.id,
103-
sourceKey: `documents/${doc.id}.docx`,
104-
text: result.text,
105-
wordCount: result.wordCount,
106-
charCount: result.charCount,
107-
tableCount: result.tableCount,
108-
imageCount: result.imageCount,
109-
extraction: result.extraction,
110-
extractedAt: new Date().toISOString(),
111-
};
112-
}
113-
114167
async function processBatch(
115168
documents: DocumentRecord[],
116169
config: ExtractConfig,
@@ -151,7 +204,20 @@ async function processBatch(
151204

152205
const progressInterval = !verbose ? setInterval(updateProgress, 100) : null;
153206

154-
const processFile = async (): Promise<void> => {
207+
// Start pool of persistent Python extractors (one per worker)
208+
const numWorkers = Math.min(workers, documents.length);
209+
const extractors: PersistentExtractor[] = [];
210+
211+
console.log(`Starting ${numWorkers} persistent Python extractor(s)...`);
212+
for (let i = 0; i < numWorkers; i++) {
213+
const extractor = new PersistentExtractor();
214+
await extractor.start();
215+
extractors.push(extractor);
216+
}
217+
console.log(`${numWorkers} extractor(s) ready, processing documents...`);
218+
219+
// Worker function - each worker uses its own extractor
220+
const processWorker = async (extractor: PersistentExtractor): Promise<void> => {
155221
while (queue.length > 0) {
156222
const doc = queue.shift();
157223
if (!doc) continue;
@@ -168,26 +234,37 @@ async function processBatch(
168234
const tempFile = join(tempDir, `${doc.id}.docx`);
169235
await Bun.write(tempFile, content);
170236

171-
// Extract using Python
172-
const extracted = await extractWithPython(doc, tempFile);
237+
// Extract using persistent Python worker with timeout
238+
const extractPromise = extractor.extract(tempFile);
239+
const timeoutPromise = new Promise<never>((_, reject) => {
240+
setTimeout(() => {
241+
reject(new Error(`Extraction timed out after ${EXTRACTION_TIMEOUT_MS / 1000}s`));
242+
}, EXTRACTION_TIMEOUT_MS);
243+
});
244+
245+
const result = await Promise.race([extractPromise, timeoutPromise]);
246+
247+
if (!result.success) {
248+
throw new Error(result.error || "Extraction failed");
249+
}
173250

174251
// Write text file to storage
175-
await storage.write(`${outputPrefix}/${doc.id}.txt`, extracted.text);
252+
await storage.write(`${outputPrefix}/${doc.id}.txt`, result.text!);
176253

177254
// Write extraction JSON to storage
178255
await storage.write(
179256
`${outputPrefix}/${doc.id}.json`,
180-
JSON.stringify(extracted.extraction)
257+
JSON.stringify(result.extraction)
181258
);
182259

183260
// Update database with extraction metadata
184261
await db.updateExtraction({
185262
id: doc.id,
186-
word_count: extracted.wordCount,
187-
char_count: extracted.charCount,
188-
table_count: extracted.tableCount,
189-
image_count: extracted.imageCount,
190-
extracted_at: extracted.extractedAt,
263+
word_count: result.wordCount!,
264+
char_count: result.charCount!,
265+
table_count: result.tableCount!,
266+
image_count: result.imageCount!,
267+
extracted_at: new Date().toISOString(),
191268
});
192269

193270
successCount++;
@@ -196,7 +273,7 @@ async function processBatch(
196273
await rm(tempFile, { force: true });
197274

198275
if (verbose) {
199-
console.log(` Extracted: ${doc.id} (${extracted.wordCount} words)`);
276+
console.log(` Extracted: ${doc.id} (${result.wordCount} words)`);
200277
}
201278
} catch (err) {
202279
const error = err instanceof Error ? err.message : String(err);
@@ -212,11 +289,13 @@ async function processBatch(
212289
}
213290
};
214291

215-
const workerPromises = Array(Math.min(workers, documents.length))
216-
.fill(null)
217-
.map(() => processFile());
218-
219-
await Promise.all(workerPromises);
292+
try {
293+
// Run all workers in parallel, each with its own extractor
294+
await Promise.all(extractors.map(extractor => processWorker(extractor)));
295+
} finally {
296+
// Always stop all extractors
297+
await Promise.all(extractors.map(e => e.stop()));
298+
}
220299

221300
// Clean up progress display
222301
if (progressInterval) {
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python3
2+
"""Persistent DocX extraction server using Docling.
3+
4+
Reads file paths from stdin (one per line), outputs JSON per line to stdout.
5+
This avoids the overhead of spawning a new Python process for each document.
6+
"""
7+
import json
8+
import sys
9+
from pathlib import Path
10+
11+
from docling.document_converter import DocumentConverter
12+
from docling.datamodel.base_models import InputFormat
13+
14+
15+
def strip_image_data(extraction: dict) -> dict:
    """Return ``extraction`` with base64 image payloads removed.

    Docling's ``export_to_dict()`` embeds each picture's raw image data under
    the ``"image"`` key of every entry in ``extraction["pictures"]``; dropping
    that key drastically shrinks the JSON sent back over stdout.

    Args:
        extraction: The dict produced by ``document.export_to_dict()``.

    Returns:
        A shallow copy whose ``"pictures"`` entries no longer carry the
        ``"image"`` key, or the input unchanged if it has no ``"pictures"``.
        Unlike the previous implementation, the caller's dict is NOT mutated
        in place.
    """
    if "pictures" not in extraction:
        return extraction

    stripped = dict(extraction)
    stripped["pictures"] = [
        {key: value for key, value in picture.items() if key != "image"}
        for picture in extraction["pictures"]
    ]
    return stripped
23+
24+
25+
def extract(converter: DocumentConverter, file_path: str) -> dict:
    """Run Docling over one DOCX file and summarize the result.

    Args:
        converter: A pre-initialized Docling ``DocumentConverter``, reused
            across calls so setup cost is paid only once per process.
        file_path: Path to the .docx file to convert.

    Returns:
        A dict with the markdown text, basic counts, and the full structured
        extraction (with inline image payloads stripped).
    """
    document = converter.convert(file_path).document

    # The markdown export doubles as the plain-text representation.
    text = document.export_to_markdown()

    # Full structured dump, minus base64 image payloads (they bloat the JSON).
    extraction = strip_image_data(document.export_to_dict())

    words = text.split()
    return {
        "text": text,
        "wordCount": len(words),
        "charCount": len(text),
        "tableCount": len(extraction.get("tables", [])),
        "imageCount": len(extraction.get("pictures", [])),
        "extraction": extraction,
    }
44+
45+
46+
def _emit(payload: dict) -> None:
    """Write one JSON message line to stdout, flushed immediately."""
    print(json.dumps(payload), flush=True)


def main():
    """Serve extraction requests over stdin/stdout until stdin closes.

    Emits ``{"ready": true}`` once imports are done and
    ``{"initialized": true}`` once the converter exists, then answers each
    request line (a file path) with exactly one JSON response line.
    """
    # Signal that imports have completed.
    _emit({"ready": True})

    # One converter for the whole process lifetime; DOCX-only so Docling
    # skips loading PDF models.
    converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])
    _emit({"initialized": True})

    for raw_line in sys.stdin:
        file_path = raw_line.strip()
        if not file_path:
            continue  # ignore blank lines

        try:
            if not Path(file_path).exists():
                _emit({"success": False, "error": f"File not found: {file_path}"})
                continue

            _emit({"success": True, **extract(converter, file_path)})

        except Exception as e:
            # Never crash the server on a bad document; report and keep going.
            _emit({"success": False, "error": str(e)})


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)