Skip to content

Commit 4ed45b9

Browse files
committed
feat: lang detection
1 parent f68916f commit 4ed45b9

8 files changed

Lines changed: 51 additions & 2 deletions

File tree

.env.example

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ EXTRACT_WORKERS=4 # Parallel worker processes
2323
# Embedder settings
2424
EMBED_INPUT_PREFIX=extracted # Input directory prefix (extracted text)
2525
EMBED_BATCH_SIZE=100 # Documents per batch
26-
EMBED_CONCURRENCY=20 # Parallel API requests (Tier 2: 5K RPM, adjust per tier)
26+
EMBED_CONCURRENCY=20 # Parallel API requests
2727
GOOGLE_API_KEY= # Required - get from https://aistudio.google.com/apikey
2828

2929
# Cloudflare R2 (optional - for cloud storage)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
-- Migration: Add language detection columns
2+
-- Run this on existing databases to add language support. Every statement uses IF NOT EXISTS, so re-running it is safe.
3+
4+
-- Add language columns to the documents table (both nullable, no default, so existing rows keep NULL)
5+
ALTER TABLE documents ADD COLUMN IF NOT EXISTS language VARCHAR(10);
6+
ALTER TABLE documents ADD COLUMN IF NOT EXISTS language_confidence REAL;
7+
8+
-- Create a partial index for language filtering (rows whose language is NULL are left out of the index)
9+
CREATE INDEX IF NOT EXISTS idx_documents_language ON documents(language) WHERE language IS NOT NULL;

db/schema.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ CREATE TABLE IF NOT EXISTS documents (
2222
char_count INTEGER,
2323
table_count INTEGER,
2424
image_count INTEGER,
25+
language VARCHAR(10),
26+
language_confidence REAL,
2527
extraction_error TEXT,
2628

2729
-- Embedding data
@@ -47,6 +49,7 @@ CREATE INDEX IF NOT EXISTS idx_documents_source_url ON documents(source_url);
4749
CREATE INDEX IF NOT EXISTS idx_documents_extracted ON documents(extracted_at) WHERE extracted_at IS NOT NULL;
4850
CREATE INDEX IF NOT EXISTS idx_documents_embedded ON documents(embedded_at) WHERE embedded_at IS NOT NULL;
4951
CREATE INDEX IF NOT EXISTS idx_documents_cluster ON documents(cluster_id) WHERE cluster_id IS NOT NULL;
52+
CREATE INDEX IF NOT EXISTS idx_documents_language ON documents(language) WHERE language IS NOT NULL;
5053

5154
-- Vector similarity search index (IVFFlat for approximate nearest neighbor)
5255
-- Note: Run this AFTER populating embeddings for better index quality

packages/extractor/processor.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ class PersistentExtractor {
8484
charCount?: number;
8585
tableCount?: number;
8686
imageCount?: number;
87+
language?: string;
88+
languageConfidence?: number;
8789
extraction?: any;
8890
error?: string;
8991
}> {
@@ -268,6 +270,8 @@ async function processBatch(
268270
char_count: result.charCount!,
269271
table_count: result.tableCount!,
270272
image_count: result.imageCount!,
273+
language: result.language || "unknown",
274+
language_confidence: result.languageConfidence || 0,
271275
extracted_at: new Date().toISOString(),
272276
});
273277

packages/extractor/python/extract_server.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,23 @@
2121
from docling.datamodel.base_models import InputFormat
2222
from docling_core.types.doc.labels import DocItemLabel
2323

24+
import langid
25+
26+
27+
_NORMALIZED_IDENTIFIER = None


def _get_normalized_identifier():
    """Return a cached langid identifier configured to report normalized probabilities."""
    global _NORMALIZED_IDENTIFIER
    if _NORMALIZED_IDENTIFIER is None:
        # Building the identifier decodes the bundled model, so do it once and reuse it.
        _NORMALIZED_IDENTIFIER = langid.langid.LanguageIdentifier.from_modelstring(
            langid.langid.model, norm_probs=True
        )
    return _NORMALIZED_IDENTIFIER


def detect_language(text: str, min_chars: int = 50) -> tuple[str, float]:
    """Detect the language of ``text`` using langid.

    Args:
        text: Text to classify. Only the first 2000 characters (after trimming
            surrounding whitespace) are passed to the classifier.
        min_chars: Minimum number of non-padding characters required before
            detection is attempted.

    Returns:
        ``(lang_code, confidence)`` where ``confidence`` is in the range 0-1.
        Returns ``("unknown", 0.0)`` when the text is missing, too short, or
        classification fails for any reason.
    """
    # Measure the trimmed text so whitespace-only or padded input is not classified.
    sample = text.strip() if text else ""
    if len(sample) < min_chars:
        return "unknown", 0.0

    try:
        # langid's default score is an unnormalized log-probability whose magnitude
        # grows with text length, so it cannot be rescaled into a confidence with a
        # fixed divisor. With norm_probs=True the classifier returns a probability.
        lang, probability = _get_normalized_identifier().classify(sample[:2000])
        confidence = max(0.0, min(1.0, float(probability)))
        return lang, confidence
    except Exception:
        # Language detection is best-effort; never fail the extraction because of it.
        return "unknown", 0.0
40+
2441

2542
@contextlib.contextmanager
2643
def suppress_stderr():
@@ -85,6 +102,9 @@ def extract(converter: DocumentConverter, file_path: str) -> dict:
85102
# Use smart extraction to avoid table padding bloat
86103
text = smart_extract_text(result.document)
87104

105+
# Detect language
106+
lang, lang_confidence = detect_language(text)
107+
88108
# Get full structured extraction (stripped of image data)
89109
extraction = result.document.export_to_dict()
90110
extraction = strip_image_data(extraction)
@@ -95,6 +115,8 @@ def extract(converter: DocumentConverter, file_path: str) -> dict:
95115
"charCount": len(text),
96116
"tableCount": len(extraction.get("tables", [])),
97117
"imageCount": len(extraction.get("pictures", [])),
118+
"language": lang,
119+
"languageConfidence": lang_confidence,
98120
"extraction": extraction,
99121
}
100122

packages/extractor/python/pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,7 @@
22
name = "text-extractor-python"
33
version = "0.1.0"
44
requires-python = ">=3.10"
5-
dependencies = ["docling>=2.0.0"]
5+
dependencies = [
6+
"docling>=2.0.0",
7+
"langid>=1.1.6",
8+
]

packages/extractor/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ export interface ExtractedDocument {
1111
charCount: number;
1212
tableCount: number;
1313
imageCount: number;
14+
language: string;
15+
languageConfidence: number;
1416
extraction: Record<string, unknown>;
1517
extractedAt: string;
1618
}

packages/shared/db.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ export interface DocumentRecord {
2121
char_count: number | null;
2222
table_count: number | null;
2323
image_count: number | null;
24+
language: string | null;
25+
language_confidence: number | null;
2426
extraction_error: string | null;
2527

2628
// Embedding data
@@ -40,6 +42,8 @@ export interface ExtractionData {
4042
char_count: number;
4143
table_count: number;
4244
image_count: number;
45+
language: string;
46+
language_confidence: number;
4347
extracted_at?: string;
4448
extraction_error?: string;
4549
}
@@ -200,6 +204,8 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
200204
char_count = ${data.char_count},
201205
table_count = ${data.table_count},
202206
image_count = ${data.image_count},
207+
language = ${data.language},
208+
language_confidence = ${data.language_confidence},
203209
extraction_error = NULL
204210
WHERE id = ${data.id}
205211
`;

0 commit comments

Comments
 (0)