-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf-text-quality.ts
More file actions
72 lines (62 loc) · 2.12 KB
/
pdf-text-quality.ts
File metadata and controls
72 lines (62 loc) · 2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import type {
PdfTextExtractionMetrics,
PdfTextQualityAssessment
} from './pdf-processing-types';
/**
 * Exported Spanish message stating that not enough text could be reliably
 * extracted from the PDF; it suggests a PDF with selectable text or another
 * provider and notes that the project does not include OCR.
 */
export const PDF_TEXT_EXTRACTION_INSUFFICIENT_MESSAGE =
  'No se pudo extraer suficiente texto del PDF de forma confiable. Usa un PDF con texto seleccionable o prueba otro proveedor. Este proyecto no incluye OCR.';

// --- Thresholds applied to every document ---------------------------------

/** Extraction is rejected when `totalNonWhitespaceChars` is below this count. */
const MIN_NON_WHITESPACE_CHARS = 160;

/** Extraction is rejected when `printableCharRatio` is below this value. */
const MIN_PRINTABLE_CHAR_RATIO = 0.85;

// --- Thresholds applied only when `pageCount >= 2` ------------------------

/** `totalChars` must reach at least this amount multiplied by `pageCount`. */
const MIN_MULTI_PAGE_CHARS_PER_PAGE = 140;

/** Extraction is rejected when `charsPerNonEmptyPage` is below this value. */
const MIN_CHARS_PER_NON_EMPTY_MULTI_PAGE = 120;

/**
 * Extraction is rejected when `emptyPageCount * divisor >= pageCount`; with a
 * divisor of 2 that means half or more of the pages came back empty.
 */
const MAX_EMPTY_PAGE_RATIO_DIVISOR = 2;
/**
 * Decides whether text extracted from a PDF is trustworthy enough to replace
 * the native file-input path. The checks are conservative and fail-closed.
 *
 * Every check that fails contributes one Spanish reason string, in a fixed
 * order; the extraction is acceptable only when no check fails.
 *
 * @param metrics - Extraction statistics to evaluate; returned unchanged in
 *   the result.
 * @returns The verdict (`isAcceptable`), the ordered `failureReasons`, and the
 *   `metrics` that were evaluated.
 */
export const assessPdfTextExtractionQuality = (
  metrics: PdfTextExtractionMetrics
): PdfTextQualityAssessment => {
  const {
    pageCount,
    emptyPageCount,
    nonEmptyPageCount,
    totalChars,
    totalNonWhitespaceChars,
    printableCharRatio,
    charsPerNonEmptyPage
  } = metrics;

  // The last three checks only make sense for documents with several pages.
  const isMultiPage = pageCount >= 2;

  // Each entry pairs "did this check fail?" with a lazily built reason, so a
  // message is only formatted for checks that actually failed.
  const checks: ReadonlyArray<readonly [boolean, () => string]> = [
    [
      nonEmptyPageCount === 0,
      () => 'El PDF no devolvio texto util en ninguna pagina.'
    ],
    [
      totalNonWhitespaceChars < MIN_NON_WHITESPACE_CHARS,
      () =>
        `El texto extraido solo contiene ${totalNonWhitespaceChars} caracteres no vacios.`
    ],
    [
      printableCharRatio < MIN_PRINTABLE_CHAR_RATIO,
      () =>
        `La proporcion de caracteres imprimibles (${printableCharRatio.toFixed(2)}) es demasiado baja.`
    ],
    [
      isMultiPage && totalChars < MIN_MULTI_PAGE_CHARS_PER_PAGE * pageCount,
      () =>
        `El volumen total extraido (${totalChars} caracteres) es demasiado bajo para ${pageCount} paginas.`
    ],
    [
      // With a divisor of 2 this trips when half or more pages are empty.
      isMultiPage && emptyPageCount * MAX_EMPTY_PAGE_RATIO_DIVISOR >= pageCount,
      () =>
        `${emptyPageCount} de ${pageCount} paginas quedaron vacias tras la extraccion.`
    ],
    [
      isMultiPage && charsPerNonEmptyPage < MIN_CHARS_PER_NON_EMPTY_MULTI_PAGE,
      () =>
        `El promedio de texto por pagina no vacia (${charsPerNonEmptyPage.toFixed(1)}) es sospechosamente bajo.`
    ]
  ];

  const failureReasons: string[] = checks
    .filter(([failed]) => failed)
    .map(([, describeFailure]) => describeFailure());

  return {
    isAcceptable: failureReasons.length === 0,
    failureReasons,
    metrics
  };
};