-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathchunks.rs
More file actions
209 lines (190 loc) · 6.31 KB
/
chunks.rs
File metadata and controls
209 lines (190 loc) · 6.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
//! Chunk types — atomic units of extracted content.
use serde::{Deserialize, Serialize};
use super::bbox::{BoundingBox, Vertex};
use super::enums::{PdfLayer, TextFormat, TextType};
/// Atomic text fragment — one font run in the PDF content stream.
///
/// Besides the decoded text, a chunk carries its geometry (`bbox`, `symbol_ends`),
/// font and appearance attributes, classification enums, and optional bookkeeping
/// (`index`, `page_number`, `level`, `mcid`). Every field is public and the type
/// derives `Serialize`/`Deserialize`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextChunk {
/// Decoded Unicode text content.
/// `text_length` counts the `char`s of this string, not its bytes.
pub value: String,
/// Bounding box in page coordinates.
/// Its `left_x` / `right_x` edges are the fallbacks used by the symbol coordinate accessors.
pub bbox: BoundingBox,
/// Font name (base font name like "Helvetica")
pub font_name: String,
/// Font size in points (effective, after matrix transforms)
pub font_size: f64,
/// Font weight (100.0 - 900.0)
pub font_weight: f64,
/// Italic angle from font descriptor
pub italic_angle: f64,
/// Text color as hex string (e.g. "#000000")
pub font_color: String,
/// Contrast ratio against background (1.0-21.0)
pub contrast_ratio: f64,
/// X-coordinate of each glyph end position.
/// May be empty; `symbol_start_coordinate` and `symbol_end_coordinate` then fall back
/// to the bounding box edges. `compress_spaces` does not adjust this vector.
pub symbol_ends: Vec<f64>,
/// Text baseline format (normal, superscript, subscript)
pub text_format: TextFormat,
/// Text type classification
pub text_type: TextType,
/// Processing layer that produced this chunk
pub pdf_layer: PdfLayer,
/// Whether the OCG (Optional Content Group) is visible
pub ocg_visible: bool,
/// Global index in extraction order.
/// NOTE(review): typed `usize` here while the other chunk structs in this file use `u32`.
pub index: Option<usize>,
/// Page number (1-based)
pub page_number: Option<u32>,
/// Nesting level (from structure tree)
pub level: Option<String>,
/// Marked content identifier (from BDC/BMC operators in the content stream).
/// Links this chunk to a structure tree node for semantic tagging.
/// Left out of the serialized output entirely when it is `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub mcid: Option<i64>,
}
impl TextChunk {
    /// Returns `true` when `value` holds nothing but whitespace.
    ///
    /// An empty `value` also counts as whitespace-only.
    pub fn is_white_space_chunk(&self) -> bool {
        self.value.trim().is_empty()
    }

    /// Squeezes every run of consecutive `' '` characters in `value` down to one space.
    ///
    /// Only the plain space character is affected; other whitespace such as tabs is
    /// kept as-is. `symbol_ends` and `bbox` are not touched.
    pub fn compress_spaces(&mut self) {
        let mut previous_was_space = false;
        self.value.retain(|symbol| {
            let is_space = symbol == ' ';
            // Drop a space only when the character right before it was a space too.
            let keep = !(is_space && previous_was_space);
            previous_was_space = is_space;
            keep
        });
    }

    /// Length of `value` measured in `char`s (not bytes).
    pub fn text_length(&self) -> usize {
        self.value.chars().count()
    }

    /// Bounding box width divided by the number of characters.
    ///
    /// Yields `0.0` for an empty `value` instead of dividing by zero.
    pub fn average_symbol_width(&self) -> f64 {
        match self.text_length() {
            0 => 0.0,
            count => self.bbox.width() / count as f64,
        }
    }

    /// X coordinate at which the symbol at position `idx` begins.
    ///
    /// The first symbol starts at the left edge of the bounding box; any later symbol
    /// starts where the previous one ends (`symbol_ends[idx - 1]`). When no such entry
    /// exists the right edge of the bounding box is returned.
    pub fn symbol_start_coordinate(&self, idx: usize) -> f64 {
        match idx.checked_sub(1) {
            None => self.bbox.left_x,
            Some(previous) => self
                .symbol_ends
                .get(previous)
                .copied()
                .unwrap_or(self.bbox.right_x),
        }
    }

    /// X coordinate at which the symbol at position `idx` finishes.
    ///
    /// Falls back to the right edge of the bounding box when `symbol_ends` has no
    /// entry for `idx`.
    pub fn symbol_end_coordinate(&self, idx: usize) -> f64 {
        self.symbol_ends
            .get(idx)
            .copied()
            .unwrap_or(self.bbox.right_x)
    }
}
/// Image bounding box — actual pixel data extracted at output time.
///
/// The struct itself stores placement metadata only (`bbox`, `index`, `level`);
/// it has no field holding image bytes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageChunk {
/// Bounding box in page coordinates
pub bbox: BoundingBox,
/// Global index (`None` when no index has been assigned)
pub index: Option<u32>,
/// Nesting level (`None` when not set)
pub level: Option<String>,
}
/// Line segment — used for table border detection.
///
/// The orientation and shape flags are plain stored fields; nothing in the visible
/// part of this file derives them from `start` / `end`, so whoever constructs the
/// value is responsible for keeping them consistent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LineChunk {
/// Bounding box in page coordinates
pub bbox: BoundingBox,
/// Global index (`None` when no index has been assigned)
pub index: Option<u32>,
/// Nesting level (`None` when not set)
pub level: Option<String>,
/// Start vertex
pub start: Vertex,
/// End vertex
pub end: Vertex,
/// Line width in points
pub width: f64,
/// Whether this is a horizontal line
pub is_horizontal_line: bool,
/// Whether this is a vertical line
pub is_vertical_line: bool,
/// Whether this is a square-like shape
pub is_square: bool,
}
/// Vector graphic — collection of line segments forming bullets, decorations, etc.
///
/// Owns its component segments by value in `line_chunks`; cloning the chunk clones
/// every contained `LineChunk`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LineArtChunk {
/// Bounding box encompassing the line art
pub bbox: BoundingBox,
/// Global index (`None` when no index has been assigned)
pub index: Option<u32>,
/// Nesting level (`None` when not set)
pub level: Option<String>,
/// Component line segments
pub line_chunks: Vec<LineChunk>,
}
/// Size comparison tolerance for line art classification.
///
/// NOTE(review): not referenced in the visible part of this file, so the comparison it
/// feeds lives elsewhere; the unit is presumably the same as the page coordinates used
/// by `BoundingBox` — confirm against the callers.
pub const LINE_ART_SIZE_EPSILON: f64 = 1.0;
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `TextChunk` holding `value` with fixed, neutral attributes.
    ///
    /// The bounding box is 100 units wide (see `test_average_symbol_width`) and
    /// `symbol_ends` starts out empty; tests that need glyph positions set it themselves.
    fn make_text_chunk(value: &str) -> TextChunk {
        TextChunk {
            value: value.to_string(),
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 100.0, 12.0),
            font_name: "Helvetica".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        }
    }

    #[test]
    fn test_is_white_space_chunk() {
        assert!(make_text_chunk(" ").is_white_space_chunk());
        assert!(make_text_chunk(" \t\n").is_white_space_chunk());
        assert!(!make_text_chunk("hello").is_white_space_chunk());
        assert!(!make_text_chunk(" a ").is_white_space_chunk());
        // An empty value is treated as whitespace-only.
        assert!(make_text_chunk("").is_white_space_chunk());
    }

    #[test]
    fn test_compress_spaces() {
        // The input must contain runs of spaces, otherwise a no-op implementation passes.
        let mut chunk = make_text_chunk("hello   world  test");
        chunk.compress_spaces();
        assert_eq!(chunk.value, "hello world test");
    }

    #[test]
    fn test_compress_spaces_edges() {
        // Leading and trailing runs collapse to a single space; they are not trimmed.
        let mut chunk = make_text_chunk("  padded   ");
        chunk.compress_spaces();
        assert_eq!(chunk.value, " padded ");

        // Only the plain space character is collapsed; tabs are left alone.
        let mut chunk = make_text_chunk("a\t\tb");
        chunk.compress_spaces();
        assert_eq!(chunk.value, "a\t\tb");

        let mut chunk = make_text_chunk("");
        chunk.compress_spaces();
        assert_eq!(chunk.value, "");
    }

    #[test]
    fn test_text_length() {
        assert_eq!(make_text_chunk("hello").text_length(), 5);
        assert_eq!(make_text_chunk("").text_length(), 0);
        // Length is counted in chars, not bytes.
        assert_eq!(make_text_chunk("h\u{00E9}llo").text_length(), 5);
    }

    #[test]
    fn test_average_symbol_width() {
        let chunk = make_text_chunk("hello");
        assert!((chunk.average_symbol_width() - 20.0).abs() < 0.01);
        // Empty text must not divide by zero.
        assert_eq!(make_text_chunk("").average_symbol_width(), 0.0);
    }

    #[test]
    fn test_symbol_coordinates() {
        let mut chunk = make_text_chunk("abc");
        chunk.symbol_ends = vec![10.0, 20.0, 30.0];
        // Compare against the bbox fields directly so the test does not depend on the
        // argument order of `BoundingBox::new`.
        let (left, right) = (chunk.bbox.left_x, chunk.bbox.right_x);

        assert_eq!(chunk.symbol_start_coordinate(0), left);
        assert_eq!(chunk.symbol_start_coordinate(1), 10.0);
        assert_eq!(chunk.symbol_start_coordinate(3), 30.0);
        assert_eq!(chunk.symbol_start_coordinate(4), right);

        assert_eq!(chunk.symbol_end_coordinate(0), 10.0);
        assert_eq!(chunk.symbol_end_coordinate(2), 30.0);
        assert_eq!(chunk.symbol_end_coordinate(3), right);
    }

    #[test]
    fn test_symbol_coordinates_without_symbol_ends() {
        let chunk = make_text_chunk("abc");
        let (left, right) = (chunk.bbox.left_x, chunk.bbox.right_x);

        assert_eq!(chunk.symbol_start_coordinate(0), left);
        assert_eq!(chunk.symbol_start_coordinate(1), right);
        assert_eq!(chunk.symbol_end_coordinate(0), right);
    }
}