-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathtable.rs
More file actions
174 lines (158 loc) · 5.02 KB
/
table.rs
File metadata and controls
174 lines (158 loc) · 5.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
//! Table structures — TableBorder, TableBorderRow, TableBorderCell, the TableToken types, and the per-page TableBordersCollection.
use serde::{Deserialize, Serialize};
use super::bbox::BoundingBox;
use super::chunks::TextChunk;
use super::content::ContentElement;
use super::enums::SemanticType;
/// Epsilon for table border coordinate comparisons.
///
/// Not referenced anywhere in the visible part of this file.
/// NOTE(review): presumably expressed in the same unit as the border
/// coordinates; that unit is not stated here — confirm against the callers.
pub const TABLE_BORDER_EPSILON: f64 = 0.5;
/// Minimum intersection for assigning content to cells.
///
/// Not referenced anywhere in the visible part of this file.
/// NOTE(review): the name says "percent" but the value is `0.01`; whether that
/// is meant as a fraction (i.e. 1%) or as 0.01% cannot be told from this file —
/// confirm at the use site.
pub const MIN_CELL_CONTENT_INTERSECTION_PERCENT: f64 = 0.01;
/// Grid-based table structure defined by row/column coordinates.
///
/// The type is plain data: every field is public and nothing in this file
/// validates the fields against each other. In particular `num_rows` and
/// `num_columns` are stored separately from `rows`, `x_coordinates` and
/// `y_coordinates`, so whoever builds a value is responsible for keeping them
/// consistent (the unit test in this file builds one with `num_rows: 2` and an
/// empty `rows`).
///
/// Serializable and deserializable through the serde derives.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableBorder {
/// Bounding box of the table.
pub bbox: BoundingBox,
/// Global index, if one has been assigned.
pub index: Option<u32>,
/// Nesting level, if one has been assigned.
pub level: Option<String>,
/// X-coordinates of column boundaries (N+1 for N columns)
pub x_coordinates: Vec<f64>,
/// Widths of column boundary lines.
/// NOTE(review): presumably one entry per `x_coordinates` entry — confirm.
pub x_widths: Vec<f64>,
/// Y-coordinates of row boundaries (M+1 for M rows)
pub y_coordinates: Vec<f64>,
/// Widths of row boundary lines.
/// NOTE(review): presumably one entry per `y_coordinates` entry — confirm.
pub y_widths: Vec<f64>,
/// Table rows
pub rows: Vec<TableBorderRow>,
/// Number of rows (stored, not derived from `rows.len()`).
pub num_rows: usize,
/// Number of columns (stored, not derived from `x_coordinates`).
pub num_columns: usize,
/// Whether this table has structural problems
pub is_bad_table: bool,
/// Whether this came from a transformer model
pub is_table_transformer: bool,
/// Previous table in cross-page chain.
///
/// Held as an owned `Box`, so this is a separate value rather than a shared
/// reference to the neighbouring table; cloning a `TableBorder` clones it too.
pub previous_table: Option<Box<TableBorder>>,
/// Next table in cross-page chain.
///
/// Held as an owned `Box`, so this is a separate value rather than a shared
/// reference to the neighbouring table; cloning a `TableBorder` clones it too.
pub next_table: Option<Box<TableBorder>>,
}
/// A row in a TableBorder.
///
/// Plain data with public fields; serializable and deserializable through the
/// serde derives. Nothing in this file checks `row_number` against the row's
/// position in `TableBorder::rows`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableBorderRow {
/// Bounding box of the row.
pub bbox: BoundingBox,
/// Global index, if one has been assigned.
pub index: Option<u32>,
/// Nesting level, if one has been assigned.
pub level: Option<String>,
/// Row number (0-based)
pub row_number: usize,
/// Cells in this row
pub cells: Vec<TableBorderCell>,
/// Optional semantic type (header, body, footer)
pub semantic_type: Option<SemanticType>,
}
/// A cell in a TableBorderRow.
///
/// Plain data with public fields; serializable and deserializable through the
/// serde derives.
///
/// Note the two similarly named fields: `content` holds the raw
/// [`TableToken`]s, while `contents` holds the processed [`ContentElement`]s.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableBorderCell {
/// Bounding box of the cell.
pub bbox: BoundingBox,
/// Global index, if one has been assigned.
pub index: Option<u32>,
/// Nesting level, if one has been assigned.
pub level: Option<String>,
/// Row number (0-based)
pub row_number: usize,
/// Column number (0-based)
pub col_number: usize,
/// Number of rows this cell spans
pub row_span: usize,
/// Number of columns this cell spans
pub col_span: usize,
/// Raw text content (table tokens)
pub content: Vec<TableToken>,
/// Processed content elements (after sub-pipeline)
pub contents: Vec<ContentElement>,
/// Optional semantic type
pub semantic_type: Option<SemanticType>,
}
/// A text chunk assigned to a table cell.
///
/// Pairs a [`TextChunk`] with a [`TableTokenType`] tag. Serializable and
/// deserializable through the serde derives.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableToken {
/// Base text chunk
pub base: TextChunk,
/// Token type (text, image, or nested table).
pub token_type: TableTokenType,
}
/// Type of content in a table cell.
///
/// A field-less enum that is `Copy` and comparable with `==`; serializable and
/// deserializable through the serde derives.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TableTokenType {
/// Text content
Text,
/// Image content
Image,
/// Nested table
Table,
}
/// Row of tokens in a table cell.
///
/// A plain alias for `Vec<TableToken>`; it adds no behaviour of its own and is
/// not used elsewhere in the visible part of this file.
pub type TableTokenRow = Vec<TableToken>;
/// Collection of detected table borders, indexed by page.
///
/// The outer `Vec` has one slot per page and the inner `Vec` holds that page's
/// tables; `add` and `get_page` use their `page` argument as the outer index.
///
/// `Default` produces a collection with zero pages, on which `add` stores
/// nothing (it ignores out-of-range pages); use `new` to size it.
///
/// Unlike the other types in this file, this one does not derive
/// `Serialize`/`Deserialize`.
#[derive(Debug, Clone, Default)]
pub struct TableBordersCollection {
/// Per-page table borders (outer index = page, inner list = tables on it).
pub table_borders: Vec<Vec<TableBorder>>,
}
impl TableBordersCollection {
    /// Build a collection with `num_pages` page slots, each starting empty.
    pub fn new(num_pages: usize) -> Self {
        let table_borders = (0..num_pages).map(|_| Vec::new()).collect();
        Self { table_borders }
    }

    /// Append `border` to the list for `page`.
    ///
    /// If `page` is not a valid slot (`page >= table_borders.len()`), the
    /// border is dropped and the collection is left unchanged.
    pub fn add(&mut self, page: usize, border: TableBorder) {
        if let Some(page_borders) = self.table_borders.get_mut(page) {
            page_borders.push(border);
        }
    }

    /// Borrow the table borders stored for `page`.
    ///
    /// Returns an empty slice when `page` is not a valid slot, so callers never
    /// need to bounds-check first.
    pub fn get_page(&self, page: usize) -> &[TableBorder] {
        match self.table_borders.get(page) {
            Some(page_borders) => page_borders.as_slice(),
            None => &[],
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A 2x2 table fixture with no rows attached and no cross-page neighbours.
    fn sample_border() -> TableBorder {
        TableBorder {
            bbox: BoundingBox::new(Some(1), 10.0, 10.0, 200.0, 300.0),
            index: None,
            level: None,
            x_coordinates: vec![10.0, 100.0, 200.0],
            x_widths: vec![1.0, 1.0, 1.0],
            y_coordinates: vec![10.0, 150.0, 300.0],
            y_widths: vec![1.0, 1.0, 1.0],
            rows: Vec::new(),
            num_rows: 2,
            num_columns: 2,
            is_bad_table: false,
            is_table_transformer: false,
            previous_table: None,
            next_table: None,
        }
    }

    /// A border added to page 0 is visible there only; an untouched page and
    /// an out-of-range page both read back as empty.
    #[test]
    fn test_table_borders_collection() {
        let mut borders = TableBordersCollection::new(5);
        borders.add(0, sample_border());

        assert_eq!(borders.get_page(0).len(), 1);
        assert_eq!(borders.get_page(1).len(), 0);
        assert_eq!(borders.get_page(10).len(), 0);
    }
}