Skip to content

Commit c5e09eb

Browse files
jbarrow and omohokcoj authored
Dataset Preprocessing Scripts (#19)
* Adding dataset preparation Co-authored-by: Joe Barrow <joseph.d.barrow@gmail.com> Co-authored-by: Pete Matsyburka <hi@petem.dev> * close document handle * add document ids for test and val * remove models bundling and dataset --------- Co-authored-by: Pete Matsyburka <hi@petem.dev>
1 parent 487c9b7 commit c5e09eb

6 files changed

Lines changed: 8365 additions & 3 deletions

File tree

dataset/generate_coco.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import json
2+
import sys
3+
from pathlib import Path
4+
from concurrent.futures import ProcessPoolExecutor, as_completed
5+
6+
import formalpdf
7+
import logging
8+
9+
10+
# Set the "pypdfium2" logger's threshold to ERROR so that lower-severity
# records from that logger are not emitted while PDFs are processed.
logging.getLogger("pypdfium2").setLevel(logging.ERROR)
11+
12+
# Field type name -> detection class id. Ids greater than
# _MAX_KEPT_CATEGORY_ID are excluded from the generated annotations.
_CATEGORY_IDS = {
    "Text": 0,
    "ComboBox": 0,
    "CheckBox": 1,
    "RadioButton": 1,
    "Signature": 2,
    "PushButton": 3,
    "ListBox": 3,
    "Unknown": 3,
}
_MAX_KEPT_CATEGORY_ID = 2
_FALLBACK_CATEGORY_ID = 3

# Rendered pages are scaled so that their shorter side maps to this many pixels.
_TARGET_SHORT_SIDE_PX = 1680


def _widget_annotation(widget, scale, image_height):
    """Return the annotation dict for one widget, or None if its type is ignored."""
    category_id = _CATEGORY_IDS.get(widget.field_type_string, _FALLBACK_CATEGORY_ID)
    if category_id > _MAX_KEPT_CATEGORY_ID:
        return None

    # Convert the bounding box from points to pixels.
    rect = widget.rect
    left = rect.left * scale
    right = rect.right * scale
    top = rect.top * scale
    bottom = rect.bottom * scale

    # Flip the y axis relative to the image height.
    # NOTE(review): this assumes widget.rect is measured from the bottom edge
    # of the page (the usual PDF convention) -- confirm against formalpdf.
    y0 = image_height - top
    y1 = image_height - bottom

    bbox_width = right - left
    bbox_height = y1 - y0

    return {
        "category_id": category_id,
        "bbox": [left, y0, bbox_width, bbox_height],
        "area": bbox_width * bbox_height,
        "iscrowd": 0,
        "segmentation": [],
    }


def process_pdf(pdf_path, output_dir):
    """Render every page of a PDF and write one image and one JSON file per page.

    For page ``i`` of ``<stem>.pdf`` this writes ``images/<stem>-i.png`` and
    ``json/<stem>-i.json`` under ``output_dir``. The JSON holds the image info
    plus one annotation per widget whose field type maps to a kept category.

    Args:
        pdf_path: ``Path`` of the PDF to process.
        output_dir: ``Path`` of the dataset root; its ``json`` and ``images``
            subdirectories must already exist.

    Returns:
        A human-readable status string: "Skipped ...", "Processed ..." or
        "Error processing ...". Exceptions are never propagated, so the
        function is safe to run in a worker process.
    """
    json_dir = output_dir / "json"
    images_dir = output_dir / "images"
    pdf_name = pdf_path.stem

    # The first page's JSON doubles as the "already processed" marker.
    # NOTE(review): a PDF that failed part-way through after page 0 was
    # written will also be skipped on the next run.
    if (json_dir / f"{pdf_name}-0.json").exists():
        return f"Skipped {pdf_name} (already processed)"

    try:
        document = formalpdf.open(str(pdf_path))
        try:
            num_pages = len(document)
            total_widgets = 0

            for page_idx in range(num_pages):
                page = document[page_idx]
                pdfium_page = document.document[page_idx]

                # Scale based on the smaller page dimension.
                width_pt, height_pt = pdfium_page.get_size()
                scale = _TARGET_SHORT_SIDE_PX / min(width_pt, height_pt)

                image = pdfium_page.render(scale=scale, may_draw_forms=False).to_pil()
                widgets = page.widgets()

                image_filename = f"{pdf_name}-{page_idx}.png"
                image_info = {
                    "file_name": image_filename,
                    "width": image.width,
                    "height": image.height,
                }
                image.save(images_dir / image_filename)

                annotations = []
                for widget in widgets:
                    annotation = _widget_annotation(widget, scale, image.height)
                    if annotation is not None:
                        annotations.append(annotation)

                page_data = {
                    "image": image_info,
                    "annotations": annotations,
                }

                json_path = json_dir / f"{pdf_name}-{page_idx}.json"
                with json_path.open("w") as fp:
                    json.dump(page_data, fp, indent=2)

                # Counts every widget on the page, including ignored types.
                total_widgets += len(widgets)
        finally:
            # Release the underlying document even when a page fails.
            document.document.close()

        return f"Processed {pdf_name}: {num_pages} pages, {total_widgets} widgets"

    except Exception as e:
        return f"Error processing {pdf_name}: {e}"
110+
111+
112+
def main():
    """Convert every PDF under the input directory into per-page images and JSON.

    Command-line arguments (both optional):
        argv[1]: directory searched recursively for ``*.pdf`` (default ``pdfs``).
        argv[2]: output directory (default ``coco``); ``json`` and ``images``
            subdirectories are created inside it.

    PDFs whose first-page JSON already exists are skipped. The remaining ones
    are handed to ``process_pdf`` in a process pool and one status line is
    printed per finished PDF.
    """
    pdfs_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("pdfs")
    output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("coco")
    json_dir = output_dir / "json"
    images_dir = output_dir / "images"

    # parents=True lets the output directory be a nested path that does not
    # exist yet; it also creates output_dir itself.
    for directory in (json_dir, images_dir):
        directory.mkdir(parents=True, exist_ok=True)

    pdf_files = list(pdfs_dir.rglob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files")

    # A PDF counts as processed once its first-page JSON exists.
    tasks = [
        pdf_path
        for pdf_path in pdf_files
        if not (json_dir / f"{pdf_path.stem}-0.json").exists()
    ]
    skipped_count = len(pdf_files) - len(tasks)

    print(f"Already processed (skipped): {skipped_count} PDFs")
    print(f"New PDFs to process: {len(tasks)}")

    if not tasks:
        return

    with ProcessPoolExecutor() as executor:
        futures = {
            executor.submit(process_pdf, pdf_path, output_dir): pdf_path
            for pdf_path in tasks
        }

        for completed, future in enumerate(as_completed(futures), start=1):
            try:
                result = future.result()
            except Exception as exc:
                # process_pdf reports its own errors as strings; this covers
                # failures outside it (e.g. a worker process dying) so one bad
                # PDF does not abort the whole run with a traceback.
                result = f"Error processing {futures[future].stem}: {exc}"
            print(f"[{completed}/{len(tasks)}] {result}")


if __name__ == "__main__":
    main()

dataset/merge_coco.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import json
2+
import os
3+
import sys
4+
from pathlib import Path
5+
6+
7+
def merge_coco_annotations():
    """Merge the per-page JSON files into a single COCO annotations file.

    Reads every ``*.json`` in ``<coco_dir>/json`` (sorted by file name), where
    ``coco_dir`` is ``sys.argv[1]`` or ``coco`` by default, and writes
    ``<coco_dir>/annotations.json``. Images and annotations receive sequential
    ids starting at 0. Bounding boxes are rounded to integers, and an
    annotation is dropped when its rounded box

    * has a negative x or y,
    * has a non-positive width or height,
    * extends past the image's width or height, or
    * duplicates a box already kept for the same page.

    Finally a relative symlink ``<coco_dir>/images/_annotations.coco.json``
    pointing at the merged file is (re)created.
    """
    coco_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("coco")
    json_dir = coco_dir / "json"
    output_file = coco_dir / "annotations.json"

    # COCO format structure
    coco_data = {
        "info": {
            "year": 2025,
            "version": "1.0",
            "description": "Form field detection dataset",
            "contributor": "",
            "url": "",
            "date_created": "2025-10-16",
        },
        "licenses": [
            {
                "id": 1,
                "name": "Unknown",
                "url": "",
            }
        ],
        "images": [],
        "annotations": [],
        "categories": [
            {"id": 0, "name": "Text", "supercategory": "none"},
            {"id": 1, "name": "CheckBox", "supercategory": "none"},
            # The per-page files may carry category_id 2, so it has to be
            # declared here for the annotations to reference a known category.
            {"id": 2, "name": "Signature", "supercategory": "none"},
        ],
    }

    annotation_id = 0

    # Sorted order keeps image ids stable between runs.
    for image_id, json_file in enumerate(sorted(json_dir.glob("*.json"))):
        with json_file.open("r", encoding="utf-8") as fp:
            page_data = json.load(fp)

        image_info = page_data["image"].copy()
        image_info["id"] = image_id
        coco_data["images"].append(image_info)

        # Pages whose file name starts with "2908641" keep their image entry
        # but contribute no annotations (reason not recorded in this script).
        if json_file.name.startswith("2908641"):
            continue

        # Rounded boxes already kept for this page, used to skip duplicates.
        seen_bboxes = set()

        for annotation in page_data["annotations"]:
            x, y, width, height = (round(value) for value in annotation["bbox"])

            # Skip boxes that start outside the image.
            if x < 0 or y < 0:
                continue

            # Skip degenerate boxes; without this check a zero or negative
            # extent slips past the boundary test below.
            if width <= 0 or height <= 0:
                continue

            # Skip boxes that extend beyond the image boundaries.
            if x + width > image_info["width"] or y + height > image_info["height"]:
                continue

            bbox_key = (x, y, width, height)
            if bbox_key in seen_bboxes:
                continue
            seen_bboxes.add(bbox_key)

            merged = annotation.copy()
            merged["id"] = annotation_id
            merged["image_id"] = image_id
            merged["bbox"] = [x, y, width, height]
            # Area is recomputed from the rounded box.
            merged["area"] = width * height
            coco_data["annotations"].append(merged)
            annotation_id += 1

    with output_file.open("w", encoding="utf-8") as fp:
        json.dump(coco_data, fp, indent=2)

    print(f"Merged {len(coco_data['images'])} images with {len(coco_data['annotations'])} annotations")
    print(f"Saved to {output_file}")

    # Create symlink in images folder
    images_dir = coco_dir / "images"
    symlink_path = images_dir / "_annotations.coco.json"

    # is_symlink() also catches a dangling link, for which exists() is False.
    if symlink_path.exists() or symlink_path.is_symlink():
        symlink_path.unlink()

    # Relative target so the dataset directory can be moved as a whole.
    os.symlink(os.path.relpath(output_file, images_dir), symlink_path)
    print(f"Created symlink at {symlink_path}")


if __name__ == "__main__":
    merge_coco_annotations()

0 commit comments

Comments
 (0)