Skip to content

Commit 920aa10

Browse files
jbarrow and omohokcoj committed
Fixing dataset preparation bug where some widgets are shifted 40px up/left.
Co-authored-by: Joe Barrow <joseph.d.barrow@gmail.com>
Co-authored-by: Pete Matsyburka <hi@petem.dev>
1 parent 6907542 commit 920aa10

1 file changed

Lines changed: 38 additions & 11 deletions

File tree

dataset/generate_coco.py

Lines changed: 38 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@
55

66
import formalpdf
77
import logging
8+
import pypdfium2.raw as pdfium_c
9+
import ctypes
810

911

1012
logging.getLogger("pypdfium2").setLevel(logging.ERROR)
@@ -38,7 +40,7 @@ def process_pdf(pdf_path, output_dir):
3840
image = pdfium_page.render(scale=scale, may_draw_forms=False).to_pil()
3941
widgets = page.widgets()
4042

41-
image_filename = f"{pdf_name}-{page_idx}.png"
43+
image_filename = f"{pdf_name}-{page_idx}.jpg"
4244

4345
# Create image info
4446
image_info = {
@@ -48,19 +50,42 @@ def process_pdf(pdf_path, output_dir):
4850
}
4951

5052
# Save image
51-
image.save(images_dir / image_filename)
53+
image.save(images_dir / image_filename, format="JPEG")
5254

5355
# Process annotations
5456
annotations = []
5557
for widget in widgets:
56-
# convert bounding box in pt to pixels
57-
top = widget.rect.top * scale
58-
left = widget.rect.left * scale
59-
bottom = widget.rect.bottom * scale
60-
right = widget.rect.right * scale
61-
62-
y0 = image.height - top
63-
y1 = image.height - bottom
58+
# Use pypdfium2's page-to-device coordinate transformation
59+
# to properly convert PDF coordinates to image pixel coordinates
60+
page_x1, page_y1 = widget.rect.left, widget.rect.bottom
61+
page_x2, page_y2 = widget.rect.right, widget.rect.top
62+
63+
# Convert page coordinates to device coordinates
64+
# using pypdfium2's FPDF_PageToDevice function
65+
dev_x1 = ctypes.c_int()
66+
dev_y1 = ctypes.c_int()
67+
dev_x2 = ctypes.c_int()
68+
dev_y2 = ctypes.c_int()
69+
70+
# FPDF_PageToDevice(page, start_x, start_y, size_x, size_y, rotate, page_x, page_y, device_x, device_y)
71+
pdfium_c.FPDF_PageToDevice(
72+
pdfium_page.raw, 0, 0, image.width, image.height, 0,
73+
page_x1, page_y1, ctypes.byref(dev_x1), ctypes.byref(dev_y1)
74+
)
75+
pdfium_c.FPDF_PageToDevice(
76+
pdfium_page.raw, 0, 0, image.width, image.height, 0,
77+
page_x2, page_y2, ctypes.byref(dev_x2), ctypes.byref(dev_y2)
78+
)
79+
80+
# Convert the ctypes ints to Python floats
81+
left = float(dev_x1.value)
82+
bottom = float(dev_y1.value)
83+
right = float(dev_x2.value)
84+
top = float(dev_y2.value)
85+
86+
# Device coordinates have top-left origin, so y values are already correct
87+
y0 = min(top, bottom)
88+
y1 = max(top, bottom)
6489

6590
# try for the category, otherwise "Text"
6691
categories = { "Text": 0,
@@ -102,11 +127,13 @@ def process_pdf(pdf_path, output_dir):
102127

103128
total_widgets += len(widgets)
104129

105-
document.document.close()
106130
return f"Processed {pdf_name}: {num_pages} pages, {total_widgets} widgets"
107131

108132
except Exception as e:
109133
return f"Error processing {pdf_name}: {str(e)}"
134+
135+
finally:
136+
document.document.close()
110137

111138

112139
def main():

0 commit comments

Comments
 (0)