Fix image preprocessing using the original implementation as reference

jspast · jspast · commit 92e1d7c42fbf · 2026-04-23T11:38:48.000-03:00
diff --git a/cells2table/models/PaddlePaddle/cell_detection.py b/cells2table/models/PaddlePaddle/cell_detection.py
@@ -1,6 +1,7 @@
 import logging
 from typing import Iterable, Iterator, Sequence
 
+import cv2
 import numpy as np
 from numpy.typing import NDArray
 
@@ -54,6 +55,17 @@ def __call__(
 
         return result
 
+    def preprocess(self, input: Iterable[NDArray[np.uint8]]) -> list[NDArray[np.float32]]:
+        blob = cv2.dnn.blobFromImages(  # one 4-D NCHW blob for the whole batch (per OpenCV docs)
+            input,  # NOTE(review): annotated Iterable, passed straight to OpenCV — confirm callers pass a sequence
+            scalefactor=1 / 255.0,  # rescale uint8 pixel values to [0, 1]; no mean is passed
+            size=self.input_shape,  # NOTE(review): OpenCV reads size as (width, height) — confirm input_shape order
+            swapRB=False,  # keep the channel order of the incoming images
+            crop=False,  # resize directly to `size` without cropping (aspect ratio is not preserved)
+        )  # ty:ignore[no-matching-overload]
+        # Iterating the blob's first axis yields one array per input image.
+        return list(blob)
+
     @classmethod
     def postprocess(
         cls,
diff --git a/cells2table/models/PaddlePaddle/table_classification.py b/cells2table/models/PaddlePaddle/table_classification.py
@@ -1,6 +1,7 @@
 import logging
 from typing import Iterable, Sequence
 
+import cv2
 import numpy as np
 from numpy.typing import NDArray
 
@@ -26,9 +27,9 @@ def get_download_options(cls) -> DownloadOptions:
 
     def __call__(self, input: Iterable[NDArray[np.uint8]]) -> list[ClassificationResult]:
         logger.debug("Started preprocessing")
-        input = self.preprocess(input)
+        images = self.preprocess(input)
 
-        input_dict = dict(zip(self.input_names, [input]))
+        input_dict = dict(zip(self.input_names, [images]))
 
         logger.debug("Done preprocessing")
         logger.debug("Started running the model")
@@ -44,6 +45,65 @@ def __call__(self, input: Iterable[NDArray[np.uint8]]) -> list[ClassificationRes
 
         return result
 
+    def preprocess(self, input: Iterable[NDArray[np.uint8]]) -> list[NDArray[np.float32]]:
+        """Resize, center-crop, normalize and channel-reverse images for PP-LCNet.
+
+        Args:
+            input: iterable of HxWx3 uint8 images; the channel order is not checked (see ``mean``).
+
+        Returns:
+            One 3x224x224 float32 array per image, channel axis reversed relative to the input.
+
+        Raises:
+            ValueError: if an image is not HxWx3, is not uint8, or is too small to center-crop.
+        """
+        resize_short = 256  # shorter edge after resize
+        crop_size = 224  # center crop size
+        # NOTE(review): these look like the common ImageNet RGB statistics written in reverse
+        # (B, G, R) order, which only matches the data if `input` is BGR — confirm with callers.
+        mean = np.asarray([0.406, 0.456, 0.485], dtype=np.float32)
+        std = np.asarray([0.225, 0.224, 0.229], dtype=np.float32)
+        rescale_factor = 1.0 / 255.0  # uint8 -> [0,1]
+        out: list[NDArray[np.float32]] = []
+
+        for img in input:
+            # Validate layout and dtype up front; nothing is coerced, bad input raises.
+            if img.ndim != 3 or img.shape[2] != 3:
+                raise ValueError(f"Expected HxWx3 image, got shape={img.shape}")
+            if img.dtype != np.uint8:
+                raise ValueError(f"Expected uint8 image, got dtype={img.dtype}")
+
+            h, w = img.shape[:2]
+            # Resize while preserving aspect ratio using the shorter edge as reference.
+            # NOTE(review): a zero-length side raises ZeroDivisionError here, not ValueError.
+            scale = resize_short / float(min(h, w))
+            new_h = int(round(h * scale))
+            new_w = int(round(w * scale))
+
+            # Perform the resize (OpenCV expects size as (width, height))
+            resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
+
+            # Center-crop to crop_size x crop_size. With resize_short >= crop_size the guard
+            # below should not fire; it protects against future changes to those constants.
+            if new_h < crop_size or new_w < crop_size:
+                raise ValueError(
+                    f"Resized image too small for center crop: resized={new_h}x{new_w}, crop={crop_size}"
+                )
+            top = (new_h - crop_size) // 2
+            left = (new_w - crop_size) // 2
+            cropped = resized[top : top + crop_size, left : left + crop_size, :]
+
+            # Convert to float32 and rescale to [0,1]
+            x = cropped.astype(np.float32) * rescale_factor
+            # Normalize per channel in the input's channel order: (x - mean) / std
+            x = (x - mean) / std
+            # Reverse the channel axis (RGB -> BGR for RGB input, BGR -> RGB for BGR input)
+            x = x[..., ::-1]
+            # HWC -> CHW; x is already float32, so the result stays a non-contiguous view
+            x = np.transpose(x, (2, 0, 1)).astype(np.float32, copy=False)
+            out.append(x)
+        return out
+
     @classmethod
     def postprocess(cls, pred: Sequence[Sequence[float]]) -> list[ClassificationResult]:
         return [ClassificationResult(cls.classes[np.argmax(p)], max(p)) for p in pred]
diff --git a/cells2table/models/runtimes/onnx.py b/cells2table/models/runtimes/onnx.py
@@ -1,22 +1,14 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Iterable
 
-import cv2
-import numpy as np
 import onnxruntime as ort
-from numpy.typing import NDArray
 
 from cells2table.models.tasks.base import BaseModel
 
 
 class OnnxModel(BaseModel, ABC):
     """Base interface for ONNX models."""
 
-    scale = 1 / 255.0
-    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
-    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
-
     @classmethod
     @abstractmethod
     def get_onnx_path(self) -> str:
@@ -49,14 +41,3 @@ def input_names(self):
     @property
     def output_names(self):
         return [v.name for v in self.session.get_outputs()]
-
-    def preprocess(self, input: Iterable[NDArray[np.uint8]]) -> list[NDArray[np.uint8]]:
-        output = []
-
-        for img in input:
-            img = cv2.resize(img, dsize=self.input_shape, interpolation=cv2.INTER_LANCZOS4)
-            img = (img.astype(np.float32) * self.scale - self.mean) / self.std  # Normalize
-            img = img.transpose(2, 0, 1)  # HWC to CHW
-            output.append(img)
-
-        return output