mem: free intermediate arrays during YoloX inference (#496)

KRRT7 · web-flow · commit 333c6d728837 · 2026-04-02T18:32:11.000-07:00
Free `origin_img`, `img`/`ort_inputs`, `output`, and the **PIL pixel
buffer** at the points where they become dead in `image_processing()`,
instead of letting them linger until function return.

The two main wins:
1. **`origin_img`** — the full-resolution numpy copy of the input PIL
image stays alive through the entire ONNX `session.run()` call. `del
origin_img` frees it before inference.
2. **The PIL image itself** — after `np.array(image)` copies the pixel
data, the PIL buffer is no longer needed. `image.close()` frees it
immediately while preserving PIL metadata (`.width`, `.height`,
`.format`, `.size`).

Savings are proportional to image size: larger pages (higher DPI
renders) carry bigger unused buffers through inference.

## Benchmark

### Azure Standard_D8s_v5 — 8 vCPU Intel Xeon Platinum 8473C, 32 GiB RAM

Simulated ONNX session (35 MiB workspace), 1700×2200 letter-size image
at 200 DPI.

#### Memory

| Ref | Peak Memory | Allocations | Delta |
|:---|---:|---:|:---|
| `main` (base) | 72.0 MiB | 124 | |
| This PR (head) | 47.0 MiB | 118 | 🟢 -35% |

**Peak memory drops 25 MiB (-35%)** by freeing dead buffers before ONNX
inference. Timing is neutral (within noise).

---
*Generated by codeflash optimization agent*

&lt;details&gt;
&lt;summary&gt;&lt;b&gt;Reproduce the benchmark locally&lt;/b&gt;&lt;/summary&gt;

This PR includes a memory benchmark at
`benchmarks/test_benchmark_yolox.py`. Save the script below as
`compare_memory.py` and run it from the repo root:

```bash
pip install memray rich pytest-benchmark
python compare_memory.py
```

```python
#!/usr/bin/env python3
"""Compare peak memory between main and the current branch."""
import argparse, subprocess, sys, tempfile
from pathlib import Path
from rich.console import Console
from rich.table import Table

console = Console()
RUNNER = "import sys\nsys.exit(__import__('pytest').main(sys.argv[1:]))\n"

def branch():
    return subprocess.run(["git","rev-parse","--abbrev-ref","HEAD"],
                          capture_output=True, text=True, check=True).stdout.strip()

def profile(bench, bin_path, runner, ref, bench_src=None):
    head = branch()
    checkout = ref != head
    if checkout:
        subprocess.run(["git","stash","--include-untracked"], capture_output=True)
        subprocess.run(["git","checkout",ref], capture_output=True, check=True)
    copied = False
    try:
        bp = Path(bench)
        if not bp.exists() and bench_src:
            bp.parent.mkdir(parents=True, exist_ok=True)
            bp.write_text(bench_src, encoding="utf-8")
            copied = True
        subprocess.run([sys.executable,"-m","memray","run","--force","-o",bin_path,
                        runner, bench,"-x","-q","--no-header","-rN"],
                       check=True, timeout=600)
    finally:
        if copied:
            Path(bench).unlink(missing_ok=True)
        if checkout:
            subprocess.run(["git","checkout",head], capture_output=True, check=True)
            subprocess.run(["git","stash","pop"], capture_output=True)

def read_peak(bin_path):
    from memray import FileReader
    r = FileReader(bin_path)
    return r.metadata.peak_memory, r.metadata.total_allocations

def fmt(n):
    return f"{n/(1&lt;&lt;20):.1f} MiB" if n &gt;= 1&lt;&lt;20 else f"{n/(1&lt;&lt;10):.1f} KiB" if n &gt;= 1&lt;&lt;10 else f"{n} B"

def main():
    p = argparse.ArgumentParser()
    p.add_argument("bench", nargs="?", default="benchmarks/test_benchmark_yolox.py")
    p.add_argument("--base", default="main")
    args = p.parse_args()
    head = branch()
    if head == args.base:
        console.print(f"[red]Already on {args.base} — checkout the PR branch first.[/red]"); sys.exit(1)
    bench_src = Path(args.bench).read_text(encoding="utf-8") if Path(args.bench).exists() else None
    with tempfile.TemporaryDirectory() as td:
        runner = str(Path(td) / "run_bench.py")
        Path(runner).write_text(RUNNER, encoding="utf-8")
        console.print(f"\n[bold]Profiling [cyan]{head}[/cyan]...[/bold]")
        profile(args.bench, f"{td}/head.bin", runner, head)
        hp, ha = read_peak(f"{td}/head.bin")
        console.print(f"[bold]Profiling [cyan]{args.base}[/cyan]...[/bold]")
        profile(args.bench, f"{td}/base.bin", runner, args.base, bench_src=bench_src)
        bp, ba = read_peak(f"{td}/base.bin")
    table = Table(title=f"Memory: {args.base} vs {head}")
    for col in ["Ref","Peak Memory","Allocations","Delta"]:
        table.add_column(col, justify="right" if col != "Ref" else "left")
    table.add_row(f"{args.base} (base)", fmt(bp), f"{ba:,}", "")
    pct = (hp-bp)/bp*100 if bp else 0
    icon = "🟢" if pct &lt; -1 else ("🔴" if pct &gt; 1 else "⚪")
    table.add_row(f"{head} (head)", fmt(hp), f"{ha:,}", f"{icon} {pct:+.1f}%")
    console.print(table)

if __name__ == "__main__":
    main()
```

&lt;/details&gt;
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 1.6.1
+
+### Enhancement
+- Free intermediate arrays (`origin_img`, `img`, `ort_inputs`, `output`) and PIL pixel buffer at dead points during YoloX `image_processing()` to reduce peak memory during inference
+
 ## 1.6.0
 
 ### Fix
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
diff --git a/benchmarks/test_benchmark_yolox.py b/benchmarks/test_benchmark_yolox.py
@@ -0,0 +1,63 @@
+"""Benchmark for YoloX image_processing() memory optimization.
+
+Uses a fake ONNX session to isolate the memory behavior of image_processing()
+without requiring the real model weights. The fake session allocates a realistic
+35 MiB workspace to simulate ONNX inference memory pressure.
+"""
+
+import numpy as np
+from PIL import Image as PILImage
+
+from unstructured_inference.models.yolox import UnstructuredYoloXModel
+
+
+class _FakeInput:
+    def __init__(self) -> None:
+        self.name = "input"
+
+
+class _FakeSession:
+    """Simulates an ONNX inference session with realistic memory allocation."""
+
+    def get_inputs(self):
+        return [_FakeInput()]
+
+    def run(self, _names, _inputs):
+        workspace = np.empty((35 * 1024 * 1024,), dtype=np.uint8)  # 35 MiB  # noqa: F841
+        # input_shape (1024,768), strides [8,16,32] → 128*96 + 64*48 + 32*24 = 16128
+        return [np.random.randn(1, 16128, 16).astype(np.float32)]
+
+
+def make_model() -> UnstructuredYoloXModel:
+    model = object.__new__(UnstructuredYoloXModel)
+    model.model = _FakeSession()
+    model.model_path = "yolox_fake"
+    model.layout_classes = {
+        0: "Caption",
+        1: "Footnote",
+        2: "Formula",
+        3: "List-item",
+        4: "Page-footer",
+        5: "Page-header",
+        6: "Picture",
+        7: "Section-header",
+        8: "Table",
+        9: "Text",
+        10: "Title",
+    }
+    return model
+
+
+# Letter-size page at 200 DPI — the default render resolution
+def make_letter_200dpi() -> PILImage.Image:
+    return PILImage.fromarray(np.random.randint(0, 255, (2200, 1700, 3), dtype=np.uint8))
+
+
+def run_image_processing():
+    model = make_model()
+    img = make_letter_200dpi()
+    return model.image_processing(img)
+
+
+def test_benchmark_yolox_image_processing(benchmark):
+    benchmark(run_image_processing)
diff --git a/pyproject.toml b/pyproject.toml
@@ -128,5 +128,8 @@ filterwarnings = [
     "ignore::DeprecationWarning",
 ]
 
+[tool.codeflash]
+benchmarks-root = "benchmarks"
+
 [tool.coverage.report]
 fail_under = 90
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "1.6.0"  # pragma: no cover
+__version__ = "1.6.1"  # pragma: no cover
diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py
@@ -107,13 +107,17 @@ def image_processing(
         # TODO (benjamin): check other shapes for inference
         input_shape = (1024, 768)
         origin_img = np.array(image)
+        image.close()
         img, ratio = preprocess(origin_img, input_shape)
+        del origin_img  # Free full-size image array before ONNX inference
         session = self.model
 
         ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
         output = session.run(None, ort_inputs)
+        del img, ort_inputs  # Free preprocessed inputs after inference
         # TODO(benjamin): check for p6
         predictions = demo_postprocess(output[0], input_shape, p6=False)[0]
+        del output
 
         boxes = predictions[:, :4]
         scores = predictions[:, 4:5] * predictions[:, 5:]

Original file line number	Diff line number	Diff line change
`@@ -128,5 +128,8 @@ filterwarnings = [`
`128`	`128`	`"ignore::DeprecationWarning",`
`129`	`129`	`]`
`130`	`130`
	`131`	`+[tool.codeflash]`
	`132`	`+benchmarks-root = "benchmarks"`
	`133`	`+`
`131`	`134`	`[tool.coverage.report]`
`132`	`135`	`fail_under = 90`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "1.6.0" # pragma: no cover`
	`1`	`+__version__ = "1.6.1" # pragma: no cover`