120 changes: 120 additions & 0 deletions .github/workflows/partition-benchmark.yaml
@@ -0,0 +1,120 @@
name: Partition Benchmark

# Runs on every PR targeting main to detect regressions.
# Can also be triggered manually to establish or inspect a new baseline.
on:
  pull_request:
    branches: [main]
  workflow_dispatch:

permissions:
  contents: read

env:
  NLTK_DATA: ${{ github.workspace }}/nltk_data
  PYTHON_VERSION: "3.12"
  # Number of times to run the full benchmark suite.
  NUM_ITERATIONS: "3"
  # 20% regression allowance for now; tune later.
  REGRESSION_THRESHOLD: "0.20"
  # Bump to change the cache key when benchmark-affecting dependencies are updated, ensuring clean-slate runs.
  CACHE_VERSION: "v2"
  # S3 location for metrics – matches the core-product convention.
  S3_METRICS_BUCKET_KEY: utic-metrics/ci-metrics
  S3_BENCHMARK_PATH: open-source/partition-benchmark/benchmark_best.json

jobs:
  setup:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}

  benchmark:
    name: Measure and compare partition() runtime
    runs-on: ubuntu-latest
    needs: [setup]

    steps:
      - uses: actions/checkout@v4

      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr tesseract-ocr-kor

      - name: Restore HuggingFace model cache
        uses: actions/cache/restore@v4
        with:
          path: ~/.cache/huggingface
          key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }}
          restore-keys: |
            hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-
            hf-models-${{ runner.os }}-

      - name: Run partition benchmark
        env:
          NUM_ITERATIONS: ${{ env.NUM_ITERATIONS }}
        run: |
          uv run --no-sync python scripts/performance/benchmark_partition.py \
            benchmark_results.json

      - name: Save HuggingFace model cache
        uses: actions/cache/save@v4
        with:
          path: ~/.cache/huggingface
          key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }}

      - name: Download previous best from S3
        continue-on-error: true
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }}
        run: |
          aws s3 cp \
            "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}" \
            benchmark_best.json

      - name: Compare results against stored best
        id: compare
        run: |
          uv run --no-sync python scripts/performance/compare_benchmark.py \
            benchmark_results.json \
            benchmark_best.json \
            ${{ env.REGRESSION_THRESHOLD }}

      - name: Upload best result to S3
        continue-on-error: true
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }}
        run: |
          aws s3 cp \
            benchmark_best.json \
            "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}"

      - name: Upload benchmark artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-${{ github.sha }}
          path: |
            benchmark_results.json
            benchmark_best.json
          retention-days: 30
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,6 @@
## 0.21.4
- Add a GitHub Actions workflow that detects `partition()` runtime regressions

## 0.21.3

### Enhancements
132 changes: 132 additions & 0 deletions scripts/performance/benchmark_partition.py
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""Measure partition() runtime over a fixed set of representative example-docs files.

Follows the same conventions as the existing scripts/performance tooling:
- PDFs and images are run with strategy="hi_res".
- Everything else is run with strategy="fast".
- Each file is timed over NUM_ITERATIONS runs (after a warmup) and the
average is recorded, matching time_partition.py behaviour.

Writes a JSON file mapping each file to its average runtime, plus a ``__total__``
key with the wall-clock total. An optional positional argument sets the output
path (default: scripts/performance/partition-speed-test/benchmark_results.json).

Also writes the total duration to $GITHUB_OUTPUT as ``duration=<seconds>``.

Usage:
uv run --no-sync python scripts/performance/benchmark_partition.py [output.json]

Environment variables:
NUM_ITERATIONS number of timed iterations per file (default: 1)
"""

from __future__ import annotations

import json
import logging
import os
import sys
import time
from pathlib import Path

from unstructured.partition.auto import partition

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


BENCHMARK_FILES: list[tuple[str, str]] = [
    # PDFs - hi_res
    ("example-docs/pdf/a1977-backus-p21.pdf", "hi_res"),
    ("example-docs/pdf/copy-protected.pdf", "hi_res"),
    ("example-docs/pdf/reliance.pdf", "hi_res"),
    ("example-docs/pdf/pdf-with-ocr-text.pdf", "hi_res"),
    # Images - hi_res
    ("example-docs/double-column-A.jpg", "hi_res"),
    ("example-docs/double-column-B.jpg", "hi_res"),
    ("example-docs/embedded-images-tables.jpg", "hi_res"),
    # Other document types - fast
    ("example-docs/contains-pictures.docx", "fast"),
    ("example-docs/example-10k-1p.html", "fast"),
    ("example-docs/science-exploration-1p.pptx", "fast"),
]
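# New entries follow the same (relative path, strategy) shape, e.g. a hypothetical
# ("example-docs/some-new-doc.epub", "fast") would be timed like the rest.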

NUM_ITERATIONS: int = int(os.environ.get("NUM_ITERATIONS", "1"))

DEFAULT_OUTPUT = Path(__file__).parent / "partition-speed-test" / "benchmark_results.json"


def _warmup(filepath: str) -> None:
"""Run a single fast-strategy partition to warm the process up.

Mirrors warm_up_process() in time_partition.py: uses a warmup-docs/
variant if present, otherwise falls back to the file itself.
"""

warmup_dir = Path(__file__).parent / "warmup-docs"
warmup_file = warmup_dir / f"warmup{Path(filepath).suffix}"
target = str(warmup_file) if warmup_file.exists() else filepath
partition(target, strategy="fast")


def _measure(filepath: str, strategy: str, iterations: int) -> float:
"""Return the average wall-clock seconds for partitioning *filepath*.

Identical logic to time_partition.measure_execution_time().
"""

total = 0.0
for _ in range(iterations):
t0 = time.perf_counter()
partition(filepath, strategy=strategy)
total += time.perf_counter() - t0
return total / iterations


def _set_github_output(key: str, value: str) -> None:
"""Write key=value to $GITHUB_OUTPUT when running in Actions."""
gho = os.environ.get("GITHUB_OUTPUT")
if gho:
with open(gho, "a") as fh:
fh.write(f"{key}={value}\n")


def main() -> None:
    output_path = Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_OUTPUT
    repo_root = Path(__file__).resolve().parent.parent.parent  # scripts/performance/ -> repo root

    logger.info("=" * 60)
    logger.info(f"Partition benchmark (NUM_ITERATIONS={NUM_ITERATIONS})")
    logger.info("=" * 60)

    results: dict[str, float] = {}
    grand_start = time.perf_counter()

    for rel_path, strategy in BENCHMARK_FILES:
        filepath = repo_root / rel_path
        if not filepath.exists():
            logger.warning(f" WARNING: {rel_path} not found – skipping.")
            continue

        logger.info(f" {rel_path} (strategy={strategy}, iterations={NUM_ITERATIONS})")
        _warmup(str(filepath))
        avg = _measure(str(filepath), strategy, NUM_ITERATIONS)
        results[rel_path] = round(avg, 4)
        logger.info(f" avg {avg:.2f}s")

    total_seconds = round(time.perf_counter() - grand_start, 2)
    results["__total__"] = total_seconds

    logger.info(f"\nTotal wall-clock time: {total_seconds}s")

    # Write JSON results file (consumed by compare_benchmark.py)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(results, indent=2) + "\n")
    logger.info(f"Results written to {output_path}")

    # Also expose total as a GitHub Actions step output
    _set_github_output("duration", str(int(total_seconds)))


if __name__ == "__main__":
    main()
136 changes: 136 additions & 0 deletions scripts/performance/compare_benchmark.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""Compare current benchmark results against the stored best runtime.

Usage:
    uv run --no-sync python scripts/performance/compare_benchmark.py \
        benchmark_results.json \
        benchmark_best.json \
        [threshold]

    current.json  JSON produced by benchmark_partition.py for this run
    best.json     JSON produced by a previous run (the stored best); may not
                  exist yet on the very first run
    threshold     Float regression allowance, e.g. 0.20 for 20% (default 0.20)
"""

from __future__ import annotations

import json
import logging
import math
import os
import sys
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)


def _github_output(key: str, value: str) -> None:
"""Write a key=value pair to $GITHUB_OUTPUT when running in Actions."""
gho = os.environ.get("GITHUB_OUTPUT")
if gho:
with open(gho, "a") as fh:
fh.write(f"{key}={value}\n")


def _fmt(seconds: float) -> str:
"""Format a duration, handling NaN for files missing from one side."""
if math.isnan(seconds):
return " n/a"
return f"{seconds:7.2f}s"


def _pct_diff(current: float, best: float) -> str:
    if best == 0:
        return " n/a"
    diff = (current - best) / best * 100
    sign = "+" if diff >= 0 else ""
    return f"{sign}{diff:.1f}%"


def main() -> None:
    if len(sys.argv) < 3:
        print(__doc__, file=sys.stderr)
        sys.exit(2)

    current_path = Path(sys.argv[1])
    best_path = Path(sys.argv[2])
    threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.20

    current: dict[str, float] = json.loads(current_path.read_text())
    current_total: float = current["__total__"]

    if not best_path.exists():
        logger.info("No stored best found – saving current run as the baseline.")
        logger.info(f" Total: {current_total:.2f}s")
        best_path.parent.mkdir(parents=True, exist_ok=True)
        best_path.write_text(current_path.read_text())
        _github_output("new_best", "true")
        _github_output("regression", "false")
        sys.exit(0)

    best: dict[str, float] = json.loads(best_path.read_text())
    best_total: float = best["__total__"]
    limit: float = best_total * (1.0 + threshold)

    # Collect all file keys (exclude the __total__ sentinel)
    all_files = sorted((set(current.keys()) | set(best.keys())) - {"__total__"})

    col_w = max((len(f) for f in all_files), default=40) + 2
    header = f"{'File':<{col_w}} {'Current':>9} {'Best':>9} {'Delta':>8}"
    logger.info("=" * len(header))
    logger.info("Partition benchmark comparison")
    logger.info("=" * len(header))
    logger.info(header)
    logger.info("-" * len(header))

    for fname in all_files:
        c = current.get(fname, float("nan"))
        b = best.get(fname, float("nan"))
        logger.info(f"{fname:<{col_w}} {_fmt(c)} {_fmt(b)} {_pct_diff(c, b):>8}")

    logger.info("-" * len(header))
    logger.info(
        f"{'TOTAL':<{col_w}} {_fmt(current_total)} {_fmt(best_total)}"
        f" {_pct_diff(current_total, best_total):>8}"
    )
    logger.info("")
    logger.info(f"Threshold : {threshold * 100:.0f}% (fail if current > {limit:.2f}s)")
    logger.info("")

    # fail on regression beyond threshold
    if current_total > limit:
        excess_pct = (current_total - best_total) / best_total * 100
        logger.error(
            f"FAIL: current runtime {current_total:.2f}s exceeds best "
            f"{best_total:.2f}s by {excess_pct:.1f}% "
            f"(threshold {threshold * 100:.0f}%, limit {limit:.2f}s)."
        )
        _github_output("new_best", "false")
        _github_output("regression", "true")
        sys.exit(1)

    # pass: current is within threshold of best; update best if current is faster
    if current_total < best_total:
        improvement_pct = (best_total - current_total) / best_total * 100
        logger.info(
            f"PASS (new best): {current_total:.2f}s is {improvement_pct:.1f}% "
            f"faster than the previous best {best_total:.2f}s – updating in S3."
        )
        best_path.write_text(current_path.read_text())
        _github_output("new_best", "true")
    else:
        slack_pct = (current_total - best_total) / best_total * 100
        logger.info(
            f"PASS: {current_total:.2f}s is {slack_pct:.1f}% slower than best "
            f"{best_total:.2f}s (within {threshold * 100:.0f}% threshold)."
        )
        _github_output("new_best", "false")

    _github_output("regression", "false")
    sys.exit(0)


if __name__ == "__main__":
    main()