diff --git a/.github/workflows/partition-benchmark.yaml b/.github/workflows/partition-benchmark.yaml
new file mode 100644
index 0000000000..b0f4ba9732
--- /dev/null
+++ b/.github/workflows/partition-benchmark.yaml
@@ -0,0 +1,120 @@
+name: Partition Benchmark
+
+# Runs on every PR targeting main to detect partition() runtime regressions.
+# Can also be triggered manually to establish or inspect a new baseline.
+on:
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+env:
+  NLTK_DATA: ${{ github.workspace }}/nltk_data
+  PYTHON_VERSION: "3.12"
+  # Number of times to run the full benchmark suite.
+  NUM_ITERATIONS: "3"
+  # Allowed slowdown relative to the stored best (20% for now; tune as needed).
+  REGRESSION_THRESHOLD: "0.20"
+  # Bump to invalidate the cache when benchmark-affecting dependencies change, ensuring a clean-slate run.
+  CACHE_VERSION: "v2"
+  # S3 location for metrics – matches core-product convention.
+  S3_METRICS_BUCKET_KEY: utic-metrics/ci-metrics
+  S3_BENCHMARK_PATH: open-source/partition-benchmark/benchmark_best.json
+
+jobs:
+  setup:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/base-cache
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+  benchmark:
+    name: Measure and compare partition() runtime
+    runs-on: ubuntu-latest
+    needs: [setup]
+
+    steps:
+
+      - uses: actions/checkout@v4
+
+      - uses: ./.github/actions/base-cache
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libmagic-dev poppler-utils libreoffice
+          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
+          sudo apt-get update
+          sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
+
+
+      - name: Restore HuggingFace model cache
+        uses: actions/cache/restore@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }}
+          restore-keys: |
+            hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-
+            hf-models-${{ runner.os }}-
+
+
+      - name: Run partition benchmark
+        env:
+          NUM_ITERATIONS: ${{ env.NUM_ITERATIONS }}
+        run: |
+          uv run --no-sync python scripts/performance/benchmark_partition.py \
+            benchmark_results.json
+
+      - name: Save HuggingFace model cache
+        uses: actions/cache/save@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }}
+
+
+      - name: Download previous best from S3
+        continue-on-error: true
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }}
+        run: |
+          aws s3 cp \
+            "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}" \
+            benchmark_best.json
+
+
+      - name: Compare results against stored best
+        id: compare
+        run: |
+          uv run --no-sync python scripts/performance/compare_benchmark.py \
+            benchmark_results.json \
+            benchmark_best.json \
+            ${{ env.REGRESSION_THRESHOLD }}
+
+
+      - name: Upload best result to S3
+        continue-on-error: true
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }}
+        run: |
+          aws s3 cp \
+            benchmark_best.json \
+            "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}"
+
+
+      - name: Upload benchmark artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results-${{ github.sha }}
+          path: |
+            benchmark_results.json
+            benchmark_best.json
+          retention-days: 30
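
To inspect the stored baseline outside of CI, a minimal sketch (assuming boto3 is installed, read credentials are configured, and the bucket/key split of the s3:// URI built above):

    import boto3

    # Bucket and key correspond to
    # s3://utic-metrics/ci-metrics/open-source/partition-benchmark/benchmark_best.json
    s3 = boto3.client("s3")
    s3.download_file(
        "utic-metrics",
        "ci-metrics/open-source/partition-benchmark/benchmark_best.json",
        "benchmark_best.json",
    )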
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 09ca6d8910..d030366edf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,6 @@
+## 0.21.4
+- Add a GitHub Actions workflow that benchmarks partition() and fails CI on runtime regressions
+
 ## 0.21.3

 ### Enhancements
diff --git a/scripts/performance/benchmark_partition.py b/scripts/performance/benchmark_partition.py
new file mode 100644
index 0000000000..50b712cf8a
--- /dev/null
+++ b/scripts/performance/benchmark_partition.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""Measure partition() runtime over a fixed set of representative example-docs files.
+
+Follows the same conventions as the existing scripts/performance tooling:
+  - PDFs and images are run with strategy="hi_res".
+  - Everything else is run with strategy="fast".
+  - Each file is timed over NUM_ITERATIONS runs (after a warmup) and the
+    average is recorded, matching time_partition.py behaviour.
+
+Writes a JSON file mapping each file to its average runtime, plus a ``__total__``
+key with the wall-clock total. An optional positional argument sets the output
+path (default: scripts/performance/partition-speed-test/benchmark_results.json).
+
+Also writes the total duration to $GITHUB_OUTPUT as ``duration=<total seconds>``.
+
+Usage:
+    uv run --no-sync python scripts/performance/benchmark_partition.py [output.json]
+
+Environment variables:
+    NUM_ITERATIONS    number of timed iterations per file (default: 1)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import sys
+import time
+from pathlib import Path
+
+from unstructured.partition.auto import partition
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+logger = logging.getLogger(__name__)
+
+
+BENCHMARK_FILES: list[tuple[str, str]] = [
+    # PDFs - hi_res
+    ("example-docs/pdf/a1977-backus-p21.pdf", "hi_res"),
+    ("example-docs/pdf/copy-protected.pdf", "hi_res"),
+    ("example-docs/pdf/reliance.pdf", "hi_res"),
+    ("example-docs/pdf/pdf-with-ocr-text.pdf", "hi_res"),
+    # Images - hi_res
+    ("example-docs/double-column-A.jpg", "hi_res"),
+    ("example-docs/double-column-B.jpg", "hi_res"),
+    ("example-docs/embedded-images-tables.jpg", "hi_res"),
+    # Other document types - fast
+    ("example-docs/contains-pictures.docx", "fast"),
+    ("example-docs/example-10k-1p.html", "fast"),
+    ("example-docs/science-exploration-1p.pptx", "fast"),
+]
+
+NUM_ITERATIONS: int = int(os.environ.get("NUM_ITERATIONS", "1"))
+
+DEFAULT_OUTPUT = Path(__file__).parent / "partition-speed-test" / "benchmark_results.json"
+
+
+def _warmup(filepath: str) -> None:
+    """Run a single fast-strategy partition to warm the process up.
+
+    Mirrors warm_up_process() in time_partition.py: uses a warmup-docs/
+    variant if present, otherwise falls back to the file itself.
+    """
+
+    warmup_dir = Path(__file__).parent / "warmup-docs"
+    warmup_file = warmup_dir / f"warmup{Path(filepath).suffix}"
+    target = str(warmup_file) if warmup_file.exists() else filepath
+    partition(target, strategy="fast")
+
+
+def _measure(filepath: str, strategy: str, iterations: int) -> float:
+    """Return the average wall-clock seconds for partitioning *filepath*.
+
+    Identical logic to time_partition.measure_execution_time().
+    """
+
+    total = 0.0
+    for _ in range(iterations):
+        t0 = time.perf_counter()
+        partition(filepath, strategy=strategy)
+        total += time.perf_counter() - t0
+    return total / iterations
+
+
+def _set_github_output(key: str, value: str) -> None:
+    """Write key=value to $GITHUB_OUTPUT when running in Actions."""
+    gho = os.environ.get("GITHUB_OUTPUT")
+    if gho:
+        with open(gho, "a") as fh:
+            fh.write(f"{key}={value}\n")
+
+
+def main() -> None:
+    output_path = Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_OUTPUT
+    repo_root = Path(__file__).resolve().parent.parent.parent  # scripts/performance/ -> repo root
+
+    logger.info("=" * 60)
+    logger.info(f"Partition benchmark (NUM_ITERATIONS={NUM_ITERATIONS})")
+    logger.info("=" * 60)
+
+    results: dict[str, float] = {}
+    grand_start = time.perf_counter()
+
+    for rel_path, strategy in BENCHMARK_FILES:
+        filepath = repo_root / rel_path
+        if not filepath.exists():
+            logger.warning(f"  WARNING: {rel_path} not found – skipping.")
+            continue
+
+        logger.info(f"  {rel_path} (strategy={strategy}, iterations={NUM_ITERATIONS})")
+        _warmup(str(filepath))
+        avg = _measure(str(filepath), strategy, NUM_ITERATIONS)
+        results[rel_path] = round(avg, 4)
+        logger.info(f"    avg {avg:.2f}s")
+
+    total_seconds = round(time.perf_counter() - grand_start, 2)
+    results["__total__"] = total_seconds
+
+    logger.info(f"\nTotal wall-clock time: {total_seconds}s")
+
+    # Write JSON results file (consumed by compare_benchmark.py)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(json.dumps(results, indent=2) + "\n")
+    logger.info(f"Results written to {output_path}")
+
+    # Also expose total as a GitHub Actions step output
+    _set_github_output("duration", str(int(total_seconds)))
+
+
+if __name__ == "__main__":
+    main()
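
The results file is a flat mapping of relative file path to average seconds, plus the ``__total__`` key. A minimal sketch for eyeballing a local run, assuming the default output path:

    import json
    from pathlib import Path

    results = json.loads(
        Path("scripts/performance/partition-speed-test/benchmark_results.json").read_text()
    )
    total = results.pop("__total__")
    # Slowest files first.
    for rel_path, seconds in sorted(results.items(), key=lambda kv: kv[1], reverse=True):
        print(f"{seconds:7.2f}s  {rel_path}")
    print(f"{total:7.2f}s  total")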
+ """ + + total = 0.0 + for _ in range(iterations): + t0 = time.perf_counter() + partition(filepath, strategy=strategy) + total += time.perf_counter() - t0 + return total / iterations + + +def _set_github_output(key: str, value: str) -> None: + """Write key=value to $GITHUB_OUTPUT when running in Actions.""" + gho = os.environ.get("GITHUB_OUTPUT") + if gho: + with open(gho, "a") as fh: + fh.write(f"{key}={value}\n") + + +def main() -> None: + output_path = Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_OUTPUT + repo_root = Path(__file__).resolve().parent.parent.parent # scripts/performance/ -> repo root + + logger.info("=" * 60) + logger.info(f"Partition benchmark (NUM_ITERATIONS={NUM_ITERATIONS})") + logger.info("=" * 60) + + results: dict[str, float] = {} + grand_start = time.perf_counter() + + for rel_path, strategy in BENCHMARK_FILES: + filepath = repo_root / rel_path + if not filepath.exists(): + logger.warning(f" WARNING: {rel_path} not found – skipping.") + continue + + logger.info(f" {rel_path} (strategy={strategy}, iterations={NUM_ITERATIONS})") + _warmup(str(filepath)) + avg = _measure(str(filepath), strategy, NUM_ITERATIONS) + results[rel_path] = round(avg, 4) + logger.info(f" avg {avg:.2f}s") + + total_seconds = round(time.perf_counter() - grand_start, 2) + results["__total__"] = total_seconds + + logger.info(f"\nTotal wall-clock time: {total_seconds}s") + + # Write JSON results file (consumed by compare_benchmark.py) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(results, indent=2) + "\n") + logger.info(f"Results written to {output_path}") + + # Also expose total as a GitHub Actions step output + _set_github_output("duration", str(int(total_seconds))) + + +if __name__ == "__main__": + main() diff --git a/scripts/performance/compare_benchmark.py b/scripts/performance/compare_benchmark.py new file mode 100644 index 0000000000..a12f32bf81 --- /dev/null +++ b/scripts/performance/compare_benchmark.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +"""Compare current benchmark results against the stored best runtime. + +Usage: + uv run --no-sync python scripts/performance/compare_benchmark.py \ + benchmark_results.json \ + benchmark_best.json \ + [threshold] + + current.json JSON produced by benchmark_partition.py for this run + best.json JSON produced by a previous run (the stored best); may not + exist yet on the very first run + threshold Float regression allowance, e.g. 
+
+
+def main() -> None:
+    if len(sys.argv) < 3:
+        print(__doc__, file=sys.stderr)
+        sys.exit(2)
+
+    current_path = Path(sys.argv[1])
+    best_path = Path(sys.argv[2])
+    threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.20
+
+    current: dict[str, float] = json.loads(current_path.read_text())
+    current_total: float = current["__total__"]
+
+    if not best_path.exists():
+        logger.info("No stored best found – saving current run as the baseline.")
+        logger.info(f"  Total: {current_total:.2f}s")
+        best_path.parent.mkdir(parents=True, exist_ok=True)
+        best_path.write_text(current_path.read_text())
+        _github_output("new_best", "true")
+        _github_output("regression", "false")
+        sys.exit(0)
+
+    best: dict[str, float] = json.loads(best_path.read_text())
+    best_total: float = best["__total__"]
+    limit: float = best_total * (1.0 + threshold)
+
+    # Collect all file keys (exclude the __total__ sentinel)
+    all_files = sorted((set(current.keys()) | set(best.keys())) - {"__total__"})
+
+    col_w = max((len(f) for f in all_files), default=40) + 2
+    header = f"{'File':<{col_w}} {'Current':>9} {'Best':>9} {'Delta':>8}"
+    logger.info("=" * len(header))
+    logger.info("Partition benchmark comparison")
+    logger.info("=" * len(header))
+    logger.info(header)
+    logger.info("-" * len(header))
+
+    for fname in all_files:
+        c = current.get(fname, float("nan"))
+        b = best.get(fname, float("nan"))
+        logger.info(f"{fname:<{col_w}} {_fmt(c)} {_fmt(b)} {_pct_diff(c, b):>8}")
+
+    logger.info("-" * len(header))
+    logger.info(
+        f"{'TOTAL':<{col_w}} {_fmt(current_total)} {_fmt(best_total)}"
+        f" {_pct_diff(current_total, best_total):>8}"
+    )
+    logger.info("")
+    logger.info(f"Threshold : {threshold * 100:.0f}% (fail if current > {limit:.2f}s)")
+    logger.info("")
+
+    # Fail on regression beyond threshold.
+    if current_total > limit:
+        excess_pct = (current_total - best_total) / best_total * 100
+        logger.error(
+            f"FAIL: current runtime {current_total:.2f}s exceeds best "
+            f"{best_total:.2f}s by {excess_pct:.1f}% "
+            f"(threshold {threshold * 100:.0f}%, limit {limit:.2f}s)."
+        )
+        _github_output("new_best", "false")
+        _github_output("regression", "true")
+        sys.exit(1)
+
+    # Pass: current is within threshold of best; update best if current is faster.
+    if current_total < best_total:
+        improvement_pct = (best_total - current_total) / best_total * 100
+        logger.info(
+            f"PASS (new best): {current_total:.2f}s is {improvement_pct:.1f}% "
+            f"faster than the previous best {best_total:.2f}s – updating in S3."
+        )
+        best_path.write_text(current_path.read_text())
+        _github_output("new_best", "true")
+    else:
+        slack_pct = (current_total - best_total) / best_total * 100
+        logger.info(
+            f"PASS: {current_total:.2f}s is {slack_pct:.1f}% slower than best "
+            f"{best_total:.2f}s (within {threshold * 100:.0f}% threshold)."
+        )
+        _github_output("new_best", "false")
+
+    _github_output("regression", "false")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
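
The exit-code contract (0 = pass, 1 = regression) can be sanity-checked locally with synthetic results; the file names and timings below are made up for illustration:

    import json
    import subprocess
    import sys
    from pathlib import Path

    # Current run is 30% slower than the stored best, which should trip the 20% gate.
    Path("best.json").write_text(json.dumps({"example-docs/pdf/reliance.pdf": 10.0, "__total__": 10.0}))
    Path("current.json").write_text(json.dumps({"example-docs/pdf/reliance.pdf": 13.0, "__total__": 13.0}))

    proc = subprocess.run(
        [sys.executable, "scripts/performance/compare_benchmark.py", "current.json", "best.json", "0.20"]
    )
    assert proc.returncode == 1  # regression detected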
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 95bb48a3fe..03a41ffa02 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.21.3"  # pragma: no cover
+__version__ = "0.21.4"  # pragma: no cover
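
To see what the benchmark script exports as a step output without a CI run, GITHUB_OUTPUT can be pointed at a throwaway local file (a sketch only; running it requires example-docs and the hi_res dependencies to be installed):

    import os
    import subprocess
    import sys
    from pathlib import Path

    out_file = Path("github_output.txt")  # stand-in for the file Actions provides
    out_file.write_text("")
    env = {**os.environ, "GITHUB_OUTPUT": str(out_file), "NUM_ITERATIONS": "1"}
    subprocess.run(
        [sys.executable, "scripts/performance/benchmark_partition.py", "benchmark_results.json"],
        env=env,
        check=True,
    )
    print(out_file.read_text())  # expect a line like "duration=<total seconds>"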