Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,12 @@ CLAUDE.md
.DS_Store
Thumbs.db

# Bundled native libraries (binary artifacts)
python/paperjam/libpdfium.so

# Test fixtures (generated)
tests/fixtures/large_*.pdf

# Per-session test artifacts (accuracy reports, etc.)
tests/output/

# Sphinx
_build

# Lock file (library)
uv.lock

Expand Down
12 changes: 11 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ members = [
]

[workspace.package]
version = "0.1.3"
version = "0.2.0"
edition = "2021"
rust-version = "1.75"
license = "MIT"
Expand All @@ -46,3 +46,13 @@ roxmltree = "0.20"
ureq = { version = "3", default-features = false, features = ["rustls-no-provider"] }
rustls = { version = "0.23", default-features = false, features = ["aws_lc_rs", "logging", "std", "tls12"] }
tokio = { version = "1", features = ["rt-multi-thread"] }

# Release profile: thin LTO, a single codegen unit, and symbols stripped
# from the produced binaries.
# NOTE(review): presumably chosen to favor smaller/faster artifacts over
# compile time — confirm the build-time cost is acceptable in CI.
[profile.release]
lto = "thin"
codegen-units = 1
strip = "symbols"

# Custom profile layered on `release`: inherits its settings, but keeps
# symbols (strip = "none") and emits debug info.
# Select it with `cargo build --profile release-with-debug`.
[profile.release-with-debug]
inherits = "release"
strip = "none"
debug = true
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,16 +79,18 @@ steps:
```

```bash
paperjam pipeline run pipeline.yaml
pj pipeline run pipeline.yaml
```

### CLI usage

The CLI binary installed by `cargo install paperjam-cli` is named `pj`:

```bash
paperjam extract text report.pdf
paperjam extract tables data.pdf --format csv
paperjam convert report.pdf report.docx
paperjam info document.pdf
pj info document.pdf
pj extract text report.pdf
pj extract tables data.pdf --strategy lattice --format json
pj convert auto report.pdf -o report.docx
```

### MCP server
Expand Down
20 changes: 13 additions & 7 deletions docs-site/docs/getting-started/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,23 @@ pip install "paperjam[pandas]"

### Documentation

To build these docs locally:
The docs site uses [Docusaurus](https://docusaurus.io/). To build it locally:

```bash
pip install "paperjam[docs]"
cd docs
make html
git clone https://github.com/ByteVeda/paperjam
cd paperjam/docs-site
npm ci
npm run start # dev server with hot reload
npm run build # static site under docs-site/build/
```

## Installing from source

Building from source requires a Rust toolchain (stable, 1.77+) and [maturin](https://maturin.rs/):
Building from source requires a Rust toolchain (stable, 1.75+) and [maturin](https://maturin.rs/):

```bash
pip install maturin
git clone https://github.com/paperjam/paperjam
git clone https://github.com/ByteVeda/paperjam
cd paperjam
maturin develop --release
```
Expand All @@ -52,7 +54,11 @@ Pre-built wheels on PyPI include all features.
| Feature | Methods enabled |
|---------|----------------|
| `render` | `render_page`, `render_pages`, `page.render`, `visual_diff` |
| `signatures` | `signatures`, `verify_signatures`, `sign` |
| `signatures` | `sign_document`, `verify_signatures`, `extract_signatures` |
| `ltv` | LTV timestamp embedding (TSA, OCSP, CRL) for signing |
| `validation` | `validate_pdf_a`, `validate_pdf_ua`, `convert_to_pdf_a` |
| `parallel` | Rayon-based parallel processing (default) |
| `mmap` | Memory-mapped file access for large documents |

When building from source you can control features with the `--features` flag:

Expand Down
27 changes: 27 additions & 0 deletions py_src/paperjam/_paperjam.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,33 @@ def fill_form(
document: RustDocument,
values: dict[str, str],
need_appearances: bool = True,
generate_appearances: bool = False,
) -> tuple[RustDocument, dict[str, Any]]: ...
# Type stub for `modify_form_field`.
# Takes the document and the name of the field to change; every other
# argument is keyword-only and defaults to None.
# NOTE(review): None presumably means "leave this property unchanged" —
# confirm against the implementation.
# Returns a (document, dict) pair; the dict's keys are not visible from
# this stub.
def modify_form_field(
    document: RustDocument,
    field_name: str,
    *,
    value: str | None = None,
    default_value: str | None = None,
    read_only: bool | None = None,
    required: bool | None = None,
    max_length: int | None = None,
    # NOTE(review): expected keys of each option dict are not shown here — confirm.
    options: list[dict[str, str]] | None = None,
) -> tuple[RustDocument, dict[str, Any]]: ...
# Type stub for `add_form_field`.
# Required arguments: the document, the new field's name, its type, the
# target page, and a rectangle given as four floats.
# Unlike `modify_form_field`, the optional arguments are not keyword-only.
# Returns a (document, dict) pair; the dict's keys are not visible from
# this stub.
def add_form_field(
    document: RustDocument,
    name: str,
    # NOTE(review): accepted field_type strings are not visible here — confirm.
    field_type: str,
    # NOTE(review): page index base (0 vs 1) is not visible here — confirm.
    page: int,
    # NOTE(review): coordinate order and units of rect are not visible here — confirm.
    rect: tuple[float, float, float, float],
    value: str | None = None,
    default_value: str | None = None,
    read_only: bool = False,
    required: bool = False,
    max_length: int | None = None,
    options: list[dict[str, str]] | None = None,
    # NOTE(review): 0.0 presumably requests automatic sizing — confirm.
    font_size: float = 0.0,
    generate_appearance: bool = True,
) -> tuple[RustDocument, dict[str, Any]]: ...
def render_page(
document: RustDocument,
Expand Down
34 changes: 23 additions & 11 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ build-backend = "maturin"

[project]
name = "paperjam"
version = "0.1.3"
description = "Fast PDF processing powered by Rust"
version = "0.2.0"
description = "Fast multi-format document processing (PDF, DOCX, XLSX, PPTX, HTML, EPUB) powered by Rust"
readme = "README.md"
license = { text = "MIT" }
requires-python = ">=3.12"
classifiers = [
Expand All @@ -17,22 +18,33 @@ classifiers = [
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Rust",
"Topic :: Office/Business",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing",
"Typing :: Typed",
]
keywords = ["pdf", "text-extraction", "table-extraction", "rust"]
keywords = [
"pdf",
"docx",
"xlsx",
"pptx",
"html",
"epub",
"text-extraction",
"table-extraction",
"document-conversion",
"rust",
]

[project.urls]
Homepage = "https://docs.byteveda.org/paperjam/"
Documentation = "https://docs.byteveda.org/paperjam/"
Repository = "https://github.com/ByteVeda/paperjam"
Issues = "https://github.com/ByteVeda/paperjam/issues"
Changelog = "https://github.com/ByteVeda/paperjam/blob/main/CHANGELOG.md"

[project.optional-dependencies]
pandas = ["pandas>=2.0"]
docs = [
"sphinx>=7.0",
"furo>=2024.1.29",
"myst-parser>=2.0",
"sphinx-copybutton>=0.5",
"sphinxcontrib-mermaid>=2.0",
"sphinx-autobuild>=2024.0.0",
]
dev = [
"pre-commit>=4.0",
"pytest>=8.0",
Expand Down
Loading