Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions app/converters/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,45 @@ def _deny_url_fetcher(url: str, **kwargs) -> None:


# ---------------------------------------------------------------------------
# DOCX → PDF (mammoth → HTML → WeasyPrint → PDF)
# ---------------------------------------------------------------------------
# Pure-Python pipeline: mammoth extracts the DOCX body as HTML with images
# inlined as data: URIs, WeasyPrint renders it to PDF. SSRF-safe — the same
# _deny_url_fetcher used by md→pdf blocks any external URL the converted
# HTML might reference. Best-effort: footnotes, headers/footers and embedded
# OLE objects get simplified by mammoth; tables, images, hyperlinks and
# basic paragraph styles survive intact.
@register(("docx", "pdf"))
class DocxToPdfConverter(BaseConverter):
    """Convert a DOCX file to PDF via mammoth (DOCX → HTML) and WeasyPrint (HTML → PDF)."""

    def convert(self, input_path: Path, output_path: Path, **kwargs) -> Path:
        """Render the DOCX at ``input_path`` to a PDF written at ``output_path``.

        Args:
            input_path: Location of the source ``.docx`` file.
            output_path: Destination the PDF is written to.
            **kwargs: Accepted but not used by this converter.

        Returns:
            ``output_path``, unchanged.
        """
        # The superseded docx2pdf import and call are removed on purpose:
        # that package is not a declared dependency, so importing it failed
        # at runtime before the pipeline below could run.
        import mammoth
        import weasyprint

        with open(input_path, "rb") as f:
            result = mammoth.convert_to_html(f)
        html_body = result.value
        had_warnings = any(m.type == "warning" for m in result.messages)

        # Show a banner only when mammoth reported at least one warning.
        notice = (
            (
                '<div style="background:#fff3cd;border:1px solid #ffe39a;'
                'padding:8px;font-size:10pt;margin-bottom:1em">'
                "Some elements were simplified during conversion "
                "(footnotes, headers/footers, or embedded objects)."
                "</div>"
            )
            if had_warnings
            else ""
        )

        full_html = (
            "<!DOCTYPE html><html><head>"
            "<style>body{font-family:sans-serif;margin:2cm;line-height:1.6}"
            "table{border-collapse:collapse}"
            "td,th{border:1px solid #999;padding:4pt}</style>"
            f"</head><body>{notice}{html_body}</body></html>"
        )
        # _deny_url_fetcher is passed so WeasyPrint does not fetch URLs
        # referenced by the generated HTML.
        weasyprint.HTML(string=full_html, url_fetcher=_deny_url_fetcher).write_pdf(str(output_path))
        return output_path


Expand Down
11 changes: 11 additions & 0 deletions app/static/js/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,17 @@ function setMode(mode) {
compressMode = 'quality';
}

// Drop-zone help text differs per mode: convert covers all source formats,
// compress is limited to JPG/PNG/WebP/TIFF + MP4/AVI/MOV/MKV/WebM. The
// server rejects mismatches anyway, but showing the right list up-front
// keeps users from uploading e.g. an MP3 only to see a 422.
const supConv = document.getElementById('supported-convert');
const supComp = document.getElementById('supported-compress');
if (supConv && supComp) {
supConv.classList.toggle('hidden', mode !== 'convert');
supComp.classList.toggle('hidden', mode !== 'compress');
}

updateConvertOptionsVisibility();
renderFileList();
updateQualityVisibility();
Expand Down
6 changes: 5 additions & 1 deletion app/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,15 @@ <h1 class="text-h-page text-ink">{{ _('Convert & Compress Files') }}</h1>
</svg>
<p class="text-gray-400">{{ _('Drag & drop your files here') }}</p>
<p class="text-xs text-gray-600">{{ _('or click to browse (multi-file supported)') }}</p>
<p class="text-xs text-gray-700 mt-2">
<p id="supported-convert" class="text-xs text-gray-700 mt-2">
{{ _('Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF') }}<br>
{{ _('DOCX · PDF · TXT · MD · XLSX · CSV · JSON') }}<br>
{{ _('MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A') }}
</p>
<p id="supported-compress" class="hidden text-xs text-gray-700 mt-2">
{{ _('Supported: JPG · PNG · WebP · TIFF') }}<br>
{{ _('MP4 · MOV · AVI · MKV · WebM') }}
</p>
</div>

<div id="drop-selected" class="hidden space-y-2">
Expand Down
26 changes: 17 additions & 9 deletions docs/formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ Re-encode an image at a lower quality to reduce file size without changing forma

| From | To | Notes |
|------|-----|-------|
| **DOCX** | PDF | Requires Microsoft Word on Windows, or LibreOffice on Linux. |
| **DOCX** | PDF | Best-effort: tables, images, hyperlinks and basic styles preserved. Footnotes, headers/footers and embedded OLE objects are simplified. |
| **DOCX** | TXT | Extracts plain text from all paragraphs. Formatting (bold, tables) is lost. |
| **TXT** | PDF | Creates a clean PDF with Helvetica font, A4 page size. |
| **PDF** | TXT | Extracts text from each page using PyPDF. Complex layouts (columns, forms) may not extract cleanly. |
Expand All @@ -55,16 +55,24 @@ Re-encode an image at a lower quality to reduce file size without changing forma

### Notes on DOCX → PDF

**Windows**: Uses `docx2pdf` which interfaces with Microsoft Word via COM. Word must be installed.
The pipeline runs in pure Python: `mammoth` extracts the DOCX body as HTML
(with images inlined as `data:` URIs), then `WeasyPrint` renders the HTML
to PDF. No external binary, no Microsoft Word, no LibreOffice required —
works the same on Linux, macOS, Windows and inside the standard container.

**Linux**: Requires LibreOffice:
```bash
sudo apt install libreoffice
pip install docx2pdf
```
`docx2pdf` on Linux uses LibreOffice in headless mode.
**What is preserved**: paragraphs, basic character formatting (bold, italic),
tables (with cell borders), inline images, hyperlinks, and standard list
styles.

**Alternative** (any platform): Export manually from Microsoft Word or LibreOffice.
**What is simplified**: footnotes and endnotes, headers and footers, page
breaks, embedded OLE objects (Excel charts, Visio diagrams), and DOCX-native
style hierarchies. Whenever mammoth reports a warning during conversion, the
resulting PDF includes a small notice banner at the top.

**Security**: The HTML pipeline runs WeasyPrint with `_deny_url_fetcher`,
blocking any external resource load that a malformed DOCX might attempt. See
`tests/test_convert_document.py::test_docx_to_pdf_ssrf_blocked` for the
regression guard.

---

Expand Down
8 changes: 0 additions & 8 deletions docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -289,14 +289,6 @@ pip install pillow-heif

On Linux, also install: `sudo apt install libheif-dev`

### "DOCX to PDF conversion failed" (Linux)

On Linux, DOCX → PDF requires LibreOffice:

```bash
sudo apt install libreoffice
```

### Permission denied on `data/api_keys.json` (Linux)

```bash
Expand Down
Binary file modified locale/de/LC_MESSAGES/messages.mo
Binary file not shown.
8 changes: 7 additions & 1 deletion locale/de/LC_MESSAGES/messages.po
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ msgid ""
msgstr ""
"Project-Id-Version: FileMorph VERSION\n"
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
"POT-Creation-Date: 2026-05-07 15:22+0200\n"
"POT-Creation-Date: 2026-05-08 15:52+0200\n"
"PO-Revision-Date: 2026-05-08 11:00+0200\n"
"Last-Translator: FileMorph <hallo@filemorph.io>\n"
"Language: de\n"
Expand Down Expand Up @@ -347,6 +347,9 @@ msgstr "Nur JPEG / WebP."
msgid "Language"
msgstr "Sprache"

msgid "MP4 · MOV · AVI · MKV · WebM"
msgstr "MP4 · MOV · AVI · MKV · WebM"

msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
msgstr "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"

Expand Down Expand Up @@ -504,6 +507,9 @@ msgstr ""
msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
msgstr "Unterstützt: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"

msgid "Supported: JPG · PNG · WebP · TIFF"
msgstr "Unterstützt: JPG · PNG · WebP · TIFF"

msgid "Switch to English"
msgstr "Zu Englisch wechseln"

Expand Down
Binary file modified locale/en/LC_MESSAGES/messages.mo
Binary file not shown.
8 changes: 7 additions & 1 deletion locale/en/LC_MESSAGES/messages.po
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ msgid ""
msgstr ""
"Project-Id-Version: FileMorph VERSION\n"
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
"POT-Creation-Date: 2026-05-07 15:22+0200\n"
"POT-Creation-Date: 2026-05-08 15:52+0200\n"
"PO-Revision-Date: 2026-05-07 13:43+0200\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
Expand Down Expand Up @@ -323,6 +323,9 @@ msgstr ""
msgid "Language"
msgstr ""

msgid "MP4 · MOV · AVI · MKV · WebM"
msgstr ""

msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
msgstr ""

Expand Down Expand Up @@ -468,6 +471,9 @@ msgstr ""
msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
msgstr ""

msgid "Supported: JPG · PNG · WebP · TIFF"
msgstr ""

msgid "Switch to English"
msgstr ""

Expand Down
8 changes: 7 additions & 1 deletion locale/messages.pot
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: FileMorph VERSION\n"
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
"POT-Creation-Date: 2026-05-07 15:22+0200\n"
"POT-Creation-Date: 2026-05-08 15:52+0200\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
Expand Down Expand Up @@ -322,6 +322,9 @@ msgstr ""
msgid "Language"
msgstr ""

msgid "MP4 · MOV · AVI · MKV · WebM"
msgstr ""

msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
msgstr ""

Expand Down Expand Up @@ -467,6 +470,9 @@ msgstr ""
msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
msgstr ""

msgid "Supported: JPG · PNG · WebP · TIFF"
msgstr ""

msgid "Switch to English"
msgstr ""

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ pydantic-settings>=2.2.0
Pillow>=10.3.0
pillow-heif>=0.15.0
python-docx>=1.1.0
mammoth>=1.6
pypdf>=4.1.0
# pikepdf provides the qpdf-backed PDF manipulation FileMorph uses
# for PDF/A-2b output (NEU-C.1.a). Wheels bundle qpdf; no system
Expand Down
148 changes: 148 additions & 0 deletions tests/test_convert_document.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,30 @@
from pathlib import Path

import pytest


def _weasyprint_works() -> bool:
"""WeasyPrint depends on native GTK/Pango libs; on Windows dev hosts
these typically aren't installed. Linux CI and the Hetzner Dockerfile
do install them, so the DOCX → PDF tests run there. We probe by
actually trying to render — `import weasyprint` succeeds even when
the native libs are missing; the failure surfaces on first use."""
try:
import weasyprint

weasyprint.HTML(string="<p>probe</p>").write_pdf()
return True
except Exception:
return False


# Probe once at import time; the resulting marker skips a test when the probe
# failed, and is applied as a decorator to the WeasyPrint-dependent tests.
_WEASYPRINT_OK = _weasyprint_works()
_skip_no_weasyprint = pytest.mark.skipif(
not _WEASYPRINT_OK,
reason="WeasyPrint native deps (libgobject/pango) unavailable on this host",
)


def test_txt_to_pdf(client, auth_headers, sample_txt):
with sample_txt.open("rb") as f:
res = client.post(
Expand All @@ -9,3 +36,124 @@ def test_txt_to_pdf(client, auth_headers, sample_txt):
assert res.status_code == 200
# PDF files start with %PDF
assert res.content[:4] == b"%PDF"


# ── DOCX → PDF (mammoth + WeasyPrint) ────────────────────────────────────────
# The previous DOCX → PDF converter imported `docx2pdf`, which was not in
# requirements.txt and crashed at runtime on every container deployment.
# These tests pin the new mammoth + WeasyPrint pipeline and guard against a
# regression to that broken state, plus verify SSRF protection.

_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


def _make_docx(path: Path, paragraphs: list[str]) -> Path:
    """Save a DOCX at *path* with one paragraph per entry and return *path*."""
    from docx import Document

    document = Document()
    for text in paragraphs:
        document.add_paragraph(text)
    document.save(str(path))
    return path


def _make_docx_with_table(path: Path) -> Path:
    """Save a DOCX at *path* holding one paragraph and a 2x2 table; return *path*."""
    from docx import Document

    document = Document()
    document.add_paragraph("Report header.")
    grid = document.add_table(rows=2, cols=2)
    # Row-major cell contents: a header row followed by one data row.
    contents = (("Region", "Sales"), ("EU", "12345"))
    for row_idx, row_values in enumerate(contents):
        for col_idx, value in enumerate(row_values):
            grid.cell(row_idx, col_idx).text = value
    document.save(str(path))
    return path


@pytest.fixture
def sample_docx(tmp_path) -> Path:
    """Provide a two-paragraph DOCX written under pytest's ``tmp_path``."""
    target = tmp_path / "sample.docx"
    return _make_docx(target, ["Hello FileMorph!", "Second paragraph."])


@pytest.fixture
def sample_docx_with_table(tmp_path) -> Path:
    """Provide a DOCX containing a 2x2 table, written under pytest's ``tmp_path``."""
    target = tmp_path / "with_table.docx"
    return _make_docx_with_table(target)


@_skip_no_weasyprint
def test_docx_to_pdf_happy_path(client, auth_headers, sample_docx):
    """A plain two-paragraph DOCX converts to a non-trivial PDF."""
    with sample_docx.open("rb") as fh:
        upload = {"file": ("sample.docx", fh, _DOCX_MIME)}
        form = {"target_format": "pdf"}
        response = client.post(
            "/api/v1/convert",
            headers=auth_headers,
            files=upload,
            data=form,
        )
    assert response.status_code == 200, response.text
    body = response.content
    assert body[:5] == b"%PDF-", "output is not a PDF"
    assert len(body) > 1024, "PDF unexpectedly small"


@_skip_no_weasyprint
def test_docx_to_pdf_with_table(client, auth_headers, sample_docx_with_table):
    """Table cell text from the DOCX must be extractable from the resulting PDF."""
    from io import BytesIO

    from pypdf import PdfReader

    with sample_docx_with_table.open("rb") as fh:
        upload = {"file": ("with_table.docx", fh, _DOCX_MIME)}
        form = {"target_format": "pdf"}
        response = client.post(
            "/api/v1/convert",
            headers=auth_headers,
            files=upload,
            data=form,
        )
    assert response.status_code == 200, response.text
    assert response.content[:5] == b"%PDF-"

    # Pages without extractable text contribute an empty string.
    pdf = PdfReader(BytesIO(response.content))
    page_texts = [(page.extract_text() or "") for page in pdf.pages]
    extracted = "\n".join(page_texts)
    assert "Region" in extracted, "table header missing in PDF"
    assert "EU" in extracted, "table cell missing in PDF"


@_skip_no_weasyprint
def test_docx_to_pdf_ssrf_blocked(client, auth_headers, sample_docx, monkeypatch):
    """The DOCX → PDF pipeline must not make outbound network calls.

    mammoth inlines images as data: URIs and WeasyPrint runs with
    `_deny_url_fetcher`. Even a benign-looking DOCX must convert without
    touching the network — guards against a future refactor that drops
    `url_fetcher=`.

    NOTE(review): the fixture holds only two plain paragraphs, so this
    presumably never reaches the URL fetcher at all — confirm whether a
    DOCX with an external reference is needed to exercise the guard.
    """
    import socket

    def _refuse_connect(self, addr, *args, **kwargs):
        raise AssertionError(f"unexpected outbound network call to {addr!r}")

    # Any socket connect attempted during the request now fails loudly.
    monkeypatch.setattr(socket.socket, "connect", _refuse_connect)

    with sample_docx.open("rb") as fh:
        upload = {"file": ("sample.docx", fh, _DOCX_MIME)}
        form = {"target_format": "pdf"}
        response = client.post(
            "/api/v1/convert",
            headers=auth_headers,
            files=upload,
            data=form,
        )
    assert response.status_code == 200, response.text
    assert response.content[:5] == b"%PDF-"


def test_docx_to_txt_unchanged(client, auth_headers, sample_docx):
    """DOCX → TXT uses a separate converter (python-docx) and must keep working."""
    with sample_docx.open("rb") as fh:
        upload = {"file": ("sample.docx", fh, _DOCX_MIME)}
        form = {"target_format": "txt"}
        response = client.post(
            "/api/v1/convert",
            headers=auth_headers,
            files=upload,
            data=form,
        )
    assert response.status_code == 200, response.text
    text = response.content.decode("utf-8")
    # Both paragraphs written by the fixture must appear in the output.
    for expected in ("Hello FileMorph!", "Second paragraph."):
        assert expected in text
Loading
Loading