diff --git a/app/converters/document.py b/app/converters/document.py
index ec5557e..3fbb1e3 100644
--- a/app/converters/document.py
+++ b/app/converters/document.py
@@ -10,14 +10,45 @@ def _deny_url_fetcher(url: str, **kwargs) -> None:
# ---------------------------------------------------------------------------
-# DOCX → PDF
+# DOCX → PDF (mammoth → HTML → WeasyPrint → PDF)
# ---------------------------------------------------------------------------
+# Pure-Python pipeline: mammoth extracts the DOCX body as HTML with images
+# inlined as data: URIs, WeasyPrint renders it to PDF. SSRF-safe — the same
+# _deny_url_fetcher used by md→pdf blocks any external URL the converted
+# HTML might reference. Best-effort: footnotes, headers/footers and embedded
+# OLE objects get simplified by mammoth; tables, images, hyperlinks and
+# basic paragraph styles survive intact.
@register(("docx", "pdf"))
class DocxToPdfConverter(BaseConverter):
def convert(self, input_path: Path, output_path: Path, **kwargs) -> Path:
- from docx2pdf import convert # type: ignore
+ import mammoth
+ import weasyprint
+
+ with open(input_path, "rb") as f:
+ result = mammoth.convert_to_html(f)
+ html_body = result.value
+ had_warnings = any(m.type == "warning" for m in result.messages)
+
+ notice = (
+ (
+ '
'
+ "Some elements were simplified during conversion "
+ "(footnotes, headers/footers, or embedded objects)."
+ "
"
+ )
+ if had_warnings
+ else ""
+ )
- convert(str(input_path), str(output_path))
+ full_html = (
+ ""
+ ""
+ f"{notice}{html_body}"
+ )
+ weasyprint.HTML(string=full_html, url_fetcher=_deny_url_fetcher).write_pdf(str(output_path))
return output_path
diff --git a/app/static/js/app.js b/app/static/js/app.js
index 6dd04a5..c68affa 100644
--- a/app/static/js/app.js
+++ b/app/static/js/app.js
@@ -120,6 +120,17 @@ function setMode(mode) {
compressMode = 'quality';
}
+ // Drop-zone help text differs per mode: convert covers all source formats,
+ // compress is limited to JPG/PNG/WebP/TIFF + MP4/AVI/MOV/MKV/WebM. The
+ // server rejects mismatches anyway, but showing the right list up-front
+ // keeps users from uploading e.g. an MP3 only to see a 422.
+ const supConv = document.getElementById('supported-convert');
+ const supComp = document.getElementById('supported-compress');
+ if (supConv && supComp) {
+ supConv.classList.toggle('hidden', mode !== 'convert');
+ supComp.classList.toggle('hidden', mode !== 'compress');
+ }
+
updateConvertOptionsVisibility();
renderFileList();
updateQualityVisibility();
diff --git a/app/templates/index.html b/app/templates/index.html
index 0eceaf9..e282e94 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -47,11 +47,15 @@ {{ _('Convert & Compress Files') }}
{{ _('Drag & drop your files here') }}
{{ _('or click to browse (multi-file supported)') }}
-
+
{{ _('Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF') }}
{{ _('DOCX · PDF · TXT · MD · XLSX · CSV · JSON') }}
{{ _('MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A') }}
+
+ {{ _('Supported: JPG · PNG · WebP · TIFF') }}
+ {{ _('MP4 · MOV · AVI · MKV · WebM') }}
+
diff --git a/docs/formats.md b/docs/formats.md
index 27d6f66..d1cb0ee 100644
--- a/docs/formats.md
+++ b/docs/formats.md
@@ -46,7 +46,7 @@ Re-encode an image at a lower quality to reduce file size without changing forma
| From | To | Notes |
|------|-----|-------|
-| **DOCX** | PDF | Requires Microsoft Word on Windows, or LibreOffice on Linux. |
+| **DOCX** | PDF | Best-effort: tables, images, hyperlinks and basic styles preserved. Footnotes, headers/footers and embedded OLE objects are simplified. |
| **DOCX** | TXT | Extracts plain text from all paragraphs. Formatting (bold, tables) is lost. |
| **TXT** | PDF | Creates a clean PDF with Helvetica font, A4 page size. |
| **PDF** | TXT | Extracts text from each page using PyPDF. Complex layouts (columns, forms) may not extract cleanly. |
@@ -55,16 +55,24 @@ Re-encode an image at a lower quality to reduce file size without changing forma
### Notes on DOCX → PDF
-**Windows**: Uses `docx2pdf` which interfaces with Microsoft Word via COM. Word must be installed.
+The pipeline runs in Python: `mammoth` extracts the DOCX body as HTML
+(with images inlined as `data:` URIs), then `WeasyPrint` renders the HTML
+to PDF. No Microsoft Word or LibreOffice is required, but WeasyPrint needs its
+native Pango libraries: installed in the container, often absent on Windows.
-**Linux**: Requires LibreOffice:
-```bash
-sudo apt install libreoffice
-pip install docx2pdf
-```
-`docx2pdf` on Linux uses LibreOffice in headless mode.
+**What is preserved**: paragraphs, basic character formatting (bold, italic),
+tables (with cell borders), inline images, hyperlinks, and standard list
+styles.
-**Alternative** (any platform): Export manually from Microsoft Word or LibreOffice.
+**What is simplified**: footnotes and endnotes, headers and footers, page
+breaks, embedded OLE objects (Excel charts, Visio diagrams), and DOCX-native
+style hierarchies. Whenever mammoth reports a conversion warning, the
+resulting PDF includes a small notice banner at the top.
+
+**Security**: The HTML pipeline runs WeasyPrint with `_deny_url_fetcher`,
+blocking any external resource load that a malformed DOCX might attempt. See
+`tests/test_convert_document.py::test_docx_to_pdf_ssrf_blocked` for the
+regression guard.
---
diff --git a/docs/installation.md b/docs/installation.md
index a69376c..adf6f8b 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -289,14 +289,6 @@ pip install pillow-heif
On Linux, also install: `sudo apt install libheif-dev`
-### "DOCX to PDF conversion failed" (Linux)
-
-On Linux, DOCX → PDF requires LibreOffice:
-
-```bash
-sudo apt install libreoffice
-```
-
### Permission denied on `data/api_keys.json` (Linux)
```bash
diff --git a/locale/de/LC_MESSAGES/messages.mo b/locale/de/LC_MESSAGES/messages.mo
index 8e38a1d..f078e24 100644
Binary files a/locale/de/LC_MESSAGES/messages.mo and b/locale/de/LC_MESSAGES/messages.mo differ
diff --git a/locale/de/LC_MESSAGES/messages.po b/locale/de/LC_MESSAGES/messages.po
index c0c96ab..1e060b4 100644
--- a/locale/de/LC_MESSAGES/messages.po
+++ b/locale/de/LC_MESSAGES/messages.po
@@ -7,7 +7,7 @@ msgid ""
msgstr ""
"Project-Id-Version: FileMorph VERSION\n"
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
-"POT-Creation-Date: 2026-05-07 15:22+0200\n"
+"POT-Creation-Date: 2026-05-08 15:52+0200\n"
"PO-Revision-Date: 2026-05-08 11:00+0200\n"
"Last-Translator: FileMorph
\n"
"Language: de\n"
@@ -347,6 +347,9 @@ msgstr "Nur JPEG / WebP."
msgid "Language"
msgstr "Sprache"
+msgid "MP4 · MOV · AVI · MKV · WebM"
+msgstr "MP4 · MOV · AVI · MKV · WebM"
+
msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
msgstr "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
@@ -504,6 +507,9 @@ msgstr ""
msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
msgstr "Unterstützt: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
+msgid "Supported: JPG · PNG · WebP · TIFF"
+msgstr "Unterstützt: JPG · PNG · WebP · TIFF"
+
msgid "Switch to English"
msgstr "Zu Englisch wechseln"
diff --git a/locale/en/LC_MESSAGES/messages.mo b/locale/en/LC_MESSAGES/messages.mo
index 3a0e395..de63a55 100644
Binary files a/locale/en/LC_MESSAGES/messages.mo and b/locale/en/LC_MESSAGES/messages.mo differ
diff --git a/locale/en/LC_MESSAGES/messages.po b/locale/en/LC_MESSAGES/messages.po
index 03f91d8..966ec23 100644
--- a/locale/en/LC_MESSAGES/messages.po
+++ b/locale/en/LC_MESSAGES/messages.po
@@ -7,7 +7,7 @@ msgid ""
msgstr ""
"Project-Id-Version: FileMorph VERSION\n"
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
-"POT-Creation-Date: 2026-05-07 15:22+0200\n"
+"POT-Creation-Date: 2026-05-08 15:52+0200\n"
"PO-Revision-Date: 2026-05-07 13:43+0200\n"
"Last-Translator: FULL NAME \n"
"Language: en\n"
@@ -323,6 +323,9 @@ msgstr ""
msgid "Language"
msgstr ""
+msgid "MP4 · MOV · AVI · MKV · WebM"
+msgstr ""
+
msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
msgstr ""
@@ -468,6 +471,9 @@ msgstr ""
msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
msgstr ""
+msgid "Supported: JPG · PNG · WebP · TIFF"
+msgstr ""
+
msgid "Switch to English"
msgstr ""
diff --git a/locale/messages.pot b/locale/messages.pot
index 0e5af18..8832bf3 100644
--- a/locale/messages.pot
+++ b/locale/messages.pot
@@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: FileMorph VERSION\n"
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
-"POT-Creation-Date: 2026-05-07 15:22+0200\n"
+"POT-Creation-Date: 2026-05-08 15:52+0200\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME \n"
"Language-Team: LANGUAGE \n"
@@ -322,6 +322,9 @@ msgstr ""
msgid "Language"
msgstr ""
+msgid "MP4 · MOV · AVI · MKV · WebM"
+msgstr ""
+
msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
msgstr ""
@@ -467,6 +470,9 @@ msgstr ""
msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
msgstr ""
+msgid "Supported: JPG · PNG · WebP · TIFF"
+msgstr ""
+
msgid "Switch to English"
msgstr ""
diff --git a/requirements.txt b/requirements.txt
index 524a8bc..9b25b0d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,7 @@ pydantic-settings>=2.2.0
Pillow>=10.3.0
pillow-heif>=0.15.0
python-docx>=1.1.0
+mammoth>=1.6
pypdf>=4.1.0
# pikepdf provides the qpdf-backed PDF manipulation FileMorph uses
# for PDF/A-2b output (NEU-C.1.a). Wheels bundle qpdf; no system
diff --git a/tests/test_convert_document.py b/tests/test_convert_document.py
index c7e4300..57abe68 100644
--- a/tests/test_convert_document.py
+++ b/tests/test_convert_document.py
@@ -1,3 +1,30 @@
+from pathlib import Path
+
+import pytest
+
+
+def _weasyprint_works() -> bool:
+ """WeasyPrint depends on native GTK/Pango libs; on Windows dev hosts
+ these typically aren't installed. Linux CI and the Hetzner Dockerfile
+ do install them, so the DOCX → PDF tests run there. We probe by
+ actually trying to render — `import weasyprint` succeeds even when
+ the native libs are missing; the failure surfaces on first use."""
+ try:
+ import weasyprint
+
+ weasyprint.HTML(string="probe
").write_pdf()
+ return True
+ except Exception:
+ return False
+
+
+_WEASYPRINT_OK = _weasyprint_works()
+_skip_no_weasyprint = pytest.mark.skipif(
+ not _WEASYPRINT_OK,
+ reason="WeasyPrint native deps (libgobject/pango) unavailable on this host",
+)
+
+
def test_txt_to_pdf(client, auth_headers, sample_txt):
with sample_txt.open("rb") as f:
res = client.post(
@@ -9,3 +36,124 @@ def test_txt_to_pdf(client, auth_headers, sample_txt):
assert res.status_code == 200
# PDF files start with %PDF
assert res.content[:4] == b"%PDF"
+
+
+# ── DOCX → PDF (mammoth + WeasyPrint) ────────────────────────────────────────
+# The previous DOCX → PDF converter imported `docx2pdf`, which was not in
+# requirements.txt and crashed at runtime on every container deployment.
+# These tests pin the new mammoth + WeasyPrint pipeline and guard against a
+# regression to that broken state, plus verify SSRF protection.
+
+_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+
+
+def _make_docx(path: Path, paragraphs: list[str]) -> Path:
+ from docx import Document
+
+ doc = Document()
+ for p in paragraphs:
+ doc.add_paragraph(p)
+ doc.save(str(path))
+ return path
+
+
+def _make_docx_with_table(path: Path) -> Path:
+ from docx import Document
+
+ doc = Document()
+ doc.add_paragraph("Report header.")
+ table = doc.add_table(rows=2, cols=2)
+ table.cell(0, 0).text = "Region"
+ table.cell(0, 1).text = "Sales"
+ table.cell(1, 0).text = "EU"
+ table.cell(1, 1).text = "12345"
+ doc.save(str(path))
+ return path
+
+
+@pytest.fixture
+def sample_docx(tmp_path) -> Path:
+ return _make_docx(tmp_path / "sample.docx", ["Hello FileMorph!", "Second paragraph."])
+
+
+@pytest.fixture
+def sample_docx_with_table(tmp_path) -> Path:
+ return _make_docx_with_table(tmp_path / "with_table.docx")
+
+
+@_skip_no_weasyprint
+def test_docx_to_pdf_happy_path(client, auth_headers, sample_docx):
+ with sample_docx.open("rb") as f:
+ res = client.post(
+ "/api/v1/convert",
+ headers=auth_headers,
+ files={"file": ("sample.docx", f, _DOCX_MIME)},
+ data={"target_format": "pdf"},
+ )
+ assert res.status_code == 200, res.text
+ assert res.content[:5] == b"%PDF-", "output is not a PDF"
+ assert len(res.content) > 1024, "PDF unexpectedly small"
+
+
+@_skip_no_weasyprint
+def test_docx_to_pdf_with_table(client, auth_headers, sample_docx_with_table):
+ with sample_docx_with_table.open("rb") as f:
+ res = client.post(
+ "/api/v1/convert",
+ headers=auth_headers,
+ files={"file": ("with_table.docx", f, _DOCX_MIME)},
+ data={"target_format": "pdf"},
+ )
+ assert res.status_code == 200, res.text
+ assert res.content[:5] == b"%PDF-"
+
+ from io import BytesIO
+
+ from pypdf import PdfReader
+
+ reader = PdfReader(BytesIO(res.content))
+ extracted = "\n".join((p.extract_text() or "") for p in reader.pages)
+ assert "Region" in extracted, "table header missing in PDF"
+ assert "EU" in extracted, "table cell missing in PDF"
+
+
+@_skip_no_weasyprint
+def test_docx_to_pdf_ssrf_blocked(client, auth_headers, sample_docx, monkeypatch):
+ """The DOCX → PDF pipeline must not make outbound network calls.
+
+ mammoth inlines images as data: URIs and WeasyPrint runs with
+ `_deny_url_fetcher`. Even a benign-looking DOCX must convert without
+ touching the network — guards against a future refactor that drops
+ `url_fetcher=`.
+ """
+ import socket
+
+ def _block(self, addr, *args, **kwargs):
+ raise AssertionError(f"unexpected outbound network call to {addr!r}")
+
+ monkeypatch.setattr(socket.socket, "connect", _block)
+
+ with sample_docx.open("rb") as f:
+ res = client.post(
+ "/api/v1/convert",
+ headers=auth_headers,
+ files={"file": ("sample.docx", f, _DOCX_MIME)},
+ data={"target_format": "pdf"},
+ )
+ assert res.status_code == 200, res.text
+ assert res.content[:5] == b"%PDF-"
+
+
+def test_docx_to_txt_unchanged(client, auth_headers, sample_docx):
+ """DOCX → TXT uses a separate converter (python-docx) and must keep working."""
+ with sample_docx.open("rb") as f:
+ res = client.post(
+ "/api/v1/convert",
+ headers=auth_headers,
+ files={"file": ("sample.docx", f, _DOCX_MIME)},
+ data={"target_format": "txt"},
+ )
+ assert res.status_code == 200, res.text
+ text = res.content.decode("utf-8")
+ assert "Hello FileMorph!" in text
+ assert "Second paragraph." in text
diff --git a/tests/test_homepage_drop_zone_modes.py b/tests/test_homepage_drop_zone_modes.py
new file mode 100644
index 0000000..feca698
--- /dev/null
+++ b/tests/test_homepage_drop_zone_modes.py
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Drop-zone help text must carry both mode-specific blocks.
+
+The convert and compress modes accept different input formats — convert
+covers all source formats including audio and documents, while compress
+only handles JPG/PNG/WebP/TIFF and MP4/AVI/MOV/MKV/WebM. The homepage
+renders both lists in the same page; `app/static/js/app.js::setMode()`
+hides the one for the inactive mode. If a future refactor drops one of the
+two elements, the user sees a stale or empty caption — this test fails first.
+"""
+
+from __future__ import annotations
+
+
+def test_homepage_carries_both_supported_lists(client):
+ res = client.get("/")
+ assert res.status_code == 200
+ html = res.text
+ assert 'id="supported-convert"' in html, "convert-mode caption missing"
+ assert 'id="supported-compress"' in html, "compress-mode caption missing"
+
+
+def test_compress_caption_lists_only_image_and_video_formats(client):
+ """Compress mode must NOT advertise audio, document, or spreadsheet formats —
+ those have no compressor. Adding them back would mislead the user."""
+ res = client.get("/")
+ assert res.status_code == 200
+ html = res.text
+
+ start = html.find('id="supported-compress"')
+ assert start != -1
+ end = html.find("", start)
+ block = html[start:end]
+
+ for forbidden in ["MP3", "WAV", "FLAC", "OGG", "M4A", "DOCX", "PDF", "XLSX", "CSV"]:
+ assert forbidden not in block, (
+ f"compress-mode caption advertises {forbidden} but no compressor exists for it"
+ )