MrChengLen · MrChengLen · May 8, 2026 · May 8, 2026 · May 8, 2026 · May 8, 2026
diff --git a/app/converters/document.py b/app/converters/document.py
@@ -10,14 +10,45 @@ def _deny_url_fetcher(url: str, **kwargs) -> None:
 
 
 # ---------------------------------------------------------------------------
-# DOCX → PDF
+# DOCX → PDF (mammoth → HTML → WeasyPrint → PDF)
 # ---------------------------------------------------------------------------
+# Pure-Python pipeline: mammoth extracts the DOCX body as HTML with images
+# inlined as data: URIs, WeasyPrint renders it to PDF. SSRF-safe — the same
+# _deny_url_fetcher used by md→pdf blocks any external URL the converted
+# HTML might reference. Best-effort: footnotes, headers/footers and embedded
+# OLE objects get simplified by mammoth; tables, images, hyperlinks and
+# basic paragraph styles survive intact.
 @register(("docx", "pdf"))
 class DocxToPdfConverter(BaseConverter):
     def convert(self, input_path: Path, output_path: Path, **kwargs) -> Path:
-        from docx2pdf import convert  # type: ignore
+        import mammoth
+        import weasyprint
+
+        with open(input_path, "rb") as f:
+            result = mammoth.convert_to_html(f)
+        html_body = result.value
+        had_warnings = any(m.type == "warning" for m in result.messages)
+
+        notice = (
+            (
+                '<div style="background:#fff3cd;border:1px solid #ffe39a;'
+                'padding:8px;font-size:10pt;margin-bottom:1em">'
+                "Some elements were simplified during conversion "
+                "(footnotes, headers/footers, or embedded objects)."
+                "</div>"
+            )
+            if had_warnings
+            else ""
+        )
 
-        convert(str(input_path), str(output_path))
+        full_html = (
+            "<!DOCTYPE html><html><head>"
+            "<style>body{font-family:sans-serif;margin:2cm;line-height:1.6}"
+            "table{border-collapse:collapse}"
+            "td,th{border:1px solid #999;padding:4pt}</style>"
+            f"</head><body>{notice}{html_body}</body></html>"
+        )
+        weasyprint.HTML(string=full_html, url_fetcher=_deny_url_fetcher).write_pdf(str(output_path))
         return output_path
 
 

diff --git a/app/static/js/app.js b/app/static/js/app.js
@@ -120,6 +120,17 @@ function setMode(mode) {
     compressMode = 'quality';
   }
 
+  // Drop-zone help text differs per mode: convert covers all source formats,
+  // compress is limited to JPG/PNG/WebP/TIFF + MP4/AVI/MOV/MKV/WebM. The
+  // server rejects mismatches anyway, but showing the right list up-front
+  // keeps users from uploading e.g. an MP3 only to see a 422.
+  const supConv = document.getElementById('supported-convert');
+  const supComp = document.getElementById('supported-compress');
+  if (supConv && supComp) {
+    supConv.classList.toggle('hidden', mode !== 'convert');
+    supComp.classList.toggle('hidden', mode !== 'compress');
+  }
+
   updateConvertOptionsVisibility();
   renderFileList();
   updateQualityVisibility();

diff --git a/app/templates/index.html b/app/templates/index.html
@@ -47,11 +47,15 @@ <h1 class="text-h-page text-ink">{{ _('Convert & Compress Files') }}</h1>
             </svg>
             <p class="text-gray-400">{{ _('Drag & drop your files here') }}</p>
             <p class="text-xs text-gray-600">{{ _('or click to browse (multi-file supported)') }}</p>
-            <p class="text-xs text-gray-700 mt-2">
+            <p id="supported-convert" class="text-xs text-gray-700 mt-2">
               {{ _('Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF') }}<br>
               {{ _('DOCX · PDF · TXT · MD · XLSX · CSV · JSON') }}<br>
               {{ _('MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A') }}
             </p>
+            <p id="supported-compress" class="hidden text-xs text-gray-700 mt-2">
+              {{ _('Supported: JPG · PNG · WebP · TIFF') }}<br>
+              {{ _('MP4 · MOV · AVI · MKV · WebM') }}
+            </p>
           </div>
 
           <div id="drop-selected" class="hidden space-y-2">

diff --git a/docs/formats.md b/docs/formats.md
@@ -46,7 +46,7 @@ Re-encode an image at a lower quality to reduce file size without changing forma
 
 | From | To | Notes |
 |------|-----|-------|
-| **DOCX** | PDF | Requires Microsoft Word on Windows, or LibreOffice on Linux. |
+| **DOCX** | PDF | Best-effort: tables, images, hyperlinks and basic styles preserved. Footnotes, headers/footers and embedded OLE objects are simplified. |
 | **DOCX** | TXT | Extracts plain text from all paragraphs. Formatting (bold, tables) is lost. |
 | **TXT** | PDF | Creates a clean PDF with Helvetica font, A4 page size. |
 | **PDF** | TXT | Extracts text from each page using PyPDF. Complex layouts (columns, forms) may not extract cleanly. |
@@ -55,16 +55,24 @@ Re-encode an image at a lower quality to reduce file size without changing forma
 
 ### Notes on DOCX → PDF
 
-**Windows**: Uses `docx2pdf` which interfaces with Microsoft Word via COM. Word must be installed.
+The pipeline runs in-process: `mammoth` extracts the DOCX body as HTML
+(with images inlined as `data:` URIs), then `WeasyPrint` renders it to PDF.
+No Microsoft Word or LibreOffice is required. WeasyPrint does need its native
+Pango libraries: the container image installs them, Windows hosts often lack them.
 
-**Linux**: Requires LibreOffice:
-```bash
-sudo apt install libreoffice
-pip install docx2pdf
-```
-`docx2pdf` on Linux uses LibreOffice in headless mode.
+**What is preserved**: paragraphs, basic character formatting (bold, italic),
+tables (with cell borders), inline images, hyperlinks, and standard list
+styles.
 
-**Alternative** (any platform): Export manually from Microsoft Word or LibreOffice.
+**What is simplified**: footnotes and endnotes, headers and footers, page
+breaks, embedded OLE objects (Excel charts, Visio diagrams), and DOCX-native
+style hierarchies. The PDF starts with a small notice banner whenever
+`mammoth` reports any conversion warning, not only for the items above.
+
+**Security**: The HTML pipeline runs WeasyPrint with `_deny_url_fetcher`,
+blocking any external resource load that a malformed DOCX might attempt. See
+`tests/test_convert_document.py::test_docx_to_pdf_ssrf_blocked` for the
+regression guard.
 
 ---
 

diff --git a/docs/installation.md b/docs/installation.md
@@ -289,14 +289,6 @@ pip install pillow-heif
 
 On Linux, also install: `sudo apt install libheif-dev`
 
-### "DOCX to PDF conversion failed" (Linux)
-
-On Linux, DOCX → PDF requires LibreOffice:
-
-```bash
-sudo apt install libreoffice
-```
-
 ### Permission denied on `data/api_keys.json` (Linux)
 
 ```bash

diff --git a/locale/de/LC_MESSAGES/messages.mo b/locale/de/LC_MESSAGES/messages.mo
diff --git a/locale/de/LC_MESSAGES/messages.po b/locale/de/LC_MESSAGES/messages.po
@@ -7,7 +7,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: FileMorph VERSION\n"
 "Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
-"POT-Creation-Date: 2026-05-07 15:22+0200\n"
+"POT-Creation-Date: 2026-05-08 15:52+0200\n"
 "PO-Revision-Date: 2026-05-08 11:00+0200\n"
 "Last-Translator: FileMorph <hallo@filemorph.io>\n"
 "Language: de\n"
@@ -347,6 +347,9 @@ msgstr "Nur JPEG / WebP."
 msgid "Language"
 msgstr "Sprache"
 
+msgid "MP4 · MOV · AVI · MKV · WebM"
+msgstr "MP4 · MOV · AVI · MKV · WebM"
+
 msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
 msgstr "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
 
@@ -504,6 +507,9 @@ msgstr ""
 msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
 msgstr "Unterstützt: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
 
+msgid "Supported: JPG · PNG · WebP · TIFF"
+msgstr "Unterstützt: JPG · PNG · WebP · TIFF"
+
 msgid "Switch to English"
 msgstr "Zu Englisch wechseln"
 

diff --git a/locale/en/LC_MESSAGES/messages.mo b/locale/en/LC_MESSAGES/messages.mo
diff --git a/locale/en/LC_MESSAGES/messages.po b/locale/en/LC_MESSAGES/messages.po
@@ -7,7 +7,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: FileMorph VERSION\n"
 "Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
-"POT-Creation-Date: 2026-05-07 15:22+0200\n"
+"POT-Creation-Date: 2026-05-08 15:52+0200\n"
 "PO-Revision-Date: 2026-05-07 13:43+0200\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: en\n"
@@ -323,6 +323,9 @@ msgstr ""
 msgid "Language"
 msgstr ""
 
+msgid "MP4 · MOV · AVI · MKV · WebM"
+msgstr ""
+
 msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
 msgstr ""
 
@@ -468,6 +471,9 @@ msgstr ""
 msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
 msgstr ""
 
+msgid "Supported: JPG · PNG · WebP · TIFF"
+msgstr ""
+
 msgid "Switch to English"
 msgstr ""
 

diff --git a/locale/messages.pot b/locale/messages.pot
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: FileMorph VERSION\n"
 "Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
-"POT-Creation-Date: 2026-05-07 15:22+0200\n"
+"POT-Creation-Date: 2026-05-08 15:52+0200\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language-Team: LANGUAGE <LL@li.org>\n"
@@ -322,6 +322,9 @@ msgstr ""
 msgid "Language"
 msgstr ""
 
+msgid "MP4 · MOV · AVI · MKV · WebM"
+msgstr ""
+
 msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
 msgstr ""
 
@@ -467,6 +470,9 @@ msgstr ""
 msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
 msgstr ""
 
+msgid "Supported: JPG · PNG · WebP · TIFF"
+msgstr ""
+
 msgid "Switch to English"
 msgstr ""
 

diff --git a/requirements.txt b/requirements.txt
@@ -7,6 +7,7 @@ pydantic-settings>=2.2.0
 Pillow>=10.3.0
 pillow-heif>=0.15.0
 python-docx>=1.1.0
+mammoth>=1.11.0  # older releases resolve externally linked DOCX images (local file read / SSRF)
 pypdf>=4.1.0
 # pikepdf provides the qpdf-backed PDF manipulation FileMorph uses
 # for PDF/A-2b output (NEU-C.1.a). Wheels bundle qpdf; no system

diff --git a/tests/test_convert_document.py b/tests/test_convert_document.py
@@ -1,3 +1,30 @@
+from pathlib import Path
+
+import pytest
+
+
+def _weasyprint_works() -> bool:
+    """WeasyPrint depends on native GTK/Pango libs; on Windows dev hosts
+    these typically aren't installed. Linux CI and the Hetzner Dockerfile
+    do install them, so the DOCX → PDF tests run there. We probe by
+    actually trying to render — `import weasyprint` succeeds even when
+    the native libs are missing; the failure surfaces on first use."""
+    try:
+        import weasyprint
+
+        weasyprint.HTML(string="<p>probe</p>").write_pdf()
+        return True
+    except Exception:
+        return False
+
+
+_WEASYPRINT_OK = _weasyprint_works()
+_skip_no_weasyprint = pytest.mark.skipif(
+    not _WEASYPRINT_OK,
+    reason="WeasyPrint native deps (libgobject/pango) unavailable on this host",
+)
+
+
 def test_txt_to_pdf(client, auth_headers, sample_txt):
     with sample_txt.open("rb") as f:
         res = client.post(
@@ -9,3 +36,124 @@ def test_txt_to_pdf(client, auth_headers, sample_txt):
     assert res.status_code == 200
     # PDF files start with %PDF
     assert res.content[:4] == b"%PDF"
+
+
+# ── DOCX → PDF (mammoth + WeasyPrint) ────────────────────────────────────────
+# The previous DOCX → PDF converter imported `docx2pdf`, which was not in
+# requirements.txt and crashed at runtime on every container deployment.
+# These tests pin the new mammoth + WeasyPrint pipeline and guard against a
+# regression to that broken state, plus verify SSRF protection.
+
+_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+
+
+def _make_docx(path: Path, paragraphs: list[str]) -> Path:
+    from docx import Document
+
+    doc = Document()
+    for p in paragraphs:
+        doc.add_paragraph(p)
+    doc.save(str(path))
+    return path
+
+
+def _make_docx_with_table(path: Path) -> Path:
+    from docx import Document
+
+    doc = Document()
+    doc.add_paragraph("Report header.")
+    table = doc.add_table(rows=2, cols=2)
+    table.cell(0, 0).text = "Region"
+    table.cell(0, 1).text = "Sales"
+    table.cell(1, 0).text = "EU"
+    table.cell(1, 1).text = "12345"
+    doc.save(str(path))
+    return path
+
+
+@pytest.fixture
+def sample_docx(tmp_path) -> Path:
+    return _make_docx(tmp_path / "sample.docx", ["Hello FileMorph!", "Second paragraph."])
+
+
+@pytest.fixture
+def sample_docx_with_table(tmp_path) -> Path:
+    return _make_docx_with_table(tmp_path / "with_table.docx")
+
+
+@_skip_no_weasyprint
+def test_docx_to_pdf_happy_path(client, auth_headers, sample_docx):
+    with sample_docx.open("rb") as f:
+        res = client.post(
+            "/api/v1/convert",
+            headers=auth_headers,
+            files={"file": ("sample.docx", f, _DOCX_MIME)},
+            data={"target_format": "pdf"},
+        )
+    assert res.status_code == 200, res.text
+    assert res.content[:5] == b"%PDF-", "output is not a PDF"
+    assert len(res.content) > 1024, "PDF unexpectedly small"
+
+
+@_skip_no_weasyprint
+def test_docx_to_pdf_with_table(client, auth_headers, sample_docx_with_table):
+    with sample_docx_with_table.open("rb") as f:
+        res = client.post(
+            "/api/v1/convert",
+            headers=auth_headers,
+            files={"file": ("with_table.docx", f, _DOCX_MIME)},
+            data={"target_format": "pdf"},
+        )
+    assert res.status_code == 200, res.text
+    assert res.content[:5] == b"%PDF-"
+
+    from io import BytesIO
+
+    from pypdf import PdfReader
+
+    reader = PdfReader(BytesIO(res.content))
+    extracted = "\n".join((p.extract_text() or "") for p in reader.pages)
+    assert "Region" in extracted, "table header missing in PDF"
+    assert "EU" in extracted, "table cell missing in PDF"
+
+
+@_skip_no_weasyprint
+def test_docx_to_pdf_ssrf_blocked(client, auth_headers, sample_docx, monkeypatch):
+    """The DOCX → PDF pipeline must not make outbound network calls.
+
+    Every `socket.connect` is patched to raise, so converting the plain
+    two-paragraph `sample_docx` must succeed without opening a connection.
+    NOTE(review): that fixture references no external resource, so this
+    would still pass if `url_fetcher=` were dropped — add a hostile DOCX.
+    """
+    import socket
+
+    def _block(self, addr, *args, **kwargs):
+        raise AssertionError(f"unexpected outbound network call to {addr!r}")
+
+    monkeypatch.setattr(socket.socket, "connect", _block)
+
+    with sample_docx.open("rb") as f:
+        res = client.post(
+            "/api/v1/convert",
+            headers=auth_headers,
+            files={"file": ("sample.docx", f, _DOCX_MIME)},
+            data={"target_format": "pdf"},
+        )
+    assert res.status_code == 200, res.text
+    assert res.content[:5] == b"%PDF-"
+
+
+def test_docx_to_txt_unchanged(client, auth_headers, sample_docx):
+    """DOCX → TXT uses a separate converter (python-docx) and must keep working."""
+    with sample_docx.open("rb") as f:
+        res = client.post(
+            "/api/v1/convert",
+            headers=auth_headers,
+            files={"file": ("sample.docx", f, _DOCX_MIME)},
+            data={"target_format": "txt"},
+        )
+    assert res.status_code == 200, res.text
+    text = res.content.decode("utf-8")
+    assert "Hello FileMorph!" in text
+    assert "Second paragraph." in text