Skip to content

Commit e54f726

Browse files
authored
Merge pull request #23 from MrChengLen/pr-fix-docx-pdf-and-drop-zone-modes
PR: fix DOCX → PDF conversion and drop-zone modes
2 parents 11730bb + 8595fe6 commit e54f726

13 files changed

Lines changed: 275 additions & 24 deletions

File tree

app/converters/document.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,45 @@ def _deny_url_fetcher(url: str, **kwargs) -> None:
1010

1111

1212
# ---------------------------------------------------------------------------
13-
# DOCX → PDF
13+
# DOCX → PDF (mammoth → HTML → WeasyPrint → PDF)
1414
# ---------------------------------------------------------------------------
15+
# Pure-Python pipeline: mammoth extracts the DOCX body as HTML with images
16+
# inlined as data: URIs, WeasyPrint renders it to PDF. SSRF-safe — the same
17+
# _deny_url_fetcher used by md→pdf blocks any external URL the converted
18+
# HTML might reference. Best-effort: footnotes, headers/footers and embedded
19+
# OLE objects get simplified by mammoth; tables, images, hyperlinks and
20+
# basic paragraph styles survive intact.
1521
@register(("docx", "pdf"))
1622
class DocxToPdfConverter(BaseConverter):
1723
def convert(self, input_path: Path, output_path: Path, **kwargs) -> Path:
18-
from docx2pdf import convert # type: ignore
24+
import mammoth
25+
import weasyprint
26+
27+
with open(input_path, "rb") as f:
28+
result = mammoth.convert_to_html(f)
29+
html_body = result.value
30+
had_warnings = any(m.type == "warning" for m in result.messages)
31+
32+
notice = (
33+
(
34+
'<div style="background:#fff3cd;border:1px solid #ffe39a;'
35+
'padding:8px;font-size:10pt;margin-bottom:1em">'
36+
"Some elements were simplified during conversion "
37+
"(footnotes, headers/footers, or embedded objects)."
38+
"</div>"
39+
)
40+
if had_warnings
41+
else ""
42+
)
1943

20-
convert(str(input_path), str(output_path))
44+
full_html = (
45+
"<!DOCTYPE html><html><head>"
46+
"<style>body{font-family:sans-serif;margin:2cm;line-height:1.6}"
47+
"table{border-collapse:collapse}"
48+
"td,th{border:1px solid #999;padding:4pt}</style>"
49+
f"</head><body>{notice}{html_body}</body></html>"
50+
)
51+
weasyprint.HTML(string=full_html, url_fetcher=_deny_url_fetcher).write_pdf(str(output_path))
2152
return output_path
2253

2354

app/static/js/app.js

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,17 @@ function setMode(mode) {
120120
compressMode = 'quality';
121121
}
122122

123+
// Drop-zone help text differs per mode: convert covers all source formats,
124+
// compress is limited to JPG/PNG/WebP/TIFF + MP4/AVI/MOV/MKV/WebM. The
125+
// server rejects mismatches anyway, but showing the right list up-front
126+
// keeps users from uploading e.g. an MP3 only to see a 422.
127+
const supConv = document.getElementById('supported-convert');
128+
const supComp = document.getElementById('supported-compress');
129+
if (supConv && supComp) {
130+
supConv.classList.toggle('hidden', mode !== 'convert');
131+
supComp.classList.toggle('hidden', mode !== 'compress');
132+
}
133+
123134
updateConvertOptionsVisibility();
124135
renderFileList();
125136
updateQualityVisibility();

app/templates/index.html

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,15 @@ <h1 class="text-h-page text-ink">{{ _('Convert & Compress Files') }}</h1>
4747
</svg>
4848
<p class="text-gray-400">{{ _('Drag & drop your files here') }}</p>
4949
<p class="text-xs text-gray-600">{{ _('or click to browse (multi-file supported)') }}</p>
50-
<p class="text-xs text-gray-700 mt-2">
50+
<p id="supported-convert" class="text-xs text-gray-700 mt-2">
5151
{{ _('Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF') }}<br>
5252
{{ _('DOCX · PDF · TXT · MD · XLSX · CSV · JSON') }}<br>
5353
{{ _('MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A') }}
5454
</p>
55+
<p id="supported-compress" class="hidden text-xs text-gray-700 mt-2">
56+
{{ _('Supported: JPG · PNG · WebP · TIFF') }}<br>
57+
{{ _('MP4 · MOV · AVI · MKV · WebM') }}
58+
</p>
5559
</div>
5660

5761
<div id="drop-selected" class="hidden space-y-2">

docs/formats.md

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ Re-encode an image at a lower quality to reduce file size without changing forma
4646

4747
| From | To | Notes |
4848
|------|-----|-------|
49-
| **DOCX** | PDF | Requires Microsoft Word on Windows, or LibreOffice on Linux. |
49+
| **DOCX** | PDF | Best-effort: tables, images, hyperlinks and basic styles preserved. Footnotes, headers/footers and embedded OLE objects are simplified. |
5050
| **DOCX** | TXT | Extracts plain text from all paragraphs. Formatting (bold, tables) is lost. |
5151
| **TXT** | PDF | Creates a clean PDF with Helvetica font, A4 page size. |
5252
| **PDF** | TXT | Extracts text from each page using PyPDF. Complex layouts (columns, forms) may not extract cleanly. |
@@ -55,16 +55,24 @@ Re-encode an image at a lower quality to reduce file size without changing forma
5555

5656
### Notes on DOCX → PDF
5757

58-
**Windows**: Uses `docx2pdf` which interfaces with Microsoft Word via COM. Word must be installed.
58+
The pipeline runs in pure Python: `mammoth` extracts the DOCX body as HTML
59+
(with images inlined as `data:` URIs), then `WeasyPrint` renders the HTML
60+
to PDF. No external binary, no Microsoft Word, no LibreOffice required —
61+
works the same on Linux, macOS, Windows and inside the standard container.
5962

60-
**Linux**: Requires LibreOffice:
61-
```bash
62-
sudo apt install libreoffice
63-
pip install docx2pdf
64-
```
65-
`docx2pdf` on Linux uses LibreOffice in headless mode.
63+
**What is preserved**: paragraphs, basic character formatting (bold, italic),
64+
tables (with cell borders), inline images, hyperlinks, and standard list
65+
styles.
6666

67-
**Alternative** (any platform): Export manually from Microsoft Word or LibreOffice.
67+
**What is simplified**: footnotes and endnotes, headers and footers, page
68+
breaks, embedded OLE objects (Excel charts, Visio diagrams), and DOCX-native
69+
style hierarchies. When mammoth reports a warning for any of these, the resulting PDF
70+
includes a small notice banner at the top.
71+
72+
**Security**: The HTML pipeline runs WeasyPrint with `_deny_url_fetcher`,
73+
blocking any external resource load that a malformed or malicious DOCX might attempt. See
74+
`tests/test_convert_document.py::test_docx_to_pdf_ssrf_blocked` for the
75+
regression guard.
6876

6977
---
7078

docs/installation.md

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -289,14 +289,6 @@ pip install pillow-heif
289289

290290
On Linux, also install: `sudo apt install libheif-dev`
291291

292-
### "DOCX to PDF conversion failed" (Linux)
293-
294-
On Linux, DOCX → PDF requires LibreOffice:
295-
296-
```bash
297-
sudo apt install libreoffice
298-
```
299-
300292
### Permission denied on `data/api_keys.json` (Linux)
301293

302294
```bash

locale/de/LC_MESSAGES/messages.mo

177 Bytes
Binary file not shown.

locale/de/LC_MESSAGES/messages.po

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ msgid ""
77
msgstr ""
88
"Project-Id-Version: FileMorph VERSION\n"
99
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
10-
"POT-Creation-Date: 2026-05-07 15:22+0200\n"
10+
"POT-Creation-Date: 2026-05-08 15:52+0200\n"
1111
"PO-Revision-Date: 2026-05-08 11:00+0200\n"
1212
"Last-Translator: FileMorph <hallo@filemorph.io>\n"
1313
"Language: de\n"
@@ -347,6 +347,9 @@ msgstr "Nur JPEG / WebP."
347347
msgid "Language"
348348
msgstr "Sprache"
349349

350+
msgid "MP4 · MOV · AVI · MKV · WebM"
351+
msgstr "MP4 · MOV · AVI · MKV · WebM"
352+
350353
msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
351354
msgstr "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
352355

@@ -504,6 +507,9 @@ msgstr ""
504507
msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
505508
msgstr "Unterstützt: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
506509

510+
msgid "Supported: JPG · PNG · WebP · TIFF"
511+
msgstr "Unterstützt: JPG · PNG · WebP · TIFF"
512+
507513
msgid "Switch to English"
508514
msgstr "Zu Englisch wechseln"
509515

locale/en/LC_MESSAGES/messages.mo

0 Bytes
Binary file not shown.

locale/en/LC_MESSAGES/messages.po

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ msgid ""
77
msgstr ""
88
"Project-Id-Version: FileMorph VERSION\n"
99
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
10-
"POT-Creation-Date: 2026-05-07 15:22+0200\n"
10+
"POT-Creation-Date: 2026-05-08 15:52+0200\n"
1111
"PO-Revision-Date: 2026-05-07 13:43+0200\n"
1212
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
1313
"Language: en\n"
@@ -323,6 +323,9 @@ msgstr ""
323323
msgid "Language"
324324
msgstr ""
325325

326+
msgid "MP4 · MOV · AVI · MKV · WebM"
327+
msgstr ""
328+
326329
msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
327330
msgstr ""
328331

@@ -468,6 +471,9 @@ msgstr ""
468471
msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
469472
msgstr ""
470473

474+
msgid "Supported: JPG · PNG · WebP · TIFF"
475+
msgstr ""
476+
471477
msgid "Switch to English"
472478
msgstr ""
473479

locale/messages.pot

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ msgid ""
88
msgstr ""
99
"Project-Id-Version: FileMorph VERSION\n"
1010
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
11-
"POT-Creation-Date: 2026-05-07 15:22+0200\n"
11+
"POT-Creation-Date: 2026-05-08 15:52+0200\n"
1212
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
1313
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
1414
"Language-Team: LANGUAGE <LL@li.org>\n"
@@ -322,6 +322,9 @@ msgstr ""
322322
msgid "Language"
323323
msgstr ""
324324

325+
msgid "MP4 · MOV · AVI · MKV · WebM"
326+
msgstr ""
327+
325328
msgid "MP4 · MOV · AVI · MKV · WebM · MP3 · WAV · FLAC · OGG · M4A"
326329
msgstr ""
327330

@@ -467,6 +470,9 @@ msgstr ""
467470
msgid "Supported: HEIC · JPG · PNG · WebP · BMP · TIFF · GIF"
468471
msgstr ""
469472

473+
msgid "Supported: JPG · PNG · WebP · TIFF"
474+
msgstr ""
475+
470476
msgid "Switch to English"
471477
msgstr ""
472478

0 commit comments

Comments
 (0)