Skip to content

Commit 3741456

Browse files
committed
unoconv is deprecated, use unoserver (Fixes #332) (#353)
1 parent 8d59925 commit 3741456

9 files changed

Lines changed: 181 additions & 79 deletions

File tree

.github/workflows/python-tests.yml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,11 @@ jobs:
2727
--health-timeout 5s
2828
--health-retries 5
2929
ports:
30-
# Maps tcp port 5432 on service container to the host
3130
- 5432:5432
31+
unoserver:
32+
image: ghcr.io/dochub-ulb/unoserver:latest
33+
ports:
34+
- 2003:2003
3235

3336

3437
steps:
@@ -46,7 +49,10 @@ jobs:
4649
run: sudo apt update
4750

4851
- name: Install apt packages
49-
run: sudo apt install -y graphicsmagick mupdf-tools ghostscript unoconv python3-dev
52+
run: sudo apt install -y graphicsmagick mupdf-tools ghostscript pipx python3-dev
53+
54+
- name: Install unoserver
55+
run: pipx install unoserver --system-site-packages
5056

5157
- name: Install the project
5258
run: uv sync --dev
@@ -64,10 +70,10 @@ jobs:
6470
run: uv run ./manage.py collectstatic --noinput -v 0
6571

6672
- name: pytest with SQLite
67-
run: uv run pytest -k "not unoconv"
73+
run: uv run pytest
6874

6975
- name: pytest with PostgreSQL
70-
run: uv run pytest -k "not unoconv" -m postgresql
76+
run: uv run pytest -m postgresql
7177
env:
7278
DB_URL: postgres://postgres:postgres@localhost:5432/postgres
7379

README.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,17 @@ First, install uv and system dependencies:
3232
curl -LsSf https://astral.sh/uv/install.sh | sh
3333

3434
# Ubuntu
35-
sudo apt-get install unoconv python3-dev ruby libtiff5-dev libjpeg-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python-tk mupdf-tools redis-server
35+
sudo apt-get install libreoffice pipx python3-dev ruby libtiff5-dev libjpeg-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python-tk mupdf-tools redis-server
36+
# unoserver needs access to LibreOffice's 'uno' library from system packages
37+
pipx install unoserver --system-site-packages
3638
sudo systemctl enable --now redis-server
3739
# Fedora
38-
sudo dnf install unoconv python-devel ruby mupdf redis
40+
sudo dnf install libreoffice pipx python-devel ruby mupdf redis
41+
pipx install unoserver --system-site-packages
3942
sudo systemctl enable --now redis
4043
# Arch linux
41-
sudo pacman -S unoconv ruby python mupdf-tools redis
44+
sudo pacman -S libreoffice python-pipx ruby python mupdf-tools redis
45+
pipx install unoserver --system-site-packages
4246
sudo systemctl enable --now redis
4347
```
4448

documents/tasks.py

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44
import re
55
import subprocess
66
import tempfile
7+
import time
78
import uuid
89
from io import BytesIO
910

1011
from django.conf import settings
1112
from django.core.files.base import ContentFile, File
1213

1314
from celery import chain, shared_task
14-
from celery.exceptions import SoftTimeLimitExceeded
1515
from pypdf import PdfReader
1616

1717
from documents.models import Document, DocumentError
@@ -130,34 +130,49 @@ def checksum(self, document_id: int) -> int:
130130

131131
@short_doctask
132132
def convert_office_to_pdf(self, document_id: int) -> int:
133-
try:
134-
document = Document.objects.get(pk=document_id)
133+
document = Document.objects.get(pk=document_id)
135134

136-
with file_as_local(
137-
document.original, prefix="dochub_unoconv_input_"
138-
) as tmpfile:
135+
if settings.DEBUG:
136+
# Check if unoserver is running
137+
ping_result = subprocess.run(
138+
["unoping"], capture_output=True, timeout=5, check=False
139+
)
140+
141+
if ping_result.returncode != 0:
142+
# Server not running, start it as a daemon
143+
# Here we want to use the system unoserver, as it needs access to LibreOffice
139144
try:
140-
sub = subprocess.check_output(
141-
["unoconv", "-f", "pdf", "--stdout", tmpfile.name]
145+
subprocess.Popen(
146+
[
147+
"unoserver",
148+
"--daemon",
149+
"--conversion-timeout",
150+
"300",
151+
],
152+
stdout=subprocess.DEVNULL,
153+
stderr=subprocess.DEVNULL,
142154
)
143-
except OSError as e:
144-
raise MissingBinary("unoconv") from e
145-
except subprocess.CalledProcessError as e:
146-
raise DocumentProcessingError(
147-
document, exc=e, message='"unoconv" has failed: %s' % e.output[:800]
148-
) from e
149-
150-
document.pdf.save(str(uuid.uuid4()) + ".pdf", ContentFile(sub))
151-
152-
return document_id
153-
154-
except SoftTimeLimitExceeded as e:
155-
# If we timed out, kill the faulty openoffice daemon
156-
# it will respawn at the next unoconv invocation
157-
os.system("killall soffice.bin")
158-
# Still raise the exception so the pipeline for this
159-
# document is still stopped
160-
raise e
155+
except FileNotFoundError as e:
156+
raise MissingBinary("unoserver") from e
157+
# Give the server time to start up and be ready
158+
time.sleep(2)
159+
160+
try:
161+
result = subprocess.run(
162+
["unoconvert", "-", "-", "--convert-to", "pdf"],
163+
input=document.original.read(),
164+
capture_output=True,
165+
check=True,
166+
)
167+
sub = result.stdout
168+
except subprocess.CalledProcessError as e:
169+
raise DocumentProcessingError(
170+
document, exc=e, message="unoconvert has failed: %s" % e.stderr[:2000]
171+
) from e
172+
173+
document.pdf.save(str(uuid.uuid4()) + ".pdf", ContentFile(sub))
174+
175+
return document_id
161176

162177

163178
@short_doctask

documents/templates/documents/viewer.html

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ <h1 class="fs-4 mb-0 d-flex align-items-center gap-1">
3636

3737
<div class="d-sm-flex gap-2 align-items-center">
3838
<div class="mt-2 d-flex gap-2 align-items-center">
39-
{% if document.state == "DONE" %}
39+
{% if document.state == "DONE" and not document.is_unconvertible %}
4040
<a class="btn btn-primary btn-sm d-inline-flex align-items-center gap-1" data-turbo="false"
4141
href="{% url 'document_pdf' document.pk %}">
4242
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor"
@@ -140,7 +140,14 @@ <h1 class="fs-4 mb-0 d-flex align-items-center gap-1">
140140
{% endblock header %}
141141

142142
{% block content %}
143-
{% if document.state == "DONE" %}
143+
{% if document.is_unconvertible %}
144+
<div class="alert alert-primary" role="alert">
145+
DocHub ne sait pas générer d'aperçu pour ce document.
146+
Mais tu peux le
147+
<a href="{% url 'document_original' document.pk %}">télécharger</a>
148+
et l'ouvrir directement chez toi.
149+
</div>
150+
{% elif document.state == "DONE" %}
144151
<div class="container-xl" data-controller="viewer"
145152
data-viewer-src-value="{% url 'document_pdf' document.pk %}?embed">
146153
<div data-viewer-target="sidebar">
@@ -165,20 +172,12 @@ <h1 class="fs-4 mb-0 d-flex align-items-center gap-1">
165172

166173
</div>
167174
{% elif document.state == "ERROR" %}
168-
{% if document.is_unconvertible %}
169-
<div class="alert alert-primary" role="alert">
170-
DocHub ne sait pas générer d'aperçu pour ce document.
171-
Mais tu peux le
172-
<a href="{% url 'document_original' document.pk %}">télécharger</a>
173-
et l'ouvrir directement chez toi.
174-
</div>
175-
{% else %}
176-
<div class="alert alert-primary" role="alert">
177-
Ce document n'a pas pu être traité par DocHub à cause d'une erreur.
178-
Mais tu peux <a href="{% url 'document_original' document.pk %}">télécharger
179-
la version originale </a> si tu le désires.
180-
</div>
181-
{% endif %}
175+
176+
<div class="alert alert-primary" role="alert">
177+
Ce document n'a pas pu être traité par DocHub à cause d'une erreur.
178+
Mais tu peux <a href="{% url 'document_original' document.pk %}">télécharger
179+
la version originale </a> si tu le désires.
180+
</div>
182181
{% else %}
183182
<div class="alert alert-primary" role="alert">
184183
Ce document est en cours de traitement par DocHub et il n'est donc pas encore

documents/tests/celery_test.py

Lines changed: 50 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import signal
2-
from subprocess import call
1+
import logging
2+
import subprocess
33

44
from django.core.files import File
55

@@ -11,28 +11,9 @@
1111
from documents.tasks import mutool_get_pages, process_document
1212
from users.models import User
1313

14-
pytestmark = [pytest.mark.django_db, pytest.mark.celery]
15-
16-
17-
class Alarm(Exception):
18-
pass
19-
20-
21-
def alarm_handler(signum, frame):
22-
raise Alarm
23-
14+
logger = logging.getLogger(__name__)
2415

25-
def start_unoconv():
26-
signal.signal(signal.SIGALRM, alarm_handler)
27-
signal.alarm(1)
28-
try:
29-
call(["unoconv", "--listener"]) # workaround for a shitty unoconv
30-
# Error: Unable to connect or start own listener. Aborting.
31-
# Setting a timeout because if a listener exists already it hangs...
32-
except Alarm:
33-
pass
34-
35-
signal.alarm(0) # cancel alarm
16+
pytestmark = [pytest.mark.django_db, pytest.mark.celery]
3617

3718

3819
def create_doc(name, ext):
@@ -79,18 +60,59 @@ def test_send_duplicate():
7960
assert Document.objects.filter(id=doc.id).count() == 0
8061

8162

82-
# TODO : mock unoconv and provide a fake pdf instead
83-
@pytest.mark.unoconv
63+
@pytest.fixture
64+
def unoserver():
65+
# Check if unoserver is running
66+
ping_result = subprocess.run(
67+
["unoping"], capture_output=True, timeout=5, check=False
68+
)
69+
70+
if ping_result.returncode != 0:
71+
logger.debug("Unoserver is not running, starting it ourselves")
72+
sub = subprocess.Popen(
73+
[
74+
"unoserver",
75+
"--daemon",
76+
"--conversion-timeout",
77+
"300",
78+
],
79+
stdout=subprocess.PIPE,
80+
stderr=subprocess.PIPE,
81+
)
82+
logger.debug("Unoserver started")
83+
84+
yield
85+
86+
logger.debug("Killing unoserver")
87+
sub.kill()
88+
89+
# Get stdout and stderr
90+
stdout, stderr = sub.communicate(timeout=5)
91+
92+
# Log them
93+
if stdout:
94+
logger.info(
95+
"unoserver stdout:\n%s", stdout.decode("utf-8", errors="replace")
96+
)
97+
if stderr:
98+
logger.error(
99+
"unoserver stderr:\n%s", stderr.decode("utf-8", errors="replace")
100+
)
101+
else:
102+
logger.debug("Unoserver is already running")
103+
yield
104+
105+
106+
# TODO : mock unoserver and provide a fake pdf instead
107+
@pytest.mark.unoserver
84108
@pytest.mark.slow
85-
def test_send_office():
109+
def test_send_office(unoserver):
86110
doc = create_doc("My office doc", ".docx")
87111

88112
with open("documents/tests/files/2pages.docx", "rb") as fd:
89113
f = File(fd)
90114
doc.original.save("silly-unique-deadbeef-file.docx", f)
91115

92-
start_unoconv()
93-
94116
result = process_document.delay(doc.id)
95117
assert result.status == celery.states.SUCCESS, result.traceback
96118

pyproject.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,16 @@ dev = [
6262
]
6363

6464
[tool.pytest.ini_options]
65-
norecursedirs = "ve ve3 static media .git node_modules"
65+
norecursedirs = ".venv ve ve3 static media .git node_modules"
6666
DJANGO_SETTINGS_MODULE="www.test_settings"
6767
addopts = "--reuse-db"
68+
filterwarnings = [
69+
"ignore:builtin type.*has no __module__ attribute:DeprecationWarning"
70+
]
6871
markers = """
6972
slow: marks tests as slow (deselect with '-m "not slow"')
7073
network: marks tests using the network (deselect with '-m "not network"')
71-
unoconv: uses unoconv (underterministic)
74+
unoserver: uses unoserver for office document conversion
7275
webtest: http queries against localhost
7376
celery: uses celery tasks
7477
postgresql: needs a postgresql database to run

unoserver/Dockerfile

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Mostly copied from
2+
# https://github.com/unoconv/unoserver-docker/blob/main/Dockerfile
3+
# Under MIT License Copyright (c) 2022 unoconv
4+
5+
FROM eclipse-temurin:24.0.1_9-jdk-alpine-3.21
6+
7+
LABEL org.opencontainers.image.source=https://github.com/DocHub-ULB/DocHub/unoserver
8+
9+
# Install LibreOffice, Python, and essential dependencies
10+
RUN apk add --no-cache \
11+
py3-pip \
12+
libreoffice \
13+
# Essential fonts for document rendering
14+
font-noto \
15+
font-noto-cjk \
16+
ttf-dejavu \
17+
ttf-liberation \
18+
fontconfig && \
19+
fc-cache -f && \
20+
rm -rf /var/cache/apk/* /tmp/*
21+
22+
# Install unoserver
23+
RUN pip install --break-system-packages unoserver==3.6
24+
25+
# Create non-root user
26+
RUN addgroup -S worker && adduser -S worker -G worker
27+
USER worker
28+
WORKDIR /home/worker
29+
30+
# Expose unoserver port
31+
EXPOSE 2003
32+
33+
HEALTHCHECK --interval=5s CMD unoping --host 127.0.0.1 --port 2003
34+
35+
# Run unoserver
36+
CMD ["unoserver", "--interface", "0.0.0.0", "--port", "2003"]
37+

unoserver/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Unoserver Docker Image
2+
3+
This Docker image is used as a service in our GitHub Actions workflow for running tests (see `.github/workflows/python-tests.yml`).
4+
5+
To build and push the image: `docker buildx build --platform linux/amd64,linux/arm64 -t ghcr.io/dochub-ulb/unoserver:latest . --push`

0 commit comments

Comments
 (0)