Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .github/workflows/python-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,10 @@ jobs:
run: sudo apt update

- name: Install apt packages
run: sudo apt install -y graphicsmagick mupdf-tools ghostscript unoconv python3-dev
run: sudo apt install -y graphicsmagick mupdf-tools ghostscript libreoffice pipx python3-dev

- name: Install unoserver
run: pipx install unoserver --system-site-packages

- name: Install the project
run: uv sync --dev
Expand All @@ -64,10 +67,10 @@ jobs:
run: uv run ./manage.py collectstatic --noinput -v 0

- name: pytest with SQLite
run: uv run pytest -k "not unoconv"
run: uv run pytest

- name: pytest with PostgreSQL
run: uv run pytest -k "not unoconv" -m postgresql
run: uv run pytest -m postgresql
env:
DB_URL: postgres://postgres:postgres@localhost:5432/postgres

Expand Down
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,17 @@ First, install uv and system dependencies:
curl -LsSf https://astral.sh/uv/install.sh | sh

# Ubuntu
sudo apt-get install unoconv python3-dev ruby libtiff5-dev libjpeg-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python-tk mupdf-tools redis-server
sudo apt-get install libreoffice pipx python3-dev ruby libtiff5-dev libjpeg-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python-tk mupdf-tools redis-server
# unoserver needs access to LibreOffice's 'uno' library from system packages
pipx install unoserver --system-site-packages
sudo systemctl enable --now redis-server
# Fedora
sudo dnf install unoconv python-devel ruby mupdf redis
sudo dnf install libreoffice pipx python-devel ruby mupdf redis
pipx install unoserver --system-site-packages
sudo systemctl enable --now redis
# Arch Linux
sudo pacman -S unoconv ruby python mupdf-tools redis
sudo pacman -S libreoffice python-pipx ruby python mupdf-tools redis
pipx install unoserver --system-site-packages
sudo systemctl enable --now redis
```

Expand Down
66 changes: 38 additions & 28 deletions documents/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import re
import subprocess
import tempfile
import time
import uuid
from io import BytesIO

from django.conf import settings
from django.core.files.base import ContentFile, File

from celery import chain, shared_task
from celery.exceptions import SoftTimeLimitExceeded
from pypdf import PdfReader

from documents.models import Document, DocumentError
Expand Down Expand Up @@ -130,34 +130,44 @@ def checksum(self, document_id: int) -> int:

@short_doctask
def convert_office_to_pdf(self, document_id: int) -> int:
document = Document.objects.get(pk=document_id)

try:
document = Document.objects.get(pk=document_id)

with file_as_local(
document.original, prefix="dochub_unoconv_input_"
) as tmpfile:
try:
sub = subprocess.check_output(
["unoconv", "-f", "pdf", "--stdout", tmpfile.name]
)
except OSError as e:
raise MissingBinary("unoconv") from e
except subprocess.CalledProcessError as e:
raise DocumentProcessingError(
document, exc=e, message='"unoconv" has failed: %s' % e.output[:800]
) from e

document.pdf.save(str(uuid.uuid4()) + ".pdf", ContentFile(sub))

return document_id

except SoftTimeLimitExceeded as e:
# If we timed out, kill the faulty openoffice daemon
# it will respawn at the next unoconv invocation
os.system("killall soffice.bin")
# Still raise the exception so the pipeline for this
# document is still stopped
raise e
# Check if unoserver is running
ping_result = subprocess.run(
["unoping"], capture_output=True, timeout=5, check=False
)

if ping_result.returncode != 0:
# Server not running, start it as a daemon
# Note: --daemon causes the process to fork, so we use Popen and don't wait
subprocess.Popen(
["unoserver", "--daemon", "--conversion-timeout", "300"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
# Give the server time to start up and be ready
time.sleep(2)

try:
result = subprocess.run(
["unoconvert", "-", "-", "--convert-to", "pdf"],
input=document.original.read(),
capture_output=True,
check=True,
)
sub = result.stdout
except subprocess.CalledProcessError as e:
raise DocumentProcessingError(
document, exc=e, message='"unoconvert" has failed: %s' % e.output[:800]
) from e

except FileNotFoundError as e:
raise MissingBinary("unoserver") from e

document.pdf.save(str(uuid.uuid4()) + ".pdf", ContentFile(sub))

return document_id


@short_doctask
Expand Down
30 changes: 2 additions & 28 deletions documents/tests/celery_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
import signal
from subprocess import call

from django.core.files import File

import celery
Expand All @@ -14,27 +11,6 @@
pytestmark = [pytest.mark.django_db, pytest.mark.celery]


class Alarm(Exception):
pass


def alarm_handler(signum, frame):
raise Alarm


def start_unoconv():
signal.signal(signal.SIGALRM, alarm_handler)
signal.alarm(1)
try:
call(["unoconv", "--listener"]) # workaround for a shitty unoconv
# Error: Unable to connect or start own listener. Aborting.
# Setting a timeout because if a listener exists already it hangs...
except Alarm:
pass

signal.alarm(0) # cancel alarm


def create_doc(name, ext):
try:
user = User.objects.get(netid="test_user")
Expand Down Expand Up @@ -79,8 +55,8 @@ def test_send_duplicate():
assert Document.objects.filter(id=doc.id).count() == 0


# TODO : mock unoconv and provide a fake pdf instead
@pytest.mark.unoconv
# TODO : mock unoserver and provide a fake pdf instead
@pytest.mark.unoserver
@pytest.mark.slow
def test_send_office():
doc = create_doc("My office doc", ".docx")
Expand All @@ -89,8 +65,6 @@ def test_send_office():
f = File(fd)
doc.original.save("silly-unique-deadbeef-file.docx", f)

start_unoconv()

result = process_document.delay(doc.id)
assert result.status == celery.states.SUCCESS, result.traceback

Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ dev = [
]

[tool.pytest.ini_options]
norecursedirs = "ve ve3 static media .git node_modules"
norecursedirs = ".venv ve ve3 static media .git node_modules"
DJANGO_SETTINGS_MODULE="www.test_settings"
addopts = "--reuse-db"
markers = """
slow: marks tests as slow (deselect with '-m "not slow"')
network: marks tests using the network (deselect with '-m "not network"')
unoconv: uses unoconv (nondeterministic)
unoserver: uses unoserver for office document conversion
webtest: http queries against localhost
celery: uses celery tasks
postgresql: needs a postgresql database to run
Expand Down