Skip to content

Commit 9303280

Browse files
AP-660: Implements static_files plugin (#62)
- Adds a static_files plugin that serves files from ./storage (configurable). - Adds a "Files" tab to the DagRun page that shows files created for that run. Refactors existing URLs/storage dirs to use that new location. - Extracts Jinja2 templating for the main app into mokelumne.templates. Templates need to be rendered slightly differently for emails vs. on-disk indexes, and this cleans up that code.
1 parent a712e75 commit 9303280

19 files changed

Lines changed: 773 additions & 87 deletions

.gitignore

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@ tmp
55
artifacts/*
66
logs/*
77
download/*
8-
public/*
8+
files/*
9+
!files/.keep
910
__pycache__/
1011
*.py[cod]
1112
*$py.class
1213
config
1314
airflow.cfg
1415
*.egg-info
1516
.coverage
16-
build/
17+
build/
18+
uv.lock

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,6 @@ Important environment variables for our build/environment:
107107
|`AWS_MODEL_ID`|The model to use. Make sure it's supported on the ARN.|`AWS_MODEL_ID="us.anthropic.claude-haiku-4-5-20251001-v1:0"`|
108108
|`AWS_MODEL_LABEL`|A human friendly label for the model. Will eventually be displayed in the Tind record.|`AWS_MODEL_LABEL="Claude Haiku 4.5"`|
109109
|`AWS_MODEL_PROVIDER`|The provider for the model. |`AWS_MODEL_PROVIDER=anthropic`|
110-
|`MOKELUMNE_PUBLIC_STORAGE`|Path for public assets|`MOKELUMNE_PUBLIC_STORAGE=/opt/airflow/public`|
111110
|`MOKELUMNE_PUBLIC_URL`|URL to access public assets - must end in `/`|`MOKELUMNE_PUBLIC_URL=https://mokelumne-assets.ucblib.org/`|
112111

113112
Note: The `AIRFLOW_UID` example in `example.env` maps to the reserved `uid` for the `airflow` user in [lap/workflow](https://git.lib.berkeley.edu/lap/workflow/-/wikis/UIDs).

docker-compose.ci.yml

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,45 @@
1+
x-airflow-common-overrides: &airflow-common-overrides
2+
image: ${DOCKER_APP_IMAGE}
3+
volumes: !override
4+
- airflow_files:/opt/airflow/files
5+
16
services:
27
airflow-apiserver:
8+
<<: *airflow-common-overrides
39
build: !reset
410
develop: !reset
5-
image: ${DOCKER_APP_IMAGE}
6-
volumes: !reset
711

812
airflow-cli:
13+
<<: *airflow-common-overrides
914
build: !reset
1015
develop: !reset
11-
image: ${DOCKER_APP_IMAGE}
12-
volumes: !reset
1316

1417
airflow-dag-processor:
18+
<<: *airflow-common-overrides
1519
build: !reset
1620
develop: !reset
17-
image: ${DOCKER_APP_IMAGE}
18-
volumes: !reset
1921

2022
airflow-init:
23+
<<: *airflow-common-overrides
2124
build: !reset
2225
develop: !reset
23-
image: ${DOCKER_APP_IMAGE}
24-
volumes: !reset
2526

2627
airflow-scheduler:
28+
<<: *airflow-common-overrides
2729
build: !reset
2830
develop: !reset
29-
image: ${DOCKER_APP_IMAGE}
30-
volumes: !reset
3131

3232
airflow-triggerer:
33+
<<: *airflow-common-overrides
3334
build: !reset
3435
develop: !reset
35-
image: ${DOCKER_APP_IMAGE}
36-
volumes: !reset
3736

3837
airflow-worker:
38+
<<: *airflow-common-overrides
3939
build: !reset
4040
develop: !reset
41-
image: ${DOCKER_APP_IMAGE}
42-
volumes: !reset
4341

4442
flower: !reset
43+
44+
volumes:
45+
airflow_files: {}

docker-compose.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ x-airflow-common: &airflow-common
6969
volumes:
7070
- ./:/opt/airflow
7171
- ./artifacts:/home/airflow/artifacts
72+
- ./files:/opt/airflow/files
7273
- ./test:/opt/airflow/test
73-
- ./public:/opt/airflow/public
7474
user: "${AIRFLOW_UID:-40093}:0"
7575

7676
services:
@@ -123,11 +123,16 @@ services:
123123
--conn-password "${TIND_API_KEY}" \
124124
--conn-description "TIND API connection"
125125
fi
126+
chown -cR airflow:root /opt/airflow/files
127+
chmod 0775 /opt/airflow/files
126128
depends_on:
127129
postgres:
128130
condition: service_healthy
129131
entrypoint: /bin/bash
130132
restart: on-failure:3
133+
# Runs as root to chown the files directory. Necessary in GitHub Actions
134+
# where it's initialized as root:root.
135+
user: root
131136

132137
airflow-scheduler:
133138
<<: *airflow-common

files/.keep

Whitespace-only changes.
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
"""Template renderers for Batch Image emails, web pages, etc.
2+
3+
Jinja2 templates live alongside this module as ``*.html`` files. Helpers in
4+
this package load them via a FileSystemLoader anchored to ``__file__``.
5+
"""
6+
7+
from pathlib import Path
8+
9+
from jinja2 import Environment, FileSystemLoader
10+
11+
from mokelumne.plugins.static_files.helpers import static_path_to_url
12+
13+
14+
_env = Environment(
15+
loader=FileSystemLoader(Path(__file__).parent),
16+
autoescape=True,
17+
)
18+
19+
20+
def render_results_html(
21+
query: str,
22+
processed_path: Path, processed_count: int, processed_success: int, processed_failures: int,
23+
fetched_path: Path, fetched_count: int, fetched_success: int, fetched_failures: int,
24+
skipped_path: Path, skipped_count: int,
25+
embed: bool,
26+
) -> str:
27+
"""Render the per-run results summary.
28+
29+
When ``embed`` is True, asset links resolve inside Airflow's sandboxed
30+
"Files" iframe (text/plain downgrade). When False, links are suitable for
31+
standalone browser use, e.g. an emailed summary.
32+
"""
33+
return _env.get_template("results.html").render(
34+
query=query,
35+
processed={
36+
"url": static_path_to_url(processed_path, embed=embed),
37+
"count": processed_count,
38+
"success": processed_success,
39+
"failures": processed_failures,
40+
},
41+
fetched={
42+
"url": static_path_to_url(fetched_path, embed=embed),
43+
"count": fetched_count,
44+
"success": fetched_success,
45+
"failures": fetched_failures,
46+
},
47+
skipped={
48+
"url": static_path_to_url(skipped_path, embed=embed),
49+
"count": skipped_count,
50+
},
51+
)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<!doctype html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="utf-8">
5+
<title>Batch image description results</title>
6+
</head>
7+
<body>
8+
<h1>Batch image description results</h1>
9+
<p>Query: {{ query }}</p>
10+
<dl>
11+
<dt><a href="{{ processed.url }}">{{ processed.count }} images processed</a></dt>
12+
<dd>{{ processed.success }} succeeded, {{ processed.failures }} failed</dd>
13+
<dt><a href="{{ fetched.url }}">{{ fetched.count }} images fetched</a></dt>
14+
<dd>{{ fetched.success }} succeeded, {{ fetched.failures }} failed</dd>
15+
<dt><a href="{{ skipped.url }}">{{ skipped.count }} records skipped</a></dt>
16+
<dd>These records did not match filter criteria</dd>
17+
</dl>
18+
</body>
19+
</html>

mokelumne/dags/gen_llm_image_descriptions.py

Lines changed: 33 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,26 +7,25 @@
77
import logging
88
from collections import namedtuple
99
from datetime import datetime, UTC
10-
from html import escape
1110
from itertools import filterfalse
1211
from os import environ as ENV
1312
from pathlib import Path
1413
from shutil import copyfile
1514
from typing import List
16-
from uuid import uuid4
17-
1815
from airflow.exceptions import AirflowFailException
1916
from airflow.providers.smtp.operators.smtp import EmailOperator
2017
from airflow.sdk import dag, task, task_group, Param, get_current_context
2118
from pymarc.marcxml import map_xml
2219

2320
from langchain_aws import ChatBedrock
21+
from mokelumne.batch_image.templates import render_results_html
2422
from mokelumne.dags.fetch_tind_records import write_query_results_to_xml
2523
from mokelumne.providers.tind.hooks.tind import TindHook
2624
from mokelumne.util import langfuse
2725
from mokelumne.util.image_describer import ImageDescriber
2826
from mokelumne.util.image_fetcher import ImageFetcher, base64_size
29-
from mokelumne.util.storage import run_dir, public_dir, public_path_to_url
27+
from mokelumne.plugins.static_files.helpers import static_files_run_dir
28+
from mokelumne.util.storage import run_dir
3029
from mokelumne.util.tind_csv_writer import TindCsvWriter, is_single_image_record
3130

3231

@@ -357,9 +356,14 @@ def summarise_job():
357356

358357
@task
359358
def generate_id() -> str:
360-
"""Generate a URL-safe directory for collated output."""
361-
output_path = public_dir() / str(uuid4())
362-
output_path.mkdir() # We don't exist_ok=True because it should be unique.
359+
"""Return the per-run public output directory, creating it if needed.
360+
361+
The static_files plugin's DAG-run "Files" tab links to
362+
``<STATIC_FILES_ROOT>/<dag_id>/<run_id>/``, so collated outputs must
363+
land there for the tab to find them.
364+
"""
365+
context = get_current_context()
366+
output_path = static_files_run_dir(context["dag"].dag_id, context["run_id"])
363367
return str(output_path)
364368

365369
@task
@@ -413,22 +417,26 @@ def count_success_fail_of_csv(csv_file: Path, success: str) -> tuple[int, int, i
413417
count_success_fail_of_csv(proc_path, "success")
414418
)
415419

416-
template_html = f"""
417-
<html><head><title>Batch image description results</title></head><body>
418-
<h1>Batch image description results</h1>
419-
<p>Query: {escape(params['tind_query'])}</p>
420-
<dl>
421-
<dt><a href="{public_path_to_url(proc_path)}">{proc_count} images processed</a></dt>
422-
<dd>{proc_success} succeeded, {proc_failures} failed</dd>
423-
<dt><a href="{public_path_to_url(fetched_path)}">{fetch_count} images fetched</a></dt>
424-
<dd>{fetch_success} succeeded, {fetch_failures} failed</dd>
425-
<dt><a href="{public_path_to_url(skipped_path)}">{skip_count} records skipped</a></dt>
426-
<dd>These records did not match filter criteria</dd>
427-
</dl>
428-
</body></html>"""
429-
430-
with (output_path / 'index.html').open('w') as html:
431-
html.write(template_html)
420+
def render(embed: bool) -> str:
421+
return render_results_html(
422+
query=params['tind_query'],
423+
processed_path=proc_path,
424+
processed_count=proc_count,
425+
processed_success=proc_success,
426+
processed_failures=proc_failures,
427+
fetched_path=fetched_path,
428+
fetched_count=fetch_count,
429+
fetched_success=fetch_success,
430+
fetched_failures=fetch_failures,
431+
skipped_path=skipped_path,
432+
skipped_count=skip_count,
433+
embed=embed,
434+
)
435+
436+
# index.html: served from AirFlow's iFrame, so downloads are blocked (embed=True)
437+
(output_path / 'index.html').write_text(render(embed=True), encoding='utf-8')
438+
# email.html: opened from email client, so downloads are allowed (embed=False)
439+
(output_path / 'email.html').write_text(render(embed=False), encoding='utf-8')
432440

433441
return output_dir_str
434442

@@ -442,11 +450,10 @@ def notify_user(pub_directory):
442450

443451
@task
444452
def render_email_template(asset_directory: str) -> str:
445-
"""Create the HTML template for the email message that will be sent."""
453+
"""Read the pre-rendered email body for this run."""
446454
asset_path = Path(str(asset_directory))
447455

448-
with (asset_path / "index.html").open(encoding='utf-8') as html:
449-
return html.read()
456+
return (asset_path / "email.html").read_text(encoding='utf-8')
450457

451458
EmailOperator(
452459
task_id='send_email',

mokelumne/plugins/__init__.py

Whitespace-only changes.
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from airflow.plugins_manager import AirflowPlugin
2+
3+
from .config import (
4+
STATIC_FILES_DAG_RUN_TAB_LABEL,
5+
STATIC_FILES_EMBED_PARAM,
6+
STATIC_FILES_PLUGIN_NAME,
7+
STATIC_FILES_ROOT,
8+
STATIC_FILES_ROUTE,
9+
STATIC_FILES_URL_PREFIX,
10+
)
11+
from .helpers import static_file_path, static_files_root, static_path_to_url
12+
from .routes import files_router
13+
14+
15+
class StaticFilesPlugin(AirflowPlugin):
16+
name = STATIC_FILES_PLUGIN_NAME
17+
18+
fastapi_apps = [
19+
{
20+
"app": files_router,
21+
"name": STATIC_FILES_PLUGIN_NAME,
22+
"url_prefix": STATIC_FILES_URL_PREFIX,
23+
}
24+
]
25+
26+
external_views = [
27+
{
28+
"name": STATIC_FILES_DAG_RUN_TAB_LABEL,
29+
"destination": "dag_run",
30+
"url_route": STATIC_FILES_ROUTE,
31+
"href": f"{STATIC_FILES_URL_PREFIX}/{{DAG_ID}}/{{RUN_ID}}/?{STATIC_FILES_EMBED_PARAM}=1",
32+
}
33+
]
34+
35+
# Adds macros allowing the rest of AirFlow to address files served by this plugin.
36+
macros = [
37+
static_file_path,
38+
static_files_root,
39+
static_path_to_url,
40+
]

0 commit comments

Comments
 (0)