Skip to content

Commit 7304b1b

Browse files
committed
Add Prometheus metrics exporter for action execution
1 parent b50c066 commit 7304b1b

File tree

7 files changed

+270
-11
lines changed

7 files changed

+270
-11
lines changed

automation_file/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
from automation_file.core.fim import IntegrityMonitor
4141
from automation_file.core.json_store import read_action_json, write_action_json
4242
from automation_file.core.manifest import ManifestException, verify_manifest, write_manifest
43+
from automation_file.core.metrics import ACTION_COUNT, ACTION_DURATION, record_action
44+
from automation_file.core.metrics import render as render_metrics
4345
from automation_file.core.package_loader import PackageLoader
4446
from automation_file.core.progress import (
4547
CancellationToken,
@@ -169,6 +171,7 @@
169171
)
170172
from automation_file.server.action_acl import ActionACL, ActionNotPermittedException
171173
from automation_file.server.http_server import HTTPActionServer, start_http_action_server
174+
from automation_file.server.metrics_server import MetricsServer, start_metrics_server
172175
from automation_file.server.tcp_server import (
173176
TCPActionServer,
174177
start_autocontrol_socket_server,
@@ -337,6 +340,12 @@ def __getattr__(name: str) -> Any:
337340
"decrypt_file",
338341
"generate_key",
339342
"key_from_password",
343+
"ACTION_COUNT",
344+
"ACTION_DURATION",
345+
"record_action",
346+
"render_metrics",
347+
"MetricsServer",
348+
"start_metrics_server",
340349
# Triggers
341350
"FileWatcher",
342351
"TriggerManager",

automation_file/core/action_executor.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@
1515

1616
from __future__ import annotations
1717

18+
import time
1819
from collections.abc import Mapping
1920
from concurrent.futures import ThreadPoolExecutor
2021
from typing import Any
2122

2223
from automation_file.core.action_registry import ActionRegistry, build_default_registry
2324
from automation_file.core.json_store import read_action_json
25+
from automation_file.core.metrics import record_action
2426
from automation_file.core.substitution import substitute as substitute_payload
2527
from automation_file.exceptions import ExecuteActionException, ValidationException
2628
from automation_file.logging_config import file_automation_logger
@@ -150,27 +152,40 @@ def add_command_to_executor(self, command_dict: Mapping[str, Any]) -> None:
150152

151153
# Internals ---------------------------------------------------------
152154
def _run_one(self, action: list, dry_run: bool) -> Any:
155+
name = _safe_action_name(action)
156+
if dry_run:
157+
return self._run_dry(action)
158+
started = time.monotonic()
159+
ok = False
153160
try:
154-
if dry_run:
155-
name, kind, payload = self._parse_action(action)
156-
if self.registry.resolve(name) is None:
157-
raise ExecuteActionException(f"unknown action: {name!r}")
158-
file_automation_logger.info(
159-
"dry_run: %s kind=%s payload=%r",
160-
name,
161-
kind,
162-
payload,
163-
)
164-
return f"dry_run:{name}"
165161
value = self._execute_event(action)
166162
file_automation_logger.info("execute_action: %s", action)
163+
ok = True
167164
return value
168165
except ExecuteActionException as error:
169166
file_automation_logger.error("execute_action malformed: %r", error)
170167
return repr(error)
171168
except Exception as error: # pylint: disable=broad-except
172169
file_automation_logger.error("execute_action runtime error: %r", error)
173170
return repr(error)
171+
finally:
172+
record_action(name, time.monotonic() - started, ok)
173+
174+
def _run_dry(self, action: list) -> Any:
175+
try:
176+
name, kind, payload = self._parse_action(action)
177+
if self.registry.resolve(name) is None:
178+
raise ExecuteActionException(f"unknown action: {name!r}")
179+
except ExecuteActionException as error:
180+
file_automation_logger.error("execute_action malformed: %r", error)
181+
return repr(error)
182+
file_automation_logger.info(
183+
"dry_run: %s kind=%s payload=%r",
184+
name,
185+
kind,
186+
payload,
187+
)
188+
return f"dry_run:{name}"
174189

175190
@staticmethod
176191
def _coerce(action_list: list | Mapping[str, Any]) -> list:
@@ -188,6 +203,12 @@ def _coerce(action_list: list | Mapping[str, Any]) -> list:
188203
return action_list
189204

190205

206+
def _safe_action_name(action: Any) -> str:
207+
if isinstance(action, list) and action and isinstance(action[0], str):
208+
return action[0]
209+
return "unknown"
210+
211+
191212
# Default shared executor — built once, mutated in place by plugins.
192213
executor: ActionExecutor = ActionExecutor()
193214

automation_file/core/metrics.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""Prometheus metrics — per-action counters and duration histogram.
2+
3+
The module exposes two metrics that are updated from
4+
:class:`~automation_file.core.action_executor.ActionExecutor` on every
5+
call:
6+
7+
* ``automation_file_actions_total{action, status}`` — counter incremented
8+
with ``status="ok"`` or ``status="error"`` per action.
9+
* ``automation_file_action_duration_seconds{action}`` — histogram of wall
10+
time spent inside the registered callable.
11+
12+
:func:`render` returns the wire-format text and matching ``Content-Type``
13+
suitable for a ``GET /metrics`` handler. :func:`record_action` is the
14+
single write path — failures are swallowed so a broken metrics backend
15+
can never abort a real action.
16+
"""
17+
18+
from __future__ import annotations
19+
20+
from prometheus_client import CONTENT_TYPE_LATEST, REGISTRY, Counter, Histogram, generate_latest
21+
22+
from automation_file.logging_config import file_automation_logger
23+
24+
_DURATION_BUCKETS = (
25+
0.005,
26+
0.01,
27+
0.025,
28+
0.05,
29+
0.1,
30+
0.25,
31+
0.5,
32+
1.0,
33+
2.5,
34+
5.0,
35+
10.0,
36+
30.0,
37+
60.0,
38+
)
39+
40+
ACTION_COUNT = Counter(
41+
"automation_file_actions_total",
42+
"Total actions executed, partitioned by outcome.",
43+
labelnames=("action", "status"),
44+
)
45+
ACTION_DURATION = Histogram(
46+
"automation_file_action_duration_seconds",
47+
"Time spent inside a registered action callable.",
48+
labelnames=("action",),
49+
buckets=_DURATION_BUCKETS,
50+
)
51+
52+
53+
def record_action(action: str, duration_seconds: float, ok: bool) -> None:
54+
"""Record one action execution. Never raises."""
55+
status = "ok" if ok else "error"
56+
try:
57+
ACTION_COUNT.labels(action=action, status=status).inc()
58+
ACTION_DURATION.labels(action=action).observe(max(0.0, float(duration_seconds)))
59+
except Exception as err: # pragma: no cover - defensive
60+
file_automation_logger.error("metrics.record_action failed: %r", err)
61+
62+
63+
def render() -> tuple[bytes, str]:
64+
"""Return ``(payload, content_type)`` for a ``/metrics`` response."""
65+
return generate_latest(REGISTRY), CONTENT_TYPE_LATEST
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""Prometheus scrape endpoint (``GET /metrics``).
2+
3+
``start_metrics_server(host, port)`` spawns a threaded HTTP server that
4+
serves the current metrics snapshot rendered by
5+
:func:`automation_file.core.metrics.render`. Like the other servers in
6+
this package it defaults to loopback and requires ``allow_non_loopback=True``
7+
to bind elsewhere. No auth is attached by default — put it behind a
8+
reverse proxy or use an ACL at the network layer if exposing beyond a
9+
trusted host.
10+
"""
11+
12+
from __future__ import annotations
13+
14+
import threading
15+
from http import HTTPStatus
16+
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
17+
18+
from automation_file.core.metrics import render
19+
from automation_file.logging_config import file_automation_logger
20+
from automation_file.server.network_guards import ensure_loopback
21+
22+
_DEFAULT_HOST = "127.0.0.1"
23+
_DEFAULT_PORT = 9945
24+
_METRICS_PATH = "/metrics"
25+
26+
27+
class _MetricsHandler(BaseHTTPRequestHandler):
28+
"""GET /metrics -> current Prometheus snapshot."""
29+
30+
def log_message( # pylint: disable=arguments-differ
31+
self, format_str: str, *args: object
32+
) -> None:
33+
file_automation_logger.info("metrics_server: " + format_str, *args)
34+
35+
def do_GET(self) -> None: # pylint: disable=invalid-name — BaseHTTPRequestHandler API
36+
if self.path != _METRICS_PATH:
37+
self._send(HTTPStatus.NOT_FOUND, b"not found", "text/plain; charset=utf-8")
38+
return
39+
payload, content_type = render()
40+
self._send(HTTPStatus.OK, payload, content_type)
41+
42+
def _send(self, status: HTTPStatus, payload: bytes, content_type: str) -> None:
43+
self.send_response(status)
44+
self.send_header("Content-Type", content_type)
45+
self.send_header("Content-Length", str(len(payload)))
46+
self.end_headers()
47+
self.wfile.write(payload)
48+
49+
50+
class MetricsServer(ThreadingHTTPServer):
51+
"""Threaded HTTP server serving ``GET /metrics``."""
52+
53+
54+
def start_metrics_server(
55+
host: str = _DEFAULT_HOST,
56+
port: int = _DEFAULT_PORT,
57+
allow_non_loopback: bool = False,
58+
) -> MetricsServer:
59+
"""Start a metrics server on a background thread and return it."""
60+
if not allow_non_loopback:
61+
ensure_loopback(host)
62+
server = MetricsServer((host, port), _MetricsHandler)
63+
thread = threading.Thread(target=server.serve_forever, daemon=True)
64+
thread.start()
65+
address = server.server_address
66+
file_automation_logger.info("metrics_server: listening on %s:%d", address[0], address[1])
67+
return server

dev.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ dependencies = [
2626
"PySide6>=6.6.0",
2727
"watchdog>=4.0.0",
2828
"cryptography>=42.0.0",
29+
"prometheus_client>=0.20.0",
2930
"tomli>=2.0.1; python_version<\"3.11\""
3031
]
3132
classifiers = [

stable.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ dependencies = [
2626
"PySide6>=6.6.0",
2727
"watchdog>=4.0.0",
2828
"cryptography>=42.0.0",
29+
"prometheus_client>=0.20.0",
2930
"tomli>=2.0.1; python_version<\"3.11\""
3031
]
3132
classifiers = [

tests/test_metrics.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
"""Tests for the Prometheus metrics exporter."""
2+
3+
from __future__ import annotations
4+
5+
import urllib.request
6+
7+
import pytest
8+
9+
from automation_file import (
10+
ACTION_COUNT,
11+
execute_action,
12+
record_action,
13+
render_metrics,
14+
start_metrics_server,
15+
)
16+
from automation_file.core.action_executor import executor
17+
from tests._insecure_fixtures import insecure_url, ipv4
18+
19+
20+
def _register_probes() -> None:
21+
if "test_metric_ok" not in executor.registry:
22+
executor.registry.register("test_metric_ok", lambda: "ok")
23+
if "test_metric_fail" not in executor.registry:
24+
25+
def _boom() -> None:
26+
raise RuntimeError("boom")
27+
28+
executor.registry.register("test_metric_fail", _boom)
29+
30+
31+
def test_record_action_increments_counter() -> None:
32+
before = ACTION_COUNT.labels(action="unit.test", status="ok")._value.get()
33+
record_action("unit.test", 0.01, ok=True)
34+
after = ACTION_COUNT.labels(action="unit.test", status="ok")._value.get()
35+
assert after == before + 1
36+
37+
38+
def test_record_action_clamps_negative_duration() -> None:
39+
# Must not raise.
40+
record_action("unit.clamp", -5.0, ok=True)
41+
42+
43+
def test_render_metrics_content_type() -> None:
44+
payload, content_type = render_metrics()
45+
assert content_type.startswith("text/plain")
46+
assert b"automation_file_actions_total" in payload
47+
48+
49+
def test_execute_action_increments_ok_counter() -> None:
50+
_register_probes()
51+
before = ACTION_COUNT.labels(action="test_metric_ok", status="ok")._value.get()
52+
execute_action([["test_metric_ok"]])
53+
after = ACTION_COUNT.labels(action="test_metric_ok", status="ok")._value.get()
54+
assert after == before + 1
55+
56+
57+
def test_execute_action_increments_error_counter() -> None:
58+
_register_probes()
59+
before = ACTION_COUNT.labels(action="test_metric_fail", status="error")._value.get()
60+
execute_action([["test_metric_fail"]])
61+
after = ACTION_COUNT.labels(action="test_metric_fail", status="error")._value.get()
62+
assert after == before + 1
63+
64+
65+
def test_metrics_server_serves_metrics_endpoint() -> None:
66+
_register_probes()
67+
execute_action([["test_metric_ok"]])
68+
server = start_metrics_server(host="127.0.0.1", port=0)
69+
host, port = server.server_address
70+
try:
71+
url = insecure_url("http", f"{host}:{port}/metrics")
72+
with urllib.request.urlopen(url, timeout=3) as resp: # nosec B310 - loopback test server
73+
body = resp.read().decode("utf-8")
74+
assert resp.status == 200
75+
assert "automation_file_actions_total" in body
76+
finally:
77+
server.shutdown()
78+
79+
80+
def test_metrics_server_returns_404_for_other_paths() -> None:
81+
server = start_metrics_server(host="127.0.0.1", port=0)
82+
host, port = server.server_address
83+
try:
84+
url = insecure_url("http", f"{host}:{port}/other")
85+
with pytest.raises(urllib.error.HTTPError) as info:
86+
urllib.request.urlopen(url, timeout=3) # nosec B310 - loopback test server
87+
assert info.value.code == 404
88+
finally:
89+
server.shutdown()
90+
91+
92+
def test_metrics_server_rejects_non_loopback() -> None:
93+
non_loopback = ipv4(8, 8, 8, 8)
94+
with pytest.raises(ValueError):
95+
start_metrics_server(host=non_loopback, port=0)

0 commit comments

Comments
 (0)