Add metric agg mode locally. Add task execution stats. Clean up dependencies.

monoxgas · monoxgas · commit 62fe64409463 · 2025-04-24T12:10:20.000-06:00
diff --git a/dreadnode/main.py b/dreadnode/main.py
@@ -32,7 +32,7 @@
     ENV_SERVER,
     ENV_SERVER_URL,
 )
-from dreadnode.metric import Metric, Scorer, ScorerCallable, T
+from dreadnode.metric import Metric, MetricMode, Scorer, ScorerCallable, T
 from dreadnode.task import P, R, Task
 from dreadnode.tracing.exporters import (
     FileExportConfig,
@@ -757,6 +757,7 @@ def log_metric(
         step: int = 0,
         origin: t.Any | None = None,
         timestamp: datetime | None = None,
+        mode: MetricMode = "direct",
         to: ToObject = "task-or-run",
     ) -> None:
         """
@@ -778,6 +779,14 @@ def log_metric(
             origin: The origin of the metric - can be provided any object which was logged
                 as an input or output anywhere in the run.
             timestamp: The timestamp of the metric - defaults to the current time.
+            mode: The aggregation mode to use for the metric. Helpful when you want to let
+                the library take care of translating your raw values into better representations.
+                - direct: do not modify the value at all (default)
+                - min: the lowest observed value reported for this metric
+                - max: the highest observed value reported for this metric
+                - avg: the average of all reported values for this metric
+                - sum: the cumulative sum of all reported values for this metric
+                - count: increment every time this metric is logged - disregard value
             to: The target object to log the metric to. Can be "task-or-run" or "run".
                 Defaults to "task-or-run". If "task-or-run", the metric will be logged
                 to the current task or run, whichever is the nearest ancestor.
@@ -790,6 +799,7 @@ def log_metric(
         value: Metric,
         *,
         origin: t.Any | None = None,
+        mode: MetricMode = "direct",
         to: ToObject = "task-or-run",
     ) -> None:
         """
@@ -809,6 +819,13 @@ def log_metric(
             value: The metric object.
             origin: The origin of the metric - can be provided any object which was logged
                 as an input or output anywhere in the run.
+            mode: The aggregation mode to use for the metric. Helpful when you want to let
+                the library take care of translating your raw values into better representations.
+                - direct: do not modify the value at all (default)
+                - min: always report the lowest observed value for this metric
+                - max: always report the highest observed value for this metric
+                - sum: report a rolling sum of all values for this metric
+                - count: report the number of times this metric has been logged
             to: The target object to log the metric to. Can be "task-or-run" or "run".
                 Defaults to "task-or-run". If "task-or-run", the metric will be logged
                 to the current task or run, whichever is the nearest ancestor.
@@ -824,6 +841,7 @@ def log_metric(
         step: int = 0,
         origin: t.Any | None = None,
         timestamp: datetime | None = None,
+        mode: MetricMode = "direct",
         to: ToObject = "task-or-run",
     ) -> None:
         task = current_task_span.get()
@@ -838,7 +856,7 @@ def log_metric(
             if isinstance(value, Metric)
             else Metric(float(value), step, timestamp or datetime.now(timezone.utc))
         )
-        target.log_metric(key, metric, origin=origin)
+        target.log_metric(key, metric, origin=origin, mode=mode)
 
     @handle_internal_errors()
     def log_artifact(
diff --git a/dreadnode/metric.py b/dreadnode/metric.py
@@ -10,6 +10,8 @@
 
 T = t.TypeVar("T")
 
+MetricMode = t.Literal["direct", "avg", "sum", "min", "max", "count"]
+
 
 @dataclass
 class Metric:
@@ -55,6 +57,46 @@ def from_many(
         score_attributes = {name: value for name, value, _ in values}
         return cls(value=total / weight, step=step, attributes={**attributes, **score_attributes})
 
+    def apply_mode(self, mode: MetricMode, others: "list[Metric]") -> "Metric":
+        """
+        Apply an aggregation mode to the metric, modifying it in place.
+
+        Args:
+            mode: One of "direct", "avg", "sum", "min", "max", or "count".
+            others: Metrics previously logged under the same key (already aggregated).
+
+        Returns:
+            self
+
+        Raises:
+            ValueError: If `mode` differs from the mode recorded on the first entry of `others`.
+        """
+        previous_mode = next((m.attributes.get("mode") for m in others), mode) or "direct"
+        if mode != previous_mode:
+            raise ValueError(f"Cannot mix metric modes {mode} != {previous_mode}")
+
+        if mode == "direct":
+            return self
+
+        self.attributes["original"] = self.value
+        self.attributes["mode"] = mode
+
+        # Stored values are already aggregated: the latest one is the running sum/average.
+        prior_values = [m.value for m in sorted(others, key=lambda m: m.timestamp)]
+        if mode == "sum" and prior_values:
+            self.value += prior_values[-1]
+        elif mode == "min":
+            self.value = min([self.value, *prior_values])
+        elif mode == "max":
+            self.value = max([self.value, *prior_values])
+        elif mode == "count":
+            self.value = len(others) + 1
+        elif mode == "avg" and prior_values:
+            current_avg = prior_values[-1]
+            self.value = current_avg + (self.value - current_avg) / (len(prior_values) + 1)
+
+        return self
+
 
 MetricDict = dict[str, list[Metric]]
 
diff --git a/dreadnode/task.py b/dreadnode/task.py
@@ -52,7 +52,8 @@ def top_n(
         *,
         as_outputs: t.Literal[False] = False,
         reverse: bool = True,
-    ) -> "TaskSpanList[R]": ...
+    ) -> "TaskSpanList[R]":
+        ...
 
     @t.overload
     def top_n(
@@ -61,7 +62,8 @@ def top_n(
         *,
         as_outputs: t.Literal[True],
         reverse: bool = True,
-    ) -> list[R]: ...
+    ) -> list[R]:
+        ...
 
     def top_n(
         self,
@@ -83,7 +85,7 @@ def top_n(
         """
         sorted_ = self.sorted(reverse=reverse)[:n]
         return (
-            t.cast(list[R], [span.output for span in sorted_])  # noqa: TC006
+            t.cast(list[R], [span.output for span in sorted_])
             if as_outputs
             else TaskSpanList(sorted_)
         )
@@ -246,6 +248,8 @@ async def run(self, *args: P.args, **kwargs: P.kwargs) -> TaskSpan[R]:
             run_id=run.run_id,
             tracer=self.tracer,
         ) as span:
+            span.run.log_metric(f"{self.label}.exec.count", 1, mode="count")
+
             for name, value in params_to_log.items():
                 span.log_param(name, value)
 
@@ -254,10 +258,15 @@ async def run(self, *args: P.args, **kwargs: P.kwargs) -> TaskSpan[R]:
                 for name, value in inputs_to_log.items()
             ]
 
-            output = t.cast(R | t.Awaitable[R], self.func(*args, **kwargs))  # noqa: TC006
-            if inspect.isawaitable(output):
-                output = await output
+            try:
+                output = t.cast(R | t.Awaitable[R], self.func(*args, **kwargs))
+                if inspect.isawaitable(output):
+                    output = await output
+            except Exception:
+                span.run.log_metric(f"{self.label}.exec.success_rate", 0, mode="avg")
+                raise
 
+            span.run.log_metric(f"{self.label}.exec.success_rate", 1, mode="avg")
             span.output = output
 
             if self.log_output:
diff --git a/dreadnode/tracing/span.py b/dreadnode/tracing/span.py
@@ -28,7 +28,7 @@
 from dreadnode.artifact.storage import ArtifactStorage
 from dreadnode.artifact.tree_builder import ArtifactTreeBuilder, DirectoryNode
 from dreadnode.constants import MAX_INLINE_OBJECT_BYTES
-from dreadnode.metric import Metric, MetricDict
+from dreadnode.metric import Metric, MetricDict, MetricMode
 from dreadnode.object import Object, ObjectRef, ObjectUri, ObjectVal
 from dreadnode.serialization import Serialized, serialize
 from dreadnode.types import UNSET, AnyDict, JsonDict, JsonValue, Unset
@@ -526,7 +526,9 @@ def log_metric(
         step: int = 0,
         origin: t.Any | None = None,
         timestamp: datetime | None = None,
-    ) -> None: ...
+        mode: MetricMode = "direct",
+    ) -> None:
+        ...
 
     @t.overload
     def log_metric(
@@ -535,7 +537,9 @@ def log_metric(
         value: Metric,
         *,
         origin: t.Any | None = None,
-    ) -> None: ...
+        mode: MetricMode = "direct",
+    ) -> None:
+        ...
 
     def log_metric(
         self,
@@ -545,6 +549,7 @@ def log_metric(
         step: int = 0,
         origin: t.Any | None = None,
         timestamp: datetime | None = None,
+        mode: MetricMode = "direct",
     ) -> None:
         metric = (
             value
@@ -560,9 +565,8 @@ def log_metric(
             )
             metric.attributes[METRIC_ATTRIBUTE_SOURCE_HASH] = origin_hash
 
-        self._metrics.setdefault(key, []).append(metric)
-        if self._span is None:
-            return
+        metrics = self._metrics.setdefault(key, [])
+        metrics.append(metric.apply_mode(mode, metrics))
 
     @property
     def outputs(self) -> AnyDict:
@@ -735,7 +739,9 @@ def log_metric(
         step: int = 0,
         origin: t.Any | None = None,
         timestamp: datetime | None = None,
-    ) -> None: ...
+        mode: MetricMode = "direct",
+    ) -> None:
+        ...
 
     @t.overload
     def log_metric(
@@ -744,7 +750,9 @@ def log_metric(
         value: Metric,
         *,
         origin: t.Any | None = None,
-    ) -> None: ...
+        mode: MetricMode = "direct",
+    ) -> None:
+        ...
 
     def log_metric(
         self,
@@ -754,6 +762,7 @@ def log_metric(
         step: int = 0,
         origin: t.Any | None = None,
         timestamp: datetime | None = None,
+        mode: MetricMode = "direct",
     ) -> None:
         metric = (
             value
@@ -769,12 +778,13 @@ def log_metric(
             )
             metric.attributes[METRIC_ATTRIBUTE_SOURCE_HASH] = origin_hash
 
-        self._metrics.setdefault(key, []).append(metric)
+        metrics = self._metrics.setdefault(key, [])
+        metrics.append(metric.apply_mode(mode, metrics))
 
         # For every metric we log, also log it to the run
         # with our `label` as a prefix.
         #
-        # Don't include `source` as we handled it here.
+        # Don't include `source` and `mode` as we handled them here.
         if (run := current_run_span.get()) is not None:
             run.log_metric(f"{self._label}.{key}", metric)
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -46,20 +46,13 @@ readme = "README.md"
 python = ">=3.10,<3.13"
 pydantic = "^2.9.2"
 httpx = "^0.28.0"
-ruamel-yaml = "^0.18.6"
 logfire = "^3.5.3"
 python-ulid = "^3.0.0"
 fast-depends = "^2.4.12"
 coolname = "^2.2.0"
-pandas = "^2.2.3"
-pyarrow = "^19.0.1"
-loguru = "^0.7.3"
 fsspec = { extras = [
     "s3",
 ], version = "2024.12.0" } # pinned this version to be compatible with datasets
-pydub = "^0.25.1"
-moviepy = "^2.1.2"
-datasets = "^3.5.0"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "^1.8.0"
@@ -72,6 +65,11 @@ pandas-stubs = "^2.2.3.250308"
 types-requests = "^2.32.0.20250306"
 rigging = "^2.3.0"
 typer = "^0.15.2"
+pydub = "^0.25.1"
+moviepy = "^2.1.2"
+datasets = "^3.5.0"
+pandas = "^2.2.3"
+pyarrow = "^19.0.1"
 
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
diff --git a/uv.lock b/uv.lock