Add metrics client

ehanson8 · ehanson8 · commit aef771aff922 · 2026-04-15T13:51:12.000-04:00
Why these changes are being introduced:
* A metrics client class is needed to implement AWS CloudWatch metrics

How this addresses that need:
* Add MetricsClient class with methods for publishing individual and batch metrics
* Add METRICS_NAMESPACE and METRICS constants to config.py
* Add metrics attributes to Workflow class
* Add publish_single_metric method calls to the Workflow submit_items and finalize_items methods
* Update dependencies

Side effects of this change:
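* Instantiating a Workflow now creates a boto3 CloudWatch client, and submit_items and finalize_items make a CloudWatch put_metric_data call for each item outcome they record (item_submitted, submission_error, ingested_item, ingest_error); publish failures are logged, not raised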

Relevant ticket(s):
* NA
diff --git a/dsc/config.py b/dsc/config.py
@@ -4,6 +4,15 @@
 
 import sentry_sdk
 
+# CloudWatch namespace under which MetricsClient publishes every metric.
+METRICS_NAMESPACE = "dso"
+
+# Approved metric names; MetricsClient raises ValueError for any other name.
+METRICS = [
+    "item_submitted",  # item submitted to DSS
+    "submission_error",  # error during submission to DSS
+    "ingested_item",  # item ingested successfully into DSpace
+    "ingest_error",  # error during attempted item ingest into DSpace
+]
+
 
 class Config:
     REQUIRED_ENV_VARS: Iterable[str] = [
diff --git a/dsc/utils/aws/__init__.py b/dsc/utils/aws/__init__.py
@@ -1,5 +1,6 @@
+from dsc.utils.aws.metrics import MetricsClient
 from dsc.utils.aws.s3 import S3Client
 from dsc.utils.aws.ses import SESClient
 from dsc.utils.aws.sqs import SQSClient
 
-__all__ = ["S3Client", "SESClient", "SQSClient"]
+__all__ = ["MetricsClient", "S3Client", "SESClient", "SQSClient"]
diff --git a/dsc/utils/aws/metrics.py b/dsc/utils/aws/metrics.py
@@ -0,0 +1,198 @@
+"""AWS CloudWatch metrics client for workflow submission tracking."""
+
+from __future__ import annotations
+
+import logging
+
+import boto3
+
+from dsc.config import METRICS, METRICS_NAMESPACE
+
+logger = logging.getLogger(__name__)
+
+
+# Unit strings accepted by MetricsClient._validate_unit.
+# NOTE(review): CloudWatch's StandardUnit enum also allows "None"; it is not
+# listed here -- confirm the omission is intentional.
+UNIT_VALUES = frozenset(
+    [
+        "Seconds",
+        "Microseconds",
+        "Milliseconds",
+        "Bytes",
+        "Kilobytes",
+        "Megabytes",
+        "Gigabytes",
+        "Terabytes",
+        "Bits",
+        "Kilobits",
+        "Megabits",
+        "Gigabits",
+        "Terabits",
+        "Percent",
+        "Count",
+        "Bytes/Second",
+        "Kilobytes/Second",
+        "Megabytes/Second",
+        "Gigabytes/Second",
+        "Terabytes/Second",
+        "Bits/Second",
+        "Kilobits/Second",
+        "Megabits/Second",
+        "Gigabits/Second",
+        "Terabits/Second",
+        "Count/Second",
+    ]
+)
+
+
+class MetricsClient:
+    """A simple client to record metrics to AWS CloudWatch.
+
+    Metrics can be published one at a time or accumulated in ``batch_metrics``
+    and published together. Metric names are restricted to METRICS and units to
+    UNIT_VALUES; every metric is published under METRICS_NAMESPACE.
+    """
+
+    def __init__(self) -> None:
+        """Create the boto3 CloudWatch client and an empty metric batch."""
+        self.cloudwatch = boto3.client("cloudwatch")
+        self.batch_metrics: list[dict] = []
+
+    def publish_single_metric(
+        self,
+        metric_name: str,
+        value: float,
+        unit: str,
+        metric_dimensions: dict[str, str] | None = None,
+    ) -> None:
+        """Publish a single metric to CloudWatch.
+
+        Args:
+            metric_name: The name of the metric to publish; must be in METRICS.
+            value: The value of the metric.
+            unit: The unit of the metric; must be in UNIT_VALUES.
+            metric_dimensions: Optional dictionary of dimension names and values.
+
+        Raises:
+            ValueError: If the metric name is not approved or the unit is invalid.
+        """
+        metric_data = self._validate_and_build_metric_data(
+            metric_name, value, unit, metric_dimensions
+        )
+        self._push_metric_data([metric_data])
+
+    def _validate_and_build_metric_data(
+        self,
+        metric_name: str,
+        value: float,
+        unit: str,
+        metric_dimensions: dict[str, str] | None = None,
+    ) -> dict:
+        """Validate and build a metric data dictionary for CloudWatch.
+
+        Args:
+            metric_name: The name of the metric; must be in METRICS.
+            value: The value of the metric.
+            unit: The unit of the metric; must be in UNIT_VALUES.
+            metric_dimensions: Optional dictionary of dimension names and values.
+
+        Returns:
+            A dict with "MetricName", "Value", "Unit", and "Dimensions" keys, where
+            "Dimensions" is a list of {"Name": ..., "Value": ...} dicts (empty when
+            no dimensions are given).
+
+        Raises:
+            ValueError: If the metric name is not approved or the unit is invalid.
+        """
+        self._approved_metric(metric_name)
+        self._validate_unit(unit)
+        dimensions = [
+            {"Name": name, "Value": dim_value}
+            for name, dim_value in (metric_dimensions or {}).items()
+        ]
+        return {
+            "MetricName": metric_name,
+            "Value": value,
+            "Unit": unit,
+            "Dimensions": dimensions,
+        }
+
+    def _approved_metric(self, metric_name: str) -> bool:
+        """Check if a metric name is in the approved list of metrics for the application.
+
+        Args:
+            metric_name: The name of the metric to check.
+
+        Returns:
+            True when the name is in METRICS; an unapproved name raises instead of
+            returning False.
+
+        Raises:
+            ValueError: If the metric name is not in METRICS.
+        """
+        if metric_name not in METRICS:
+            raise ValueError(
+                f"Metric name '{metric_name}' is not in the approved list of metrics: "
+                f"{', '.join(METRICS)}"
+            )
+        return True
+
+    def _validate_unit(self, unit: str) -> None:
+        """Validate that the metric unit is one of UNIT_VALUES.
+
+        Args:
+            unit: The unit to validate.
+
+        Raises:
+            ValueError: If unit is not in UNIT_VALUES.
+        """
+        if unit not in UNIT_VALUES:
+            # Sort the units: frozenset iteration order is not stable between
+            # runs, which would make the error message nondeterministic.
+            allowed = ", ".join(sorted(UNIT_VALUES))
+            raise ValueError(f"Invalid unit '{unit}'. Must be one of: {allowed}")
+
+    def _push_metric_data(self, metric_data: list[dict]) -> None:
+        """Push metric data to CloudWatch under METRICS_NAMESPACE.
+
+        Publishing is best-effort: any exception raised by the request is logged
+        with its traceback and not re-raised.
+
+        Args:
+            metric_data: List of metric dictionaries to push.
+        """
+        try:
+            self.cloudwatch.put_metric_data(
+                Namespace=METRICS_NAMESPACE, MetricData=metric_data
+            )
+            logger.info(f"Published metric with {metric_data} to CloudWatch.")
+        except Exception:
+            logger.exception(
+                f"Failed to publish metric with {metric_data} to CloudWatch."
+            )
+
+    def add_metric_to_batch(
+        self,
+        metric_name: str,
+        value: float,
+        unit: str,
+        metric_dimensions: dict[str, str] | None = None,
+    ) -> None:
+        """Add a metric to the batch for later publishing.
+
+        The metric is validated immediately, so an invalid metric never enters
+        the batch through this method.
+
+        Args:
+            metric_name: The name of the metric; must be in METRICS.
+            value: The value of the metric.
+            unit: The unit of the metric; must be in UNIT_VALUES.
+            metric_dimensions: Optional dictionary of dimension names and values.
+
+        Raises:
+            ValueError: If the metric name is not approved or the unit is invalid.
+        """
+        metric_data = self._validate_and_build_metric_data(
+            metric_name, value, unit, metric_dimensions
+        )
+        self.batch_metrics.append(metric_data)
+
+    def publish_batch_metrics(self, batch_size: int = 20) -> None:
+        """Publish all accumulated batch metrics to CloudWatch, then clear the batch.
+
+        Metrics are sent in consecutive slices of at most ``batch_size``. The
+        batch is cleared after the push attempts even if a push failed, because
+        _push_metric_data logs failures instead of raising.
+
+        Args:
+            batch_size: Maximum number of metrics sent per request; must be >= 1.
+
+        Raises:
+            ValueError: If batch_size is less than 1, or if any metric has an
+                unapproved name, an invalid unit, or missing required fields.
+        """
+        if batch_size < 1:
+            # A zero step makes range() raise an opaque error, and a negative
+            # step yields no slices, so the batch would be cleared unpublished.
+            raise ValueError(f"batch_size must be at least 1, got {batch_size}")
+
+        if not self.batch_metrics:
+            logger.info("No metrics to publish.")
+            return
+
+        # Validate all metrics before publishing; batch_metrics is a public list,
+        # so entries may have been appended without going through
+        # add_metric_to_batch.
+        for metric in self.batch_metrics:
+            if not all(key in metric for key in ("MetricName", "Value", "Unit")):
+                raise ValueError(
+                    f"Each metric must contain 'MetricName', 'Value', and 'Unit'. "
+                    f"Invalid metric: {metric}"
+                )
+            self._approved_metric(metric["MetricName"])
+            self._validate_unit(metric["Unit"])
+
+        for start in range(0, len(self.batch_metrics), batch_size):
+            self._push_metric_data(self.batch_metrics[start : start + batch_size])
+        self.batch_metrics.clear()
diff --git a/dsc/workflows/base/workflow.py b/dsc/workflows/base/workflow.py
@@ -19,7 +19,7 @@
 )
 from dsc.item_submission import ItemSubmission
 from dsc.reports import Report
-from dsc.utils.aws import SESClient, SQSClient
+from dsc.utils.aws import MetricsClient, SESClient, SQSClient
 from dsc.utils.validate.schemas import RESULT_MESSAGE_ATTRIBUTES, RESULT_MESSAGE_BODY
 
 if TYPE_CHECKING:  # pragma: no cover
@@ -125,6 +125,11 @@ def __init__(self, batch_id: str) -> None:
             "skipped": 0,
             "errors": 0,
         }
+        self.metrics_client = MetricsClient()
+        self.metrics_dimensions = {
+            "application": "dsc",
+            "workflow_name": self.workflow_name,
+        }
 
         # cache list of bitstreams
         self._batch_bitstream_uris: list[str] | None = None
@@ -323,13 +328,27 @@ def submit_items(self, collection_handle: str) -> list:
                 item_submission.status_details = None
                 item_submission.submit_attempts += 1
                 item_submission.upsert_db()
+
+                self.metrics_client.publish_single_metric(
+                    metric_name="item_submitted",
+                    value=1,
+                    unit="Count",
+                    metric_dimensions=self.metrics_dimensions,
+                )
             except Exception as exception:  # noqa: BLE001
                 self.submission_summary["errors"] += 1
                 item_submission.status = ItemSubmissionStatus.SUBMIT_FAILED
                 item_submission.status_details = str(exception)
                 item_submission.submit_attempts += 1
                 item_submission.upsert_db()
 
+                self.metrics_client.publish_single_metric(
+                    metric_name="submission_error",
+                    value=1,
+                    unit="Count",
+                    metric_dimensions=self.metrics_dimensions,
+                )
+
         logger.info(
             f"Submitted messages to the DSS input queue '{CONFIG.sqs_queue_dss_input}' "
             f"for batch '{self.batch_id}': {json.dumps(self.submission_summary)}"
@@ -399,11 +418,26 @@ def finalize_items(self) -> None:
                 item_submission.dspace_handle = result_message.dspace_handle
                 sqs_results_summary["ingest_success"] += 1
                 logger.debug(f"Record {log_str} was ingested")
+
+                self.metrics_client.publish_single_metric(
+                    metric_name="ingested_item",
+                    value=1,
+                    unit="Count",
+                    metric_dimensions=self.metrics_dimensions,
+                )
             elif result_message.result_type == "error":
                 item_submission.status = ItemSubmissionStatus.INGEST_FAILED
                 item_submission.status_details = result_message.error_info
                 sqs_results_summary["ingest_failed"] += 1
                 logger.debug(f"Record {log_str} failed to ingest")
+
+                self.metrics_client.publish_single_metric(
+                    metric_name="ingest_error",
+                    value=1,
+                    unit="Count",
+                    metric_dimensions=self.metrics_dimensions,
+                )
+
             else:
                 item_submission.status = ItemSubmissionStatus.INGEST_UNKNOWN
                 sqs_results_summary["ingest_unknown"] += 1
diff --git a/uv.lock b/uv.lock