From 25cd4a95e380882d558178d94a0f1e544b1fec54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Juli=C3=A1n?= Date: Fri, 19 Jun 2026 10:28:00 +0200 Subject: [PATCH 1/4] Add Kueue integration (#23908) * Add initial Kueue OpenMetrics integration scaffold. Start with a basic OpenMetrics V2 check that forwards endpoint metrics under the kueue namespace to enable early endpoint validation before adding curated mappings. * Add basic Kueue OpenMetrics scraping * Add curated Kueue OpenMetrics mapping * Add Kueue resource metric suffixing * Document Kueue as a cluster check * Implement Kueue integration * Fix code coverage missing * Update README * Fix manifest * Update metadata * Add owners * Fix Kueue CI validation * Address Kueue review feedback. * Add memory * Pin kind networking and node image for Kueue E2E Use non-default service/pod subnets so the kind cluster's API service IP does not collide with the host environment's Kubernetes networking, which hijacked in-cluster traffic and broke Kueue's webhook cert bootstrap. Also scope the LocalQueue readiness wait to the default namespace. * Fix Kueue go info tag validation. Rename the generic Go version label before submission so E2E metrics pass tag validation. * Make Kueue E2E test pass against live cluster Relax metric tag assertions to match the actual tag set emitted by the controller (endpoint, replica_role, cohort tags) instead of pinning an exact subset, and add the missing assets/service_checks.json (with its manifest reference) that assert_service_checks requires. * Wait for Kueue webhook before applying queue manifests The controller deployment can report `Available` before its webhook server is actually serving, causing intermittent `connection refused` failures when applying ResourceFlavor/ClusterQueue. Wait for the webhook service endpoints and retry the apply to absorb the brief cert-propagation window. 
* Use remapped tag names in Kueue metric descriptions Metric descriptions referenced the raw Prometheus labels ('cluster_queue', 'local_queue'/'localQueue') instead of the tags Datadog actually emits after remapping ('kueue_cluster_queue', 'kueue_local_queue'). * Rename cluster_queue.pending_workloads to pending_workloads The raw kueue_pending_workloads metric has no cluster_queue in its name, so the cluster_queue. prefix was inconsistent with every other cluster-queue- indexed metric (which keep bare names and just carry the kueue_cluster_queue tag). Drop the prefix to match the source name and the rest of the convention. * Sync Kueue configuration example * Update codeowners * Apply suggestions from code review * Apply Kueue review cleanup * Use metadata assertion for Kueue metric coverage * Remove Kueue service check metadata * Configure Kueue manifestless metadata * Refactor Kueue tag assertions * Assert Kueue e2e metrics from metadata * Assert idle Kueue e2e metrics * Rename Kueue flavor label * Document Kueue GC summary metrics * Remove checks * Add more e2e metrics * Fix e2e setup * Fix Kueue e2e controller rollout wait. * Change codeowners * Unify Kueue unit and e2e metric expectations. Share EXPECTED_METRIC_TAGS between tests, align the OpenMetrics fixture with e2e queue labels, expand unit coverage, and pin go_version to go1.26.3 for kueue.go.info to match the controller toolchain. 
--- .ddev/config.toml | 3 + .github/CODEOWNERS | 4 + .github/workflows/config/labeler.yml | 4 + .github/workflows/test-all.yml | 20 + code-coverage.datadog.yml | 3 + kueue/CHANGELOG.md | 4 + kueue/README.md | 80 +++ kueue/assets/configuration/spec.yaml | 31 + kueue/changelog.d/23908.added | 1 + kueue/datadog_checks/kueue/__about__.py | 4 + kueue/datadog_checks/kueue/__init__.py | 10 + kueue/datadog_checks/kueue/check.py | 110 +++ .../kueue/config_models/__init__.py | 24 + .../kueue/config_models/defaults.py | 140 ++++ .../kueue/config_models/instance.py | 186 +++++ .../kueue/config_models/shared.py | 60 ++ .../kueue/config_models/validators.py | 13 + .../kueue/data/conf.yaml.example | 656 ++++++++++++++++++ kueue/datadog_checks/kueue/metrics.py | 135 ++++ kueue/hatch.toml | 8 + kueue/metadata.csv | 174 +++++ kueue/pyproject.toml | 61 ++ kueue/tests/__init__.py | 3 + kueue/tests/common.py | 68 ++ kueue/tests/conftest.py | 171 +++++ kueue/tests/fixtures/metrics.txt | 619 +++++++++++++++++ kueue/tests/kind/kind-config.yaml | 12 + kueue/tests/kind/kueue-config.yaml | 50 ++ kueue/tests/kind/metrics-reader.yaml | 18 + kueue/tests/kind/queue.yaml | 30 + kueue/tests/kind/workloads.yaml | 45 ++ kueue/tests/test_e2e.py | 21 + kueue/tests/test_unit.py | 97 +++ 33 files changed, 2865 insertions(+) create mode 100644 kueue/CHANGELOG.md create mode 100644 kueue/README.md create mode 100644 kueue/assets/configuration/spec.yaml create mode 100644 kueue/changelog.d/23908.added create mode 100644 kueue/datadog_checks/kueue/__about__.py create mode 100644 kueue/datadog_checks/kueue/__init__.py create mode 100644 kueue/datadog_checks/kueue/check.py create mode 100644 kueue/datadog_checks/kueue/config_models/__init__.py create mode 100644 kueue/datadog_checks/kueue/config_models/defaults.py create mode 100644 kueue/datadog_checks/kueue/config_models/instance.py create mode 100644 kueue/datadog_checks/kueue/config_models/shared.py create mode 100644 
kueue/datadog_checks/kueue/config_models/validators.py create mode 100644 kueue/datadog_checks/kueue/data/conf.yaml.example create mode 100644 kueue/datadog_checks/kueue/metrics.py create mode 100644 kueue/hatch.toml create mode 100644 kueue/metadata.csv create mode 100644 kueue/pyproject.toml create mode 100644 kueue/tests/__init__.py create mode 100644 kueue/tests/common.py create mode 100644 kueue/tests/conftest.py create mode 100644 kueue/tests/fixtures/metrics.txt create mode 100644 kueue/tests/kind/kind-config.yaml create mode 100644 kueue/tests/kind/kueue-config.yaml create mode 100644 kueue/tests/kind/metrics-reader.yaml create mode 100644 kueue/tests/kind/queue.yaml create mode 100644 kueue/tests/kind/workloads.yaml create mode 100644 kueue/tests/test_e2e.py create mode 100644 kueue/tests/test_unit.py diff --git a/.ddev/config.toml b/.ddev/config.toml index c3583e24d1044..cb18809459788 100644 --- a/.ddev/config.toml +++ b/.ddev/config.toml @@ -41,6 +41,7 @@ teamcity = "TeamCity" win32_event_log = "Windows Event Log" krakend = "KrakenD" lustre = "Lustre" +kueue = "Kueue" prefect = "Prefect" n8n = "n8n" hpe_aruba_edgeconnect = "HPE Aruba EdgeConnect" @@ -51,6 +52,7 @@ dell_powerflex = "Dell Powerflex" [overrides.metrics-prefix] krakend = "krakend.api." lustre = "lustre." +kueue = "kueue." prefect = "prefect.server." n8n = "n8n." control_m = "control_m." 
@@ -270,6 +272,7 @@ __pycache__ = false [overrides.manifest.platforms] krakend = ["linux", "windows", "mac_os"] lustre = ["linux", "windows", "mac_os"] +kueue = ["linux", "windows", "mac_os"] prefect = ["linux", "windows", "mac_os"] n8n = ["linux", "windows", "mac_os"] control_m = ["linux", "windows", "mac_os"] diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 916998cca1add..1674771160ebe 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -635,6 +635,10 @@ plaid/assets/logs/ @DataDog/saa /gpu/*.md @DataDog/ebpf-platform @DataDog/documentation /gpu/manifest.json @DataDog/ebpf-platform @DataDog/agent-integrations @DataDog/documentation +/kueue/ @DataDog/gpu-monitoring-agent +/kueue/*.md @DataDog/gpu-monitoring-agent @DataDog/documentation +/kueue/manifest.json @DataDog/gpu-monitoring-agent @DataDog/agent-integrations @DataDog/documentation + /linux_audit_logs/ @DataDog/agent-integrations /linux_audit_logs/*.md @DataDog/agent-integrations @DataDog/documentation /linux_audit_logs/manifest.json @DataDog/agent-integrations @DataDog/documentation diff --git a/.github/workflows/config/labeler.yml b/.github/workflows/config/labeler.yml index b0b7a132f6147..8bd907eea58aa 100644 --- a/.github/workflows/config/labeler.yml +++ b/.github/workflows/config/labeler.yml @@ -889,6 +889,10 @@ integration/kubevirt_handler: - changed-files: - any-glob-to-any-file: - kubevirt_handler/**/* +integration/kueue: +- changed-files: + - any-glob-to-any-file: + - kueue/**/* integration/kuma: - changed-files: - any-glob-to-any-file: diff --git a/.github/workflows/test-all.yml b/.github/workflows/test-all.yml index a0aedac7373c7..d268ca64eaa93 100644 --- a/.github/workflows/test-all.yml +++ b/.github/workflows/test-all.yml @@ -2338,6 +2338,26 @@ jobs: minimum-base-package: ${{ inputs.minimum-base-package }} pytest-args: ${{ inputs.pytest-args }} secrets: inherit + j3c620e6: + uses: ./.github/workflows/test-target.yml + with: + job-name: Kueue + target: kueue + platform: linux + 
runner: '["ubuntu-22.04"]' + repo: "${{ inputs.repo }}" + context: ${{ inputs.context }} + python-version: "${{ inputs.python-version }}" + latest: ${{ inputs.latest }} + agent-image: "${{ inputs.agent-image }}" + agent-image-py2: "${{ inputs.agent-image-py2 }}" + agent-image-windows: "${{ inputs.agent-image-windows }}" + agent-image-windows-py2: "${{ inputs.agent-image-windows-py2 }}" + test-py2: ${{ inputs.test-py2 }} + test-py3: ${{ inputs.test-py3 }} + minimum-base-package: ${{ inputs.minimum-base-package }} + pytest-args: ${{ inputs.pytest-args }} + secrets: inherit j739f9be: uses: ./.github/workflows/test-target.yml with: diff --git a/code-coverage.datadog.yml b/code-coverage.datadog.yml index 0273ce99805c4..1533cb46742de 100644 --- a/code-coverage.datadog.yml +++ b/code-coverage.datadog.yml @@ -421,6 +421,9 @@ services: - id: kubevirt_handler paths: - kubevirt_handler/datadog_checks/kubevirt_handler/ +- id: kueue + paths: + - kueue/datadog_checks/kueue/ - id: kuma paths: - kuma/datadog_checks/kuma/ diff --git a/kueue/CHANGELOG.md b/kueue/CHANGELOG.md new file mode 100644 index 0000000000000..a6fb787822503 --- /dev/null +++ b/kueue/CHANGELOG.md @@ -0,0 +1,4 @@ +# CHANGELOG - Kueue + + + diff --git a/kueue/README.md b/kueue/README.md new file mode 100644 index 0000000000000..8d6635dbe08bc --- /dev/null +++ b/kueue/README.md @@ -0,0 +1,80 @@ +# Agent Check: Kueue + +## Overview + +This check monitors Kueue through the Datadog Agent. + +Kueue is a Kubernetes workload queueing system that allows you to manage and schedule workloads on your Kubernetes cluster. It provides a way to prioritize and manage workloads, and to ensure that workloads are scheduled in a fair and efficient manner. This integration collects metrics from the Kueue controller manager and Kueue API server to help you monitor the health and performance of your Kueue cluster. + +## Setup + +Follow the instructions below to install and configure this check for an Agent running on a host. 
For containerized environments, see the [Autodiscovery Integration Templates][3]. + +### Installation + +The Kueue check is included in the [Datadog Agent][2] package. +No additional installation is required on your server. + +### Configuration + +Kueue is a cluster-level service. Configure this integration as a Cluster Agent cluster check so only one Agent instance scrapes the Kueue metrics endpoint. + +1. To collect optional ClusterQueue resource metrics, such as `kueue.cluster_queue.resource_usage.gpu`, configure Kueue with `metrics.enableClusterQueueResources: true` and restart the Kueue controller manager. + +2. Provide a [cluster check configuration][10] to the Cluster Agent. For file or ConfigMap based configuration, set `cluster_check: true` at the top level of the configuration file, alongside `init_config` and `instances`: + + ```yaml + clusterAgent: + confd: + kueue.yaml: |- + cluster_check: true + init_config: + instances: + - openmetrics_endpoint: http://kueue-controller-manager-metrics-service.kueue-system.svc:8080/metrics + ``` + +3. Alternatively, annotate the Kueue metrics service with Autodiscovery cluster check annotations: + + ```yaml + ad.datadoghq.com/endpoints.checks: | + { + "kueue": { + "instances": [ + { + "openmetrics_endpoint": "http://%%host%%:%%port%%/metrics" + } + ] + } + } + ``` + +See the [sample kueue.d/conf.yaml][4] for all available configuration options. + +### Validation + +[Run the Cluster Agent's `clusterchecks` subcommand][11] and look for `kueue` under the Checks section. + +## Data Collected + +### Metrics + +See [metadata.csv][7] for a list of metrics provided by this integration. + +### Events + +The Kueue integration does not include any events. + +## Troubleshooting + +Need help? Contact [Datadog support][8]. 
+ + +[2]: https://app.datadoghq.com/account/settings/agent/latest +[3]: https://docs.datadoghq.com/containers/kubernetes/integrations/ +[4]: https://github.com/DataDog/integrations-core/blob/master/kueue/datadog_checks/kueue/data/conf.yaml.example +[5]: https://docs.datadoghq.com/agent/configuration/agent-commands/#start-stop-and-restart-the-agent +[6]: https://docs.datadoghq.com/agent/configuration/agent-commands/#agent-status-and-information +[7]: https://github.com/DataDog/integrations-core/blob/master/kueue/metadata.csv +[8]: https://docs.datadoghq.com/help/ +[10]: https://docs.datadoghq.com/containers/cluster_agent/clusterchecks/?tab=helm#configuration-from-configuration-files +[11]: https://docs.datadoghq.com/containers/troubleshooting/cluster-and-endpoint-checks/#dispatching-logic-in-the-cluster-agent diff --git a/kueue/assets/configuration/spec.yaml b/kueue/assets/configuration/spec.yaml new file mode 100644 index 0000000000000..0a2ffd8b58783 --- /dev/null +++ b/kueue/assets/configuration/spec.yaml @@ -0,0 +1,31 @@ +name: Kueue +fleet_configurable: true +files: +- name: kueue.yaml + options: + - template: init_config + options: + - template: init_config/openmetrics + - template: instances + options: + - template: instances/openmetrics + overrides: + openmetrics_endpoint.value.example: http://localhost:8080/metrics + openmetrics_endpoint.description: | + Endpoint exposing Kueue's Prometheus metrics. + - name: cluster_check + description: | + Set to true when configuring this integration as a Cluster Agent cluster check. + value: + type: boolean + example: true + - name: resource_name_map + description: | + Mapping of Kueue resource label values to metric name suffixes for resource metrics. + + By default, `cpu` is reported as `cpu`, `memory` is reported as `memory`, `nvidia.com/gpu` is reported as + `gpu`, and unmapped resources are reported as `other`. Built-in resource names cannot be overridden. 
+ value: + type: object + example: + example.com/fpga: fpga diff --git a/kueue/changelog.d/23908.added b/kueue/changelog.d/23908.added new file mode 100644 index 0000000000000..11ddfc9589429 --- /dev/null +++ b/kueue/changelog.d/23908.added @@ -0,0 +1 @@ +Initial Release. diff --git a/kueue/datadog_checks/kueue/__about__.py b/kueue/datadog_checks/kueue/__about__.py new file mode 100644 index 0000000000000..e50f43adfb9b1 --- /dev/null +++ b/kueue/datadog_checks/kueue/__about__.py @@ -0,0 +1,4 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +__version__ = '0.0.1' diff --git a/kueue/datadog_checks/kueue/__init__.py b/kueue/datadog_checks/kueue/__init__.py new file mode 100644 index 0000000000000..6efea2b972b39 --- /dev/null +++ b/kueue/datadog_checks/kueue/__init__.py @@ -0,0 +1,10 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from .__about__ import __version__ +from .check import KueueCheck + +# Some Agent loader paths look for a generic `Check` symbol. +Check = KueueCheck + +__all__ = ['__version__', 'KueueCheck', 'Check'] diff --git a/kueue/datadog_checks/kueue/check.py b/kueue/datadog_checks/kueue/check.py new file mode 100644 index 0000000000000..189292102354f --- /dev/null +++ b/kueue/datadog_checks/kueue/check.py @@ -0,0 +1,110 @@ +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import re + +from datadog_checks.base import OpenMetricsBaseCheckV2 +from datadog_checks.base.checks.openmetrics.v2.transform import get_native_dynamic_transformer + +from .config_models import ConfigMixin +from .metrics import LOCAL_QUEUE_METRIC_MAP, METRIC_MAP, RESOURCE_METRIC_MAP + +RESOURCE_METRIC_PATTERN = '^(' + '|'.join(re.escape(k) for k in RESOURCE_METRIC_MAP) + ')$' +LOCAL_QUEUE_METRIC_PATTERN = '^(' + '|'.join(re.escape(k) for k in LOCAL_QUEUE_METRIC_MAP) + ')$' + +RESOURCE_NAME_MAP = { + 'cpu': 'cpu', + 'memory': 'memory', + 'nvidia.com/gpu': 'gpu', +} + +OTHER_RESOURCE_NAME = 'other' + +DEFAULT_RENAME_LABELS = { + 'cluster_queue': 'kueue_cluster_queue', + 'flavor': 'kueue_resource_flavor', + 'version': 'go_version', +} + + +class KueueCheck(OpenMetricsBaseCheckV2, ConfigMixin): + __NAMESPACE__ = 'kueue' + DEFAULT_METRIC_LIMIT = 0 + + def __init__(self, name, init_config, instances): + super().__init__(name, init_config, instances) + self.instance['rename_labels'] = {**DEFAULT_RENAME_LABELS, **self.instance.get('rename_labels', {})} + + def get_default_config(self): + return {'metrics': [METRIC_MAP]} + + def configure_scrapers(self): + super().configure_scrapers() + + metric_transformer = self.scrapers[self.config.openmetrics_endpoint].metric_transformer + metric_transformer.add_custom_transformer( + RESOURCE_METRIC_PATTERN, + self.configure_resource_transformer(), + pattern=True, + ) + metric_transformer.add_custom_transformer( + LOCAL_QUEUE_METRIC_PATTERN, + self.configure_local_queue_transformer(), + pattern=True, + ) + + def configure_resource_transformer(self): + metric_transformer = self.scrapers[self.config.openmetrics_endpoint].metric_transformer + # Built-in names are applied last so they cannot be overridden by user config. 
+ resource_name_map = {**(self.config.resource_name_map or {}), **RESOURCE_NAME_MAP} + cached_transformers = {} + + def resource_transformer(metric, sample_data, runtime_data): + for sample, tags, hostname in sample_data: + resource = sample.labels.get('resource') + if not resource: + self.log.debug('Skipping sample for %s: missing resource label', metric.name) + continue + + resource_name = self.normalize_resource_name(resource_name_map.get(resource, OTHER_RESOURCE_NAME)) + metric_name = f'{RESOURCE_METRIC_MAP[metric.name]}.{resource_name}' + native_transformer = cached_transformers.get(metric_name) + if native_transformer is None: + native_transformer = get_native_dynamic_transformer( + self, metric_name, None, metric_transformer.global_options + ) + cached_transformers[metric_name] = native_transformer + + resource_tags = [tag for tag in tags if tag != f'resource:{resource}'] + resource_tags = self.rename_local_queue_tag(resource_tags) + native_transformer(metric, [(sample, resource_tags, hostname)], runtime_data) + + return resource_transformer + + def configure_local_queue_transformer(self): + metric_transformer = self.scrapers[self.config.openmetrics_endpoint].metric_transformer + cached_transformers = {} + + def local_queue_transformer(metric, sample_data, runtime_data): + metric_name = LOCAL_QUEUE_METRIC_MAP[metric.name] + native_transformer = cached_transformers.get(metric_name) + if native_transformer is None: + native_transformer = get_native_dynamic_transformer( + self, metric_name, None, metric_transformer.global_options + ) + cached_transformers[metric_name] = native_transformer + + new_sample_data = [ + (sample, self.rename_local_queue_tag(tags), hostname) for sample, tags, hostname in sample_data + ] + native_transformer(metric, new_sample_data, runtime_data) + + return local_queue_transformer + + @staticmethod + def rename_local_queue_tag(tags: list[str]) -> list[str]: + return [tag.replace('name:', 'kueue_local_queue:', 1) if 
tag.startswith('name:') else tag for tag in tags] + + @staticmethod + def normalize_resource_name(resource_name: str) -> str: + return resource_name.replace('/', '.').replace('-', '_') diff --git a/kueue/datadog_checks/kueue/config_models/__init__.py b/kueue/datadog_checks/kueue/config_models/__init__.py new file mode 100644 index 0000000000000..f678b7e73d91a --- /dev/null +++ b/kueue/datadog_checks/kueue/config_models/__init__.py @@ -0,0 +1,24 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. +# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + +from .instance import InstanceConfig +from .shared import SharedConfig + + +class ConfigMixin: + _config_model_instance: InstanceConfig + _config_model_shared: SharedConfig + + @property + def config(self) -> InstanceConfig: + return self._config_model_instance + + @property + def shared_config(self) -> SharedConfig: + return self._config_model_shared diff --git a/kueue/datadog_checks/kueue/config_models/defaults.py b/kueue/datadog_checks/kueue/config_models/defaults.py new file mode 100644 index 0000000000000..1573c3523e55c --- /dev/null +++ b/kueue/datadog_checks/kueue/config_models/defaults.py @@ -0,0 +1,140 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. 
+# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + + +def shared_skip_proxy(): + return False + + +def shared_timeout(): + return 10 + + +def instance_allow_redirects(): + return True + + +def instance_auth_type(): + return 'basic' + + +def instance_cache_metric_wildcards(): + return True + + +def instance_cache_shared_labels(): + return True + + +def instance_cluster_check(): + return True + + +def instance_collect_counters_with_distributions(): + return False + + +def instance_collect_histogram_buckets(): + return True + + +def instance_disable_generic_tags(): + return False + + +def instance_empty_default_hostname(): + return False + + +def instance_enable_health_service_check(): + return True + + +def instance_enable_legacy_tags_normalization(): + return True + + +def instance_histogram_buckets_as_distributions(): + return False + + +def instance_ignore_connection_errors(): + return False + + +def instance_kerberos_auth(): + return 'disabled' + + +def instance_kerberos_delegate(): + return False + + +def instance_kerberos_force_initiate(): + return False + + +def instance_log_requests(): + return False + + +def instance_min_collection_interval(): + return 15 + + +def instance_non_cumulative_histogram_buckets(): + return False + + +def instance_persist_connections(): + return False + + +def instance_request_size(): + return 16 + + +def instance_skip_proxy(): + return False + + +def instance_tag_by_endpoint(): + return True + + +def instance_telemetry(): + return False + + +def instance_timeout(): + return 10 + + +def instance_tls_ignore_warning(): + return False + + +def instance_tls_use_host_header(): + return False + + +def instance_tls_verify(): + return True + + +def instance_use_latest_spec(): + return False + + +def instance_use_legacy_auth_encoding(): + return True + + +def instance_use_process_start_time(): + return False diff --git 
a/kueue/datadog_checks/kueue/config_models/instance.py b/kueue/datadog_checks/kueue/config_models/instance.py new file mode 100644 index 0000000000000..353c0461859ed --- /dev/null +++ b/kueue/datadog_checks/kueue/config_models/instance.py @@ -0,0 +1,186 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. +# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + +from __future__ import annotations + +from types import MappingProxyType +from typing import Any, Optional, Union + +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from typing_extensions import Literal + +from datadog_checks.base.utils.functions import identity +from datadog_checks.base.utils.models import validation + +from . import defaults, validators + + +SECURE_FIELD_NAMES = frozenset( + ['auth_token', 'kerberos_cache', 'kerberos_keytab', 'tls_ca_cert', 'tls_cert', 'tls_private_key'] +) + + +class AuthToken(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + reader: Optional[MappingProxyType[str, Any]] = None + writer: Optional[MappingProxyType[str, Any]] = None + + +class ExtraMetrics(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + extra='allow', + frozen=True, + ) + name: Optional[str] = None + type: Optional[str] = None + + +class MetricPatterns(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + exclude: Optional[tuple[str, ...]] = None + include: Optional[tuple[str, ...]] = None + + +class Metrics(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + extra='allow', + frozen=True, + ) + name: Optional[str] = None + type: Optional[str] = None + + +class Proxy(BaseModel): + model_config = ConfigDict( + 
arbitrary_types_allowed=True, + frozen=True, + ) + http: Optional[str] = None + https: Optional[str] = None + no_proxy: Optional[tuple[str, ...]] = None + + +class ShareLabels(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + labels: Optional[tuple[str, ...]] = None + match: Optional[tuple[str, ...]] = None + + +class InstanceConfig(BaseModel): + model_config = ConfigDict( + validate_default=True, + arbitrary_types_allowed=True, + frozen=True, + ) + allow_redirects: Optional[bool] = None + auth_token: Optional[AuthToken] = None + auth_type: Optional[str] = None + aws_host: Optional[str] = None + aws_region: Optional[str] = None + aws_service: Optional[str] = None + cache_metric_wildcards: Optional[bool] = None + cache_shared_labels: Optional[bool] = None + cluster_check: Optional[bool] = None + collect_counters_with_distributions: Optional[bool] = None + collect_histogram_buckets: Optional[bool] = None + connect_timeout: Optional[float] = None + disable_generic_tags: Optional[bool] = None + empty_default_hostname: Optional[bool] = None + enable_health_service_check: Optional[bool] = None + enable_legacy_tags_normalization: Optional[bool] = None + exclude_labels: Optional[tuple[str, ...]] = None + exclude_metrics: Optional[tuple[str, ...]] = None + exclude_metrics_by_labels: Optional[MappingProxyType[str, Union[bool, tuple[str, ...]]]] = None + extra_headers: Optional[MappingProxyType[str, Any]] = None + extra_metrics: Optional[tuple[Union[str, MappingProxyType[str, Union[str, ExtraMetrics]]], ...]] = None + headers: Optional[MappingProxyType[str, Any]] = None + histogram_buckets_as_distributions: Optional[bool] = None + hostname_format: Optional[str] = None + hostname_label: Optional[str] = None + ignore_connection_errors: Optional[bool] = None + ignore_tags: Optional[tuple[str, ...]] = None + include_labels: Optional[tuple[str, ...]] = None + kerberos_auth: Optional[Literal['required', 'optional', 'disabled']] = None + 
kerberos_cache: Optional[str] = None + kerberos_delegate: Optional[bool] = None + kerberos_force_initiate: Optional[bool] = None + kerberos_hostname: Optional[str] = None + kerberos_keytab: Optional[str] = None + kerberos_principal: Optional[str] = None + log_requests: Optional[bool] = None + metric_patterns: Optional[MetricPatterns] = None + metrics: Optional[tuple[Union[str, MappingProxyType[str, Union[str, Metrics]]], ...]] = None + min_collection_interval: Optional[float] = None + namespace: Optional[str] = Field(None, pattern='\\w*') + non_cumulative_histogram_buckets: Optional[bool] = None + ntlm_domain: Optional[str] = None + openmetrics_endpoint: str + password: Optional[str] = None + persist_connections: Optional[bool] = None + proxy: Optional[Proxy] = None + raw_line_filters: Optional[tuple[str, ...]] = None + raw_metric_prefix: Optional[str] = None + read_timeout: Optional[float] = None + rename_labels: Optional[MappingProxyType[str, Any]] = None + request_size: Optional[float] = None + resource_name_map: Optional[MappingProxyType[str, Any]] = None + service: Optional[str] = None + share_labels: Optional[MappingProxyType[str, Union[bool, ShareLabels]]] = None + skip_proxy: Optional[bool] = None + tag_by_endpoint: Optional[bool] = None + tags: Optional[tuple[str, ...]] = None + telemetry: Optional[bool] = None + timeout: Optional[float] = None + tls_ca_cert: Optional[str] = None + tls_cert: Optional[str] = None + tls_ciphers: Optional[tuple[str, ...]] = None + tls_ignore_warning: Optional[bool] = None + tls_private_key: Optional[str] = None + tls_protocols_allowed: Optional[tuple[str, ...]] = None + tls_use_host_header: Optional[bool] = None + tls_verify: Optional[bool] = None + use_latest_spec: Optional[bool] = None + use_legacy_auth_encoding: Optional[bool] = None + use_process_start_time: Optional[bool] = None + username: Optional[str] = None + + @model_validator(mode='before') + def _initial_validation(cls, values): + return 
validation.core.initialize_config(getattr(validators, 'initialize_instance', identity)(values)) + + @field_validator('*', mode='before') + def _validate(cls, value, info): + field = cls.model_fields[info.field_name] + field_name = field.alias or info.field_name + if field_name in info.context['configured_fields']: + value = getattr(validators, f'instance_{info.field_name}', identity)(value, field=field) + + if info.field_name in SECURE_FIELD_NAMES: + validation.security.check_field_trusted_provider( + info.field_name, value, info.context.get('security_config') + ) + else: + value = getattr(defaults, f'instance_{info.field_name}', lambda: value)() + + return validation.utils.make_immutable(value) + + @model_validator(mode='after') + def _final_validation(cls, model): + return validation.core.check_model(getattr(validators, 'check_instance', identity)(model)) diff --git a/kueue/datadog_checks/kueue/config_models/shared.py b/kueue/datadog_checks/kueue/config_models/shared.py new file mode 100644 index 0000000000000..b349fd006d272 --- /dev/null +++ b/kueue/datadog_checks/kueue/config_models/shared.py @@ -0,0 +1,60 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. +# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + +from __future__ import annotations + +from typing import Optional + +from pydantic import BaseModel, ConfigDict, field_validator, model_validator + +from datadog_checks.base.utils.functions import identity +from datadog_checks.base.utils.models import validation + +from . 
import defaults, validators + + +class Proxy(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + http: Optional[str] = None + https: Optional[str] = None + no_proxy: Optional[tuple[str, ...]] = None + + +class SharedConfig(BaseModel): + model_config = ConfigDict( + validate_default=True, + arbitrary_types_allowed=True, + frozen=True, + ) + proxy: Optional[Proxy] = None + service: Optional[str] = None + skip_proxy: Optional[bool] = None + timeout: Optional[float] = None + + @model_validator(mode='before') + def _initial_validation(cls, values): + return validation.core.initialize_config(getattr(validators, 'initialize_shared', identity)(values)) + + @field_validator('*', mode='before') + def _validate(cls, value, info): + field = cls.model_fields[info.field_name] + field_name = field.alias or info.field_name + if field_name in info.context['configured_fields']: + value = getattr(validators, f'shared_{info.field_name}', identity)(value, field=field) + else: + value = getattr(defaults, f'shared_{info.field_name}', lambda: value)() + + return validation.utils.make_immutable(value) + + @model_validator(mode='after') + def _final_validation(cls, model): + return validation.core.check_model(getattr(validators, 'check_shared', identity)(model)) diff --git a/kueue/datadog_checks/kueue/config_models/validators.py b/kueue/datadog_checks/kueue/config_models/validators.py new file mode 100644 index 0000000000000..5e48f02a73da4 --- /dev/null +++ b/kueue/datadog_checks/kueue/config_models/validators.py @@ -0,0 +1,13 @@ +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# Here you can include additional config validators or transformers +# +# def initialize_instance(values, **kwargs): +# if 'my_option' not in values and 'my_legacy_option' in values: +# values['my_option'] = values['my_legacy_option'] +# if values.get('my_number') > 10: +# raise ValueError('my_number max value is 10, got %s' % str(values.get('my_number'))) +# +# return values diff --git a/kueue/datadog_checks/kueue/data/conf.yaml.example b/kueue/datadog_checks/kueue/data/conf.yaml.example new file mode 100644 index 0000000000000..87f3c3a3a4576 --- /dev/null +++ b/kueue/datadog_checks/kueue/data/conf.yaml.example @@ -0,0 +1,656 @@ +## All options defined here are available to all instances. +# +init_config: + + ## @param proxy - mapping - optional + ## Set HTTP or HTTPS proxies for all instances. Use the `no_proxy` list + ## to specify hosts that must bypass proxies. + ## + ## The SOCKS protocol is also supported like so: + ## + ## socks5://user:pass@host:port + ## + ## Using the scheme `socks5` causes the DNS resolution to happen on the + ## client, rather than on the proxy server. This is in line with `curl`, + ## which uses the scheme to decide whether to do the DNS resolution on + ## the client or proxy. If you want to resolve the domains on the proxy + ## server, use `socks5h` as the scheme. + # + # proxy: + # http: http://<PROXY_SERVER_FOR_HTTP>:<PORT> + # https: https://<PROXY_SERVER_FOR_HTTPS>:<PORT> + # no_proxy: + # - <HOSTNAME_1> + # - <HOSTNAME_2> + + ## @param skip_proxy - boolean - optional - default: false + ## If set to `true`, this makes the check bypass any proxy + ## settings enabled and attempt to reach services directly. + # + # skip_proxy: false + + ## @param timeout - number - optional - default: 10 + ## The timeout for connecting to services. + # + # timeout: 10 + + ## @param service - string - optional + ## Attach the tag `service:<SERVICE>` to every metric, event, and service check emitted by this integration. 
+ ## + ## Additionally, this sets the default `service` for every log source. + # + # service: <SERVICE> + +## Every instance is scheduled independently of the others. +# +instances: + + ## @param openmetrics_endpoint - string - required + ## Endpoint exposing Kueue's Prometheus metrics. + # + - openmetrics_endpoint: http://localhost:8080/metrics + + ## @param raw_metric_prefix - string - optional + ## A prefix that is removed from all exposed metric names, if present. + ## All configuration options will use the prefix-less name. + # + # raw_metric_prefix: <PREFIX>_ + + ## @param extra_metrics - (list of string or mapping) - optional + ## This list defines metrics to collect from the `openmetrics_endpoint`, in addition to + ## what the check collects by default. If the check already collects a metric, then + ## metric definitions here take precedence. Metrics may be defined in 3 ways: + ## + ## 1. If the item is a string, then it represents the exposed metric name, and + ## the sent metric name will be identical. For example: + ## ``` + ## extra_metrics: + ## - <METRIC_1> + ## - <METRIC_2> + ## ``` + ## 2. If the item is a mapping, then the keys represent the exposed metric names. + ## + ## 1. If a value is a string, then it represents the sent metric name. For example: + ## ``` + ## extra_metrics: + ## - <EXPOSED_METRIC_1>: <SENT_METRIC_1> + ## - <EXPOSED_METRIC_2>: <SENT_METRIC_2> + ## ``` + ## 2. If a value is a mapping, then it must have a `name` and/or `type` key. + ## The `name` represents the sent metric name, and the `type` represents how + ## the metric should be handled, overriding any type information the endpoint + ## may provide. For example: + ## ``` + ## extra_metrics: + ## - <EXPOSED_METRIC_1>: + ## name: <SENT_METRIC_1> + ## type: <METRIC_TYPE_1> + ## - <EXPOSED_METRIC_2>: + ## name: <SENT_METRIC_2> + ## type: <METRIC_TYPE_2> + ## ``` + ## The supported native types are `gauge`, `counter`, `histogram`, and `summary`. + ## + ## Note: To collect counter metrics with names ending in `_total`, specify the metric name without the `_total` + ## suffix. 
For example, to collect the counter metric `promhttp_metric_handler_requests_total`, specify + ## `promhttp_metric_handler_requests`. This submits to Datadog the metric name appended with `.count`. + ## For more information, see: + ## https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#suffixes + ## + ## Regular expressions may be used to match the exposed metric names, for example: + ## ``` + ## extra_metrics: + ## - ^network_(ingress|egress)_.+ + ## - .+: + ## type: gauge + ## ``` + # + # extra_metrics: + # - <METRIC_NAME> + # - <EXPOSED_METRIC_NAME>: <SENT_METRIC_NAME> + # - <EXPOSED_METRIC_NAME>: + # name: <SENT_METRIC_NAME> + # type: <METRIC_TYPE> + + ## @param exclude_metrics - list of strings - optional + ## A list of metrics to exclude, with each entry being either + ## the exact metric name or a regular expression. + ## + ## In order to exclude all metrics but the ones matching a specific filter, + ## you can use a negative lookahead regex like: + ## - ^(?!foo).*$ + # + # exclude_metrics: [] + + ## @param exclude_metrics_by_labels - mapping - optional + ## A mapping of labels to exclude metrics with matching label name and their corresponding metric values. To match + ## all values of a label, set it to `true`. + ## + ## Note: Label filtering happens before `rename_labels`. + ## + ## For example, the following configuration instructs the check to exclude all metrics with + ## a label `worker` or a label `pid` with the value of either `23` or `42`. + ## + ## exclude_metrics_by_labels: + ## worker: true + ## pid: + ## - '23' + ## - '42' + # + # exclude_metrics_by_labels: {} + + ## @param exclude_labels - list of strings - optional + ## A list of labels to exclude, useful for high cardinality values like timestamps or UUIDs. + ## May be used in conjunction with `include_labels`. + ## Labels defined in `exclude_labels` will take precedence in case of overlap. + ## + ## Note: Label filtering happens before `rename_labels`. 
+ # + # exclude_labels: [] + + ## @param include_labels - list of strings - optional + ## A list of labels to include. May be used in conjunction with `exclude_labels`. + ## Labels defined in `exclude_labels` will take precedence in case of overlap. + ## + ## Note: Label filtering happens before `rename_labels`. + # + # include_labels: [] + + ## @param rename_labels - mapping - optional + ## A mapping of label names to their new names. + # + # rename_labels: + # <LABEL_NAME_1>: <NEW_LABEL_NAME_1> + # <LABEL_NAME_2>: <NEW_LABEL_NAME_2> + + ## @param enable_health_service_check - boolean - optional - default: true + ## Whether or not to send a service check named `<NAMESPACE>.openmetrics.health` which reports + ## the health of the `openmetrics_endpoint`. + # + # enable_health_service_check: true + + ## @param ignore_connection_errors - boolean - optional - default: false + ## Whether or not to ignore connection errors when scraping `openmetrics_endpoint`. + # + # ignore_connection_errors: false + + ## @param hostname_label - string - optional + ## Override the hostname for every metric submission with the value of one of its labels. + # + # hostname_label: <HOSTNAME_LABEL> + + ## @param hostname_format - string - optional + ## When `hostname_label` is set, this instructs the check how to format the values. The string + ## `<HOSTNAME>` is replaced by the value of the label defined by `hostname_label`. + # + # hostname_format: <HOSTNAME> + + ## @param collect_histogram_buckets - boolean - optional - default: true + ## Whether or not to send histogram buckets. + # + # collect_histogram_buckets: true + + ## @param non_cumulative_histogram_buckets - boolean - optional - default: false + ## Whether or not histogram buckets are non-cumulative and to come with a `lower_bound` tag. + # + # non_cumulative_histogram_buckets: false + + ## @param histogram_buckets_as_distributions - boolean - optional - default: false + ## Whether or not to send histogram buckets as Datadog distribution metrics. 
This implicitly + ## enables the `collect_histogram_buckets` and `non_cumulative_histogram_buckets` options. + ## + ## Learn more about distribution metrics: + ## https://docs.datadoghq.com/developers/metrics/types/?tab=distribution#metric-types + # + # histogram_buckets_as_distributions: false + + ## @param collect_counters_with_distributions - boolean - optional - default: false + ## Whether or not to also collect the observation counter metrics ending in `.sum` and `.count` + ## when sending histogram buckets as Datadog distribution metrics. This implicitly enables the + ## `histogram_buckets_as_distributions` option. + # + # collect_counters_with_distributions: false + + ## @param use_process_start_time - boolean - optional - default: false + ## Whether to enable a heuristic for reporting counter values on the first scrape. When true, + ## the first time an endpoint is scraped, check `process_start_time_seconds` to decide whether zero + ## initial value can be assumed for counters. This requires keeping metrics in memory until the entire + ## response is received. + # + # use_process_start_time: false + + ## @param share_labels - mapping - optional + ## This mapping allows for the sharing of labels across multiple metrics. The keys represent the + ## exposed metrics from which to share labels, and the values are mappings that configure the + ## sharing behavior. Each mapping must have at least one of the following keys: + ## + ## - labels - This is a list of labels to share. All labels are shared if this is not set. + ## - match - This is a list of labels to match on other metrics as a condition for sharing. + ## - values - This is a list of allowed values as a condition for sharing. + ## + ## To unconditionally share all labels of a metric, set it to `true`. 
+ ## + ## For example, the following configuration instructs the check to apply all labels from `metric_a` + ## to all other metrics, the `node` label from `metric_b` to only those metrics that have a `pod` + ## label value that matches the `pod` label value of `metric_b`, and all labels from `metric_c` + ## to all other metrics if their value is equal to `23` or `42`. + # + # share_labels: + # metric_a: true + # metric_b: + # labels: + # - node + # match: + # - pod + # metric_c: + # values: + # - 23 + # - 42 + + ## @param cache_shared_labels - boolean - optional - default: true + ## When `share_labels` is set, it instructs the check to cache labels collected from the first payload + ## for improved performance. + ## + ## Set this to `false` to compute label sharing for every payload at the risk of potentially increased memory usage. + # + # cache_shared_labels: true + + ## @param raw_line_filters - list of strings - optional + ## A list of regular expressions used to exclude lines read from the `openmetrics_endpoint` + ## from being parsed. + # + # raw_line_filters: [] + + ## @param cache_metric_wildcards - boolean - optional - default: true + ## Whether or not to cache data from metrics that are defined by regular expressions rather + ## than the full metric name. + # + # cache_metric_wildcards: true + + ## @param telemetry - boolean - optional - default: false + ## Whether or not to submit metrics prefixed by `<NAMESPACE>.telemetry.` for debugging purposes. + # + # telemetry: false + + ## @param ignore_tags - list of strings - optional + ## A list of regular expressions used to ignore tags added by Autodiscovery and entries in the `tags` option. + # + # ignore_tags: + # - <FULL:TAG> + # - <TAG_PREFIX:.*> + # - <TAG_SUFFIX$> + + ## @param proxy - mapping - optional + ## This overrides the `proxy` setting in `init_config`. + ## + ## Set HTTP or HTTPS proxies for this instance. Use the `no_proxy` list + ## to specify hosts that must bypass proxies. 
+ ## + ## The SOCKS protocol is also supported, for example: + ## + ## socks5://user:pass@host:port + ## + ## Using the scheme `socks5` causes the DNS resolution to happen on the + ## client, rather than on the proxy server. This is in line with `curl`, + ## which uses the scheme to decide whether to do the DNS resolution on + ## the client or proxy. If you want to resolve the domains on the proxy + ## server, use `socks5h` as the scheme. + # + # proxy: + # http: http://<PROXY_SERVER_FOR_HTTP>:<PORT> + # https: https://<PROXY_SERVER_FOR_HTTPS>:<PORT> + # no_proxy: + # - <HOSTNAME_1> + # - <HOSTNAME_2> + + ## @param skip_proxy - boolean - optional - default: false + ## This overrides the `skip_proxy` setting in `init_config`. + ## + ## If set to `true`, this makes the check bypass any proxy + ## settings enabled and attempt to reach services directly. + # + # skip_proxy: false + + ## @param auth_type - string - optional - default: basic + ## The type of authentication to use. The available types (and related options) are: + ## ``` + ## - basic + ## |__ username + ## |__ password + ## |__ use_legacy_auth_encoding + ## - digest + ## |__ username + ## |__ password + ## - ntlm + ## |__ ntlm_domain + ## |__ password + ## - kerberos + ## |__ kerberos_auth + ## |__ kerberos_cache + ## |__ kerberos_delegate + ## |__ kerberos_force_initiate + ## |__ kerberos_hostname + ## |__ kerberos_keytab + ## |__ kerberos_principal + ## - aws + ## |__ aws_region + ## |__ aws_host + ## |__ aws_service + ## ``` + ## The `aws` auth type relies on boto3 to automatically gather AWS credentials, for example: from `.aws/credentials`. + ## Details: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#configuring-credentials + # + # auth_type: basic + + ## @param use_legacy_auth_encoding - boolean - optional - default: true + ## When `auth_type` is set to `basic`, this determines whether to encode as `latin1` rather than `utf-8`. 
+ # + # use_legacy_auth_encoding: true + + ## @param username - string - optional + ## The username to use if services are behind basic or digest auth. + # + # username: <USERNAME> + + ## @param password - string - optional + ## The password to use if services are behind basic or NTLM auth. + # + # password: <PASSWORD> + + ## @param ntlm_domain - string - optional + ## If your services use NTLM authentication, specify + ## the domain used in the check. For NTLM Auth, append + ## the username to domain, not as the `username` parameter. + # + # ntlm_domain: <NTLM_DOMAIN>\<USERNAME> + + ## @param kerberos_auth - string - optional - default: disabled + ## If your services use Kerberos authentication, you can specify the Kerberos + ## strategy to use between: + ## + ## - required + ## - optional + ## - disabled + ## + ## See https://github.com/requests/requests-kerberos#mutual-authentication + # + # kerberos_auth: disabled + + ## @param kerberos_cache - string - optional + ## Sets the KRB5CCNAME environment variable. + ## It should point to a credential cache with a valid TGT. + # + # kerberos_cache: <KERBEROS_CACHE> + + ## @param kerberos_delegate - boolean - optional - default: false + ## Set to `true` to enable Kerberos delegation of credentials to a server that requests delegation. + ## + ## See https://github.com/requests/requests-kerberos#delegation + # + # kerberos_delegate: false + + ## @param kerberos_force_initiate - boolean - optional - default: false + ## Set to `true` to preemptively initiate the Kerberos GSS exchange and + ## present a Kerberos ticket on the initial request (and all subsequent). + ## + ## See https://github.com/requests/requests-kerberos#preemptive-authentication + # + # kerberos_force_initiate: false + + ## @param kerberos_hostname - string - optional + ## Override the hostname used for the Kerberos GSS exchange if its DNS name doesn't + ## match its Kerberos hostname, for example: behind a content switch or load balancer. 
+ ## + ## See https://github.com/requests/requests-kerberos#hostname-override + # + # kerberos_hostname: <KERBEROS_HOSTNAME> + + ## @param kerberos_principal - string - optional + ## Set an explicit principal, to force Kerberos to look for a + ## matching credential cache for the named user. + ## + ## See https://github.com/requests/requests-kerberos#explicit-principal + # + # kerberos_principal: <KERBEROS_PRINCIPAL> + + ## @param kerberos_keytab - string - optional + ## Set the path to your Kerberos key tab file. + # + # kerberos_keytab: <KEYTAB_FILE_PATH> + + ## @param auth_token - mapping - optional + ## This allows for the use of authentication information from dynamic sources. + ## Both a reader and writer must be configured. + ## + ## The available readers are: + ## + ## - type: file + ## path (required): The absolute path for the file to read from. + ## pattern: A regular expression pattern with a single capture group used to find the + ## token rather than using the entire file, for example: Your secret is (.+) + ## - type: oauth + ## url (required): The token endpoint. + ## client_id (required): The client identifier. + ## client_secret (required): The client secret. + ## basic_auth: Whether the provider expects credentials to be transmitted in + ## an HTTP Basic Auth header. The default is: false + ## options: Mapping of additional options to pass to the provider, such as the audience + ## or the scope. For example: + ## options: + ## audience: https://example.com + ## scope: read:example + ## + ## The available writers are: + ## + ## - type: header + ## name (required): The name of the field, for example: Authorization + ## value: The template value, for example `Bearer <TOKEN>`. The default is: <TOKEN> + ## placeholder: The substring in `value` to replace with the token, defaults to: <TOKEN> + # + # auth_token: + # reader: + # type: <READER_TYPE> + # <OPTION_1>: <VALUE_1> + # <OPTION_2>: <VALUE_2> + # writer: + # type: <WRITER_TYPE> + # <OPTION_1>: <VALUE_1> + # <OPTION_2>: <VALUE_2> + + ## @param aws_region - string - optional + ## If your services require AWS Signature Version 4 signing, set the region. 
+ ## + ## See https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html + # + # aws_region: <AWS_REGION> + + ## @param aws_host - string - optional + ## If your services require AWS Signature Version 4 signing, set the host. + ## This only needs the hostname and does not require the protocol (HTTP, HTTPS, and more). + ## For example, if connecting to https://us-east-1.amazonaws.com/, set `aws_host` to `us-east-1.amazonaws.com`. + ## + ## Note: This setting is not necessary for official integrations. + ## + ## See https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html + # + # aws_host: <AWS_HOST> + + ## @param aws_service - string - optional + ## If your services require AWS Signature Version 4 signing, set the service code. For a list + ## of available service codes, see https://docs.aws.amazon.com/general/latest/gr/rande.html + ## + ## Note: This setting is not necessary for official integrations. + ## + ## See https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html + # + # aws_service: <AWS_SERVICE> + + ## @param tls_verify - boolean - optional - default: true + ## Instructs the check to validate the TLS certificate of services. + # + # tls_verify: true + + ## @param tls_use_host_header - boolean - optional - default: false + ## If a `Host` header is set, this enables its use for SNI (matching against the TLS certificate CN or SAN). + # + # tls_use_host_header: false + + ## @param tls_ignore_warning - boolean - optional - default: false + ## If `tls_verify` is disabled, security warnings are logged by the check. + ## Disable those by setting `tls_ignore_warning` to true. + # + # tls_ignore_warning: false + + ## @param tls_cert - string - optional + ## The path to a single file in PEM format containing a certificate as well as any + ## number of CA certificates needed to establish the certificate's authenticity for + ## use when connecting to services. It may also contain an unencrypted private key to use. 
+ # + # tls_cert: <CERT_PATH> + + ## @param tls_private_key - string - optional + ## The unencrypted private key to use for `tls_cert` when connecting to services. This is + ## required if `tls_cert` is set and it does not already contain a private key. + # + # tls_private_key: <PRIVATE_KEY_PATH> + + ## @param tls_ca_cert - string - optional + ## The path to a file of concatenated CA certificates in PEM format or a directory + ## containing several CA certificates in PEM format. If a directory, the directory + ## must have been processed using the `openssl rehash` command. See: + ## https://www.openssl.org/docs/man3.2/man1/c_rehash.html + # + # tls_ca_cert: <CA_CERT_PATH> + + ## @param tls_protocols_allowed - list of strings - optional + ## The expected versions of TLS/SSL when fetching intermediate certificates. + ## Only `SSLv3`, `TLSv1.2`, `TLSv1.3` are allowed by default. The possible values are: + ## SSLv3 + ## TLSv1 + ## TLSv1.1 + ## TLSv1.2 + ## TLSv1.3 + # + # tls_protocols_allowed: + # - SSLv3 + # - TLSv1.2 + # - TLSv1.3 + + ## @param tls_ciphers - list of strings - optional + ## The list of cipher suites to use when connecting to an endpoint. If not specified, + ## `ALL` ciphers are used. For a list of ciphers, see: + ## https://www.openssl.org/docs/man1.0.2/man1/ciphers.html + # + # tls_ciphers: + # - TLS_AES_256_GCM_SHA384 + # - TLS_CHACHA20_POLY1305_SHA256 + # - TLS_AES_128_GCM_SHA256 + + ## @param headers - mapping - optional + ## The headers parameter allows you to send specific headers with every request. + ## You can use it for explicitly specifying the host header or adding headers for + ## authorization purposes. + ## + ## This overrides any default headers. + # + # headers: + # Host: <ALTERNATIVE_HOSTNAME> + # X-Auth-Token: <AUTH_TOKEN> + + ## @param extra_headers - mapping - optional + ## Additional headers to send with every request. + # + # extra_headers: + # Host: <ALTERNATIVE_HOSTNAME> + # X-Auth-Token: <AUTH_TOKEN> + + ## @param timeout - number - optional - default: 10 + ## The timeout for accessing services. 
+ ## + ## This overrides the `timeout` setting in `init_config`. + # + # timeout: 10 + + ## @param connect_timeout - number - optional + ## The connect timeout for accessing services. Defaults to `timeout`. + # + # connect_timeout: <CONNECT_TIMEOUT> + + ## @param read_timeout - number - optional + ## The read timeout for accessing services. Defaults to `timeout`. + # + # read_timeout: <READ_TIMEOUT> + + ## @param request_size - number - optional - default: 16 + ## The number of kibibytes (KiB) to read from streaming HTTP responses at a time. + # + # request_size: 16 + + ## @param log_requests - boolean - optional - default: false + ## Whether or not to debug log the HTTP(S) requests made, including the method and URL. + # + # log_requests: false + + ## @param persist_connections - boolean - optional - default: false + ## Whether or not to persist cookies and use connection pooling for improved performance. + # + # persist_connections: false + + ## @param allow_redirects - boolean - optional - default: true + ## Whether or not to allow URL redirection. + # + # allow_redirects: true + + ## @param tags - list of strings - optional + ## A list of tags to attach to every metric and service check emitted by this instance. + ## + ## Learn more about tagging at https://docs.datadoghq.com/tagging + # + # tags: + # - <KEY_1>:<VALUE_1> + # - <KEY_2>:<VALUE_2> + + ## @param service - string - optional + ## Attach the tag `service:<SERVICE>` to every metric, event, and service check emitted by this integration. + ## + ## Overrides any `service` defined in the `init_config` section. + # + # service: <SERVICE> + + ## @param min_collection_interval - number - optional - default: 15 + ## This changes the collection interval of the check. For more information, see: + ## https://docs.datadoghq.com/developers/write_agent_check/#collection-interval + # + # min_collection_interval: 15 + + ## @param empty_default_hostname - boolean - optional - default: false + ## This forces the check to send metrics with no hostname. + ## + ## This is useful for cluster-level checks. 
+ # + # empty_default_hostname: false + + ## @param metric_patterns - mapping - optional + ## A mapping of metrics to include or exclude, with each entry being a regular expression. + ## + ## Metrics defined in `exclude` will take precedence in case of overlap. + # + # metric_patterns: + # include: + # - <INCLUDE_REGEX> + # exclude: + # - <EXCLUDE_REGEX> + + ## @param cluster_check - boolean - optional - default: true + ## Set to true when configuring this integration as a Cluster Agent cluster check. + # + # cluster_check: true + + ## @param resource_name_map - mapping - optional + ## Mapping of Kueue resource label values to metric name suffixes for resource metrics. + ## + ## By default, `cpu` is reported as `cpu`, `memory` is reported as `memory`, `nvidia.com/gpu` is reported as + ## `gpu`, and unmapped resources are reported as `other`. Built-in resource names cannot be overridden. + # + # resource_name_map: + # example.com/fpga: fpga diff --git a/kueue/datadog_checks/kueue/metrics.py b/kueue/datadog_checks/kueue/metrics.py new file mode 100644 index 0000000000000..8059f6e6d76ee --- /dev/null +++ b/kueue/datadog_checks/kueue/metrics.py @@ -0,0 +1,135 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# Some metric mappings are too long. 
This turns off the 120 line limit for this file: +# ruff: noqa: E501 + +RESOURCE_METRIC_MAP = { + 'kueue_cluster_queue_borrowing_limit': 'cluster_queue.borrowing_limit', + 'kueue_cluster_queue_lending_limit': 'cluster_queue.lending_limit', + 'kueue_cluster_queue_nominal_quota': 'cluster_queue.nominal_quota', + 'kueue_cluster_queue_resource_pending': 'cluster_queue.resource_pending', + 'kueue_cluster_queue_resource_reservation': 'cluster_queue.resource_reservation', + 'kueue_cluster_queue_resource_usage': 'cluster_queue.resource_usage', + 'kueue_cohort_subtree_quota': 'cohort_subtree.quota', + 'kueue_cohort_subtree_resource_reservations': 'cohort_subtree.resource_reservations', + 'kueue_local_queue_resource_reservation': 'local_queue.resource_reservation', + 'kueue_local_queue_resource_usage': 'local_queue.resource_usage', +} + +LOCAL_QUEUE_METRIC_MAP = { + 'kueue_local_queue_admission_checks_wait_time_seconds': 'local_queue.admission_checks.wait_time.seconds', + 'kueue_local_queue_admission_wait_time_seconds': 'local_queue.admission.wait_time.seconds', + 'kueue_local_queue_admitted_until_ready_wait_time_seconds': 'local_queue.admitted_until_ready.wait_time.seconds', + 'kueue_local_queue_admitted_active_workloads': 'local_queue.admitted.active_workloads', + 'kueue_local_queue_admitted_workloads': 'local_queue.admitted.workloads', + 'kueue_local_queue_evicted_workloads': 'local_queue.evicted_workloads', + 'kueue_local_queue_finished_workloads': 'local_queue.finished_workloads', + 'kueue_local_queue_finished_workloads_total': 'local_queue.finished_workloads', + 'kueue_local_queue_pending_workloads': 'local_queue.pending_workloads', + 'kueue_local_queue_quota_reserved_wait_time_seconds': 'local_queue.quota_reserved.wait_time.seconds', + 'kueue_local_queue_quota_reserved_workloads': 'local_queue.quota_reserved.workloads', + 'kueue_local_queue_ready_wait_time_seconds': 'local_queue.ready_wait_time.seconds', + 'kueue_local_queue_reserving_active_workloads': 
'local_queue.reserving.active_workloads', + 'kueue_local_queue_status': 'local_queue.status', +} + +METRIC_MAP = { + 'certwatcher_read_certificate': 'certwatcher.read_certificate', + 'certwatcher_read_certificate_errors': 'certwatcher.read_certificate_errors', + 'controller_runtime_active_workers': 'controller.runtime.active_workers', + 'controller_runtime_conversion_webhook_panics': 'controller.runtime.conversion_webhook_panics', + 'controller_runtime_max_concurrent_reconciles': 'controller.runtime.max_concurrent_reconciles', + 'controller_runtime_reconcile': 'controller.runtime.reconcile', + 'controller_runtime_reconcile_errors': 'controller.runtime.reconcile_errors', + 'controller_runtime_reconcile_panics': 'controller.runtime.reconcile_panics', + 'controller_runtime_reconcile_time_seconds': 'controller.runtime.reconcile_time.seconds', + 'controller_runtime_reconcile_timeouts': 'controller.runtime.reconcile_timeouts', + 'controller_runtime_terminal_reconcile_errors': 'controller.runtime.terminal_reconcile_errors', + 'controller_runtime_webhook_latency_seconds': 'controller.runtime.webhook_latency.seconds', + 'controller_runtime_webhook_panics': 'controller.runtime.webhook_panics', + 'controller_runtime_webhook_requests': 'controller.runtime.webhook_requests', + 'controller_runtime_webhook_requests_in_flight': 'controller.runtime.webhook_requests_in_flight', + 'go_gc_duration_seconds': 'go.gc.duration.seconds', + 'go_gc_gogc_percent': 'go.gc.gogc.percent', + 'go_gc_gomemlimit_bytes': 'go.gc.gomemlimit.bytes', + 'go_goroutines': 'go.goroutines', + 'go_info': 'go.info', + 'go_memstats_alloc_bytes': {'name': 'go.memstats.alloc_bytes', 'type': 'native_dynamic'}, + 'go_memstats_buck_hash_sys_bytes': 'go.memstats.buck_hash.sys_bytes', + 'go_memstats_frees': 'go.memstats.frees', + 'go_memstats_gc_sys_bytes': 'go.memstats.gc.sys_bytes', + 'go_memstats_heap_alloc_bytes': 'go.memstats.heap_alloc.bytes', + 'go_memstats_heap_idle_bytes': 'go.memstats.heap_idle.bytes', + 
'go_memstats_heap_inuse_bytes': 'go.memstats.heap_inuse.bytes', + 'go_memstats_heap_objects': 'go.memstats.heap_objects', + 'go_memstats_heap_released_bytes': 'go.memstats.heap_released.bytes', + 'go_memstats_heap_sys_bytes': 'go.memstats.heap_sys.bytes', + 'go_memstats_last_gc_time_seconds': { + 'name': 'go.memstats.time_since_last_gc.seconds', + 'type': 'time_elapsed', + }, + 'go_memstats_mallocs': 'go.memstats.mallocs', + 'go_memstats_mcache_inuse_bytes': 'go.memstats.mcache_inuse.bytes', + 'go_memstats_mcache_sys_bytes': 'go.memstats.mcache_sys.bytes', + 'go_memstats_mspan_inuse_bytes': 'go.memstats.mspan_inuse.bytes', + 'go_memstats_mspan_sys_bytes': 'go.memstats.mspan_sys.bytes', + 'go_memstats_next_gc_bytes': 'go.memstats.next_gc.bytes', + 'go_memstats_other_sys_bytes': 'go.memstats.other_sys.bytes', + 'go_memstats_stack_inuse_bytes': 'go.memstats.stack_inuse.bytes', + 'go_memstats_stack_sys_bytes': 'go.memstats.stack_sys.bytes', + 'go_memstats_sys_bytes': 'go.memstats.sys.bytes', + 'go_threads': 'go.threads', + 'kueue_admission_attempt_duration_seconds': 'admission_attempt.duration.seconds', + 'kueue_admission_attempts': 'admission.attempts', + 'kueue_admission_checks_wait_time_seconds': 'admission_checks.wait_time.seconds', + 'kueue_admission_cycle_preemption_skips': 'admission_cycle.preemption_skips', + 'kueue_admission_wait_time_seconds': 'admission.wait_time.seconds', + 'kueue_admitted_until_ready_wait_time_seconds': 'admitted_until_ready.wait_time.seconds', + 'kueue_admitted_active_workloads': 'admitted.active_workloads', + 'kueue_admitted_workloads': 'admitted.workloads', + 'kueue_build_info': 'build_info', + 'kueue_cluster_queue_info': 'cluster_queue.info', + 'kueue_cluster_queue_status': 'cluster_queue.status', + 'kueue_cluster_queue_weighted_share': 'cluster_queue.weighted_share', + 'kueue_cohort_info': 'cohort.info', + 'kueue_cohort_subtree_admitted_active_workloads': 'cohort_subtree.admitted.active_workloads', + 
'kueue_cohort_subtree_admitted_workloads': 'cohort_subtree.admitted.workloads', + 'kueue_cohort_weighted_share': 'cohort.weighted_share', + 'kueue_evicted_workloads': 'evicted_workloads', + 'kueue_evicted_workloads_once': 'evicted_workloads_once', + 'kueue_finished_workloads': 'finished_workloads', + 'kueue_finished_workloads_total': 'finished_workloads', + 'kueue_pending_workloads': 'pending_workloads', + 'kueue_pods_ready_to_evicted_time_seconds': 'pods_ready_to_evicted_time.seconds', + 'kueue_preempted_workloads': 'preempted_workloads', + 'kueue_quota_reserved_wait_time_seconds': 'quota_reserved.wait_time.seconds', + 'kueue_quota_reserved_workloads': 'quota_reserved.workloads', + 'kueue_ready_wait_time_seconds': 'ready_wait_time.seconds', + 'kueue_replaced_workload_slices': 'replaced_workload_slices', + 'kueue_resource_flavor_quota_reserved_workloads': 'resource_flavor.quota_reserved_workloads', + 'kueue_reserving_active_workloads': 'reserving.active_workloads', + 'kueue_workload_creation_latency_seconds': 'workload.creation_latency.seconds', + 'kueue_workload_eviction_latency_seconds': 'workload.eviction_latency.seconds', + 'leader_election_master_status': 'leader_election.master_status', + 'process_cpu_seconds': 'process.cpu.seconds', + 'process_max_fds': 'process.max_fds', + 'process_network_receive_bytes': 'process.network_receive.bytes', + 'process_network_transmit_bytes': 'process.network_transmit.bytes', + 'process_open_fds': 'process.open_fds', + 'process_resident_memory_bytes': 'process.resident_memory.bytes', + 'process_start_time_seconds': { + 'name': 'process.uptime.seconds', + 'type': 'time_elapsed', + }, + 'process_virtual_memory_bytes': 'process.virtual_memory.bytes', + 'process_virtual_memory_max_bytes': 'process.virtual_memory.max_bytes', + 'workqueue_adds': 'workqueue.adds', + 'workqueue_depth': 'workqueue.depth', + 'workqueue_longest_running_processor_seconds': 'workqueue.longest_running_processor.seconds', + 
'workqueue_queue_duration_seconds': 'workqueue.queue_duration.seconds', + 'workqueue_retries': 'workqueue.retries', + 'workqueue_unfinished_work_seconds': 'workqueue.unfinished_work.seconds', + 'workqueue_work_duration_seconds': 'workqueue.work_duration.seconds', +} diff --git a/kueue/hatch.toml b/kueue/hatch.toml new file mode 100644 index 0000000000000..77a0fdcc4cb59 --- /dev/null +++ b/kueue/hatch.toml @@ -0,0 +1,8 @@ +[env.collectors.datadog-checks] + +[[envs.default.matrix]] +python = ["3.13"] +version = ["v0.18.0"] + +[envs.default.overrides] +matrix.version.env-vars = "KUEUE_VERSION" diff --git a/kueue/metadata.csv b/kueue/metadata.csv new file mode 100644 index 0000000000000..3b59a104d5b12 --- /dev/null +++ b/kueue/metadata.csv @@ -0,0 +1,174 @@ +metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags +kueue.admission.attempts.count,count,0,,,"The total number of workload admission attempts. Each admission attempt might try to admit more than one workload. The 'result' label can be 'success' when at least one workload is admitted or 'inadmissible' when no workload is admitted.",0,kueue,admission_attempts_count,, +kueue.admission.wait_time.seconds.bucket,count,0,second,,"The time between workload creation or requeueing and admission, per 'kueue_cluster_queue'.",0,kueue,admission_wait_time_seconds_bucket,, +kueue.admission.wait_time.seconds.count,count,0,second,,"The time between workload creation or requeueing and admission, per 'kueue_cluster_queue'.",0,kueue,admission_wait_time_seconds_count,, +kueue.admission.wait_time.seconds.sum,count,0,second,,"The time between workload creation or requeueing and admission, per 'kueue_cluster_queue'.",0,kueue,admission_wait_time_seconds_sum,, +kueue.admission_attempt.duration.seconds.bucket,count,0,second,,"The duration of an admission attempt. 
The 'result' label can be 'success' when at least one workload is admitted or 'inadmissible' when no workload is admitted.",0,kueue,admission_attempt_duration_seconds_bucket,, +kueue.admission_attempt.duration.seconds.count,count,0,second,,"The duration of an admission attempt. The 'result' label can be 'success' when at least one workload is admitted or 'inadmissible' when no workload is admitted.",0,kueue,admission_attempt_duration_seconds_count,, +kueue.admission_attempt.duration.seconds.sum,count,0,second,,"The duration of an admission attempt. The 'result' label can be 'success' when at least one workload is admitted or 'inadmissible' when no workload is admitted.",0,kueue,admission_attempt_duration_seconds_sum,, +kueue.admission_checks.wait_time.seconds.bucket,count,0,second,,"The time between quota reservation and admission for a workload, per 'kueue_cluster_queue'.",0,kueue,admission_checks_wait_time_seconds_bucket,, +kueue.admission_checks.wait_time.seconds.count,count,0,second,,"The time between quota reservation and admission for a workload, per 'kueue_cluster_queue'.",0,kueue,admission_checks_wait_time_seconds_count,, +kueue.admission_checks.wait_time.seconds.sum,count,0,second,,"The time between quota reservation and admission for a workload, per 'kueue_cluster_queue'.",0,kueue,admission_checks_wait_time_seconds_sum,, +kueue.admission_cycle.preemption_skips,gauge,0,,,"The number of Workloads in the ClusterQueue that received preemption candidates but were skipped because other ClusterQueues needed the same resources in the same cycle.",0,kueue,admission_cycle_preemption_skips,, +kueue.admitted.active_workloads,gauge,0,,,"The number of admitted workloads that are active, per 'kueue_cluster_queue'.",0,kueue,admitted_active_workloads,, +kueue.admitted.workloads.count,count,0,,,"The total number of admitted workloads, per 'kueue_cluster_queue'.",0,kueue,admitted_workloads_count,, +kueue.admitted_until_ready.wait_time.seconds.bucket,count,0,second,,"The 
time between workload admission and the workload becoming ready, per 'kueue_cluster_queue'.",0,kueue,admitted_until_ready_wait_time_seconds_bucket,, +kueue.admitted_until_ready.wait_time.seconds.count,count,0,second,,"The time between workload admission and the workload becoming ready, per 'kueue_cluster_queue'.",0,kueue,admitted_until_ready_wait_time_seconds_count,, +kueue.admitted_until_ready.wait_time.seconds.sum,count,0,second,,"The time between workload admission and the workload becoming ready, per 'kueue_cluster_queue'.",0,kueue,admitted_until_ready_wait_time_seconds_sum,, +kueue.build_info,gauge,0,,,"Kueue build information. The value is 1 and is labeled by Git version, Git commit, build date, Go version, compiler, and platform.",0,kueue,build_info,, +kueue.certwatcher.read_certificate,gauge,0,,,Kueue certwatcher read certificate metric.,0,kueue,certwatcher_read_certificate,, +kueue.certwatcher.read_certificate_errors,gauge,0,,,Kueue certwatcher read certificate errors metric.,0,kueue,certwatcher_read_certificate_errors,, +kueue.cluster_queue.borrowing_limit.cpu,gauge,0,,,"Reports the ClusterQueue's resource borrowing limit across all flavors.",0,kueue,cluster_queue_borrowing_limit_cpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.borrowing_limit.gpu,gauge,0,,,"Reports the ClusterQueue's resource borrowing limit across all flavors.",0,kueue,cluster_queue_borrowing_limit_gpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.borrowing_limit.memory,gauge,0,,,"Reports the ClusterQueue's resource borrowing limit across all flavors.",0,kueue,cluster_queue_borrowing_limit_memory,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.borrowing_limit.other,gauge,0,,,"Reports the ClusterQueue's resource borrowing limit across all 
flavors.",0,kueue,cluster_queue_borrowing_limit_other,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.info,gauge,0,,,Reports ClusterQueue hierarchy information. The metric has value 1 and can be joined using labels.,0,kueue,cluster_queue_info,, +kueue.cluster_queue.lending_limit.cpu,gauge,0,,,"Reports the ClusterQueue's resource lending limit across all flavors.",0,kueue,cluster_queue_lending_limit_cpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.lending_limit.gpu,gauge,0,,,"Reports the ClusterQueue's resource lending limit across all flavors.",0,kueue,cluster_queue_lending_limit_gpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.lending_limit.memory,gauge,0,,,"Reports the ClusterQueue's resource lending limit across all flavors.",0,kueue,cluster_queue_lending_limit_memory,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.lending_limit.other,gauge,0,,,"Reports the ClusterQueue's resource lending limit across all flavors.",0,kueue,cluster_queue_lending_limit_other,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.nominal_quota.cpu,gauge,0,,,"Reports the ClusterQueue's nominal resource quota across all flavors.",0,kueue,cluster_queue_nominal_quota_cpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.nominal_quota.gpu,gauge,0,,,"Reports the ClusterQueue's nominal resource quota across all flavors.",0,kueue,cluster_queue_nominal_quota_gpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.nominal_quota.memory,gauge,0,,,"Reports the ClusterQueue's nominal resource quota across all 
flavors.",0,kueue,cluster_queue_nominal_quota_memory,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.nominal_quota.other,gauge,0,,,"Reports the ClusterQueue's nominal resource quota across all flavors.",0,kueue,cluster_queue_nominal_quota_other,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_pending.cpu,gauge,0,,,"Reports the ClusterQueue's total pending resource requests. Unlike resource_reservation, pending workloads have not yet been assigned to flavors.",0,kueue,cluster_queue_resource_pending_cpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_pending.gpu,gauge,0,,,"Reports the ClusterQueue's total pending resource requests. Unlike resource_reservation, pending workloads have not yet been assigned to flavors.",0,kueue,cluster_queue_resource_pending_gpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_pending.memory,gauge,0,,,"Reports the ClusterQueue's total pending resource requests. Unlike resource_reservation, pending workloads have not yet been assigned to flavors.",0,kueue,cluster_queue_resource_pending_memory,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_pending.other,gauge,0,,,"Reports the ClusterQueue's total pending resource requests. 
Unlike resource_reservation, pending workloads have not yet been assigned to flavors.",0,kueue,cluster_queue_resource_pending_other,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_reservation.cpu,gauge,0,,,"Reports the ClusterQueue's total resource reservation across all flavors.",0,kueue,cluster_queue_resource_reservation_cpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_reservation.gpu,gauge,0,,,"Reports the ClusterQueue's total resource reservation across all flavors.",0,kueue,cluster_queue_resource_reservation_gpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_reservation.memory,gauge,0,,,"Reports the ClusterQueue's total resource reservation across all flavors.",0,kueue,cluster_queue_resource_reservation_memory,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_reservation.other,gauge,0,,,"Reports the ClusterQueue's total resource reservation across all flavors.",0,kueue,cluster_queue_resource_reservation_other,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_usage.cpu,gauge,0,,,"Reports the ClusterQueue's total resource usage across all flavors.",0,kueue,cluster_queue_resource_usage_cpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_usage.gpu,gauge,0,,,"Reports the ClusterQueue's total resource usage across all flavors.",0,kueue,cluster_queue_resource_usage_gpu,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_usage.memory,gauge,0,,,"Reports the ClusterQueue's total resource usage across all 
flavors.",0,kueue,cluster_queue_resource_usage_memory,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.resource_usage.other,gauge,0,,,"Reports the ClusterQueue's total resource usage across all flavors.",0,kueue,cluster_queue_resource_usage_other,,"kueue_cluster_queue:default,cohort:default,flavor:on-demand,replica_role:leader" +kueue.cluster_queue.status,gauge,0,,,"Reports the 'status' of each 'kueue_cluster_queue'. Possible values are 'pending', 'active', and 'terminated'. For a ClusterQueue, the metric reports a value of 1 for exactly one status.",0,kueue,cluster_queue_status,, +kueue.cluster_queue.weighted_share,gauge,0,,,"Reports the maximum ratio of usage above nominal quota to lendable resources in the cohort across all resources provided by the ClusterQueue, divided by weight. A value of 0 means that the ClusterQueue's usage is below the nominal quota. If the ClusterQueue has a weight of 0 and is borrowing, the metric returns NaN.",0,kueue,cluster_queue_weighted_share,, +kueue.cohort.info,gauge,0,,,Reports Cohort hierarchy information. The metric has value 1 and can be joined using labels.,0,kueue,cohort_info,, +kueue.cohort.weighted_share,gauge,0,,,"Reports the maximum ratio of usage above nominal quota to lendable resources in the Cohort across all resources provided by the Cohort, divided by weight. A value of 0 means that the Cohort's usage is below the nominal quota. 
If the Cohort has a weight of 0 and is borrowing, the metric returns NaN.",0,kueue,cohort_weighted_share,, +kueue.cohort_subtree.admitted.active_workloads,gauge,0,,,"The number of admitted Workloads that are active, per cohort subtree.",0,kueue,cohort_subtree_admitted_active_workloads,, +kueue.cohort_subtree.admitted.workloads.count,count,0,,,"The total number of admitted workloads, per cohort subtree.",0,kueue,cohort_subtree_admitted_workloads_count,, +kueue.cohort_subtree.quota.cpu,gauge,0,,,"Reports the cohort's nominal quota aggregated within the cohort subtree. Values are reported per resource and flavor.",0,kueue,cohort_subtree_quota_cpu,,"cohort:default,flavor:on-demand,replica_role:leader" +kueue.cohort_subtree.quota.gpu,gauge,0,,,"Reports the cohort's nominal quota aggregated within the cohort subtree. Values are reported per resource and flavor.",0,kueue,cohort_subtree_quota_gpu,,"cohort:default,flavor:on-demand,replica_role:leader" +kueue.cohort_subtree.quota.memory,gauge,0,,,"Reports the cohort's nominal quota aggregated within the cohort subtree. Values are reported per resource and flavor.",0,kueue,cohort_subtree_quota_memory,,"cohort:default,flavor:on-demand,replica_role:leader" +kueue.cohort_subtree.quota.other,gauge,0,,,"Reports the cohort's nominal quota aggregated within the cohort subtree. Values are reported per resource and flavor.",0,kueue,cohort_subtree_quota_other,,"cohort:default,flavor:on-demand,replica_role:leader" +kueue.cohort_subtree.resource_reservations.cpu,gauge,0,,,"Reports the cohort's resource reservations aggregated within the cohort subtree. Values are reported per resource and flavor.",0,kueue,cohort_subtree_resource_reservations_cpu,,"cohort:default,flavor:on-demand,replica_role:leader" +kueue.cohort_subtree.resource_reservations.gpu,gauge,0,,,"Reports the cohort's resource reservations aggregated within the cohort subtree. 
Values are reported per resource and flavor.",0,kueue,cohort_subtree_resource_reservations_gpu,,"cohort:default,flavor:on-demand,replica_role:leader" +kueue.cohort_subtree.resource_reservations.memory,gauge,0,,,"Reports the cohort's resource reservations aggregated within the cohort subtree. Values are reported per resource and flavor.",0,kueue,cohort_subtree_resource_reservations_memory,,"cohort:default,flavor:on-demand,replica_role:leader" +kueue.cohort_subtree.resource_reservations.other,gauge,0,,,"Reports the cohort's resource reservations aggregated within the cohort subtree. Values are reported per resource and flavor.",0,kueue,cohort_subtree_resource_reservations_other,,"cohort:default,flavor:on-demand,replica_role:leader" +kueue.controller.runtime.active_workers,gauge,0,,,Kueue controller runtime active workers metric.,0,kueue,controller_runtime_active_workers,, +kueue.controller.runtime.conversion_webhook_panics,gauge,0,,,Kueue controller runtime conversion webhook panics metric.,0,kueue,controller_runtime_conversion_webhook_panics,, +kueue.controller.runtime.max_concurrent_reconciles,gauge,0,,,Kueue controller runtime max concurrent reconciles metric.,0,kueue,controller_runtime_max_concurrent_reconciles,, +kueue.controller.runtime.reconcile,gauge,0,,,Kueue controller runtime reconcile metric.,0,kueue,controller_runtime_reconcile,, +kueue.controller.runtime.reconcile_errors,gauge,0,,,Kueue controller runtime reconcile errors metric.,0,kueue,controller_runtime_reconcile_errors,, +kueue.controller.runtime.reconcile_panics,gauge,0,,,Kueue controller runtime reconcile panics metric.,0,kueue,controller_runtime_reconcile_panics,, +kueue.controller.runtime.reconcile_time.seconds,gauge,0,second,,Kueue controller runtime reconcile time seconds metric.,0,kueue,controller_runtime_reconcile_time_seconds,, +kueue.controller.runtime.reconcile_timeouts,gauge,0,,,Kueue controller runtime reconcile timeouts metric.,0,kueue,controller_runtime_reconcile_timeouts,, 
+kueue.controller.runtime.terminal_reconcile_errors,gauge,0,,,Kueue controller runtime terminal reconcile errors metric.,0,kueue,controller_runtime_terminal_reconcile_errors,, +kueue.controller.runtime.webhook_latency.seconds,gauge,0,second,,Kueue controller runtime webhook latency seconds metric.,0,kueue,controller_runtime_webhook_latency_seconds,, +kueue.controller.runtime.webhook_panics,gauge,0,,,Kueue controller runtime webhook panics metric.,0,kueue,controller_runtime_webhook_panics,, +kueue.controller.runtime.webhook_requests,gauge,0,,,Kueue controller runtime webhook requests metric.,0,kueue,controller_runtime_webhook_requests,, +kueue.controller.runtime.webhook_requests_in_flight,gauge,0,,,Kueue controller runtime webhook requests in flight metric.,0,kueue,controller_runtime_webhook_requests_in_flight,, +kueue.evicted_workloads.count,count,0,,,"The number of evicted workloads per cluster queue, tagged by reason, underlying cause, priority class, and replica role.",0,kueue,evicted_workloads_count,, +kueue.evicted_workloads_once.count,count,0,,,"The number of unique workload evictions per cluster queue, tagged by eviction reason, underlying cause, priority class, and replica role.",0,kueue,evicted_workloads_once_count,, +kueue.finished_workloads,gauge,0,,,"The number of finished workloads, per 'kueue_cluster_queue'.",0,kueue,finished_workloads,, +kueue.go.gc.duration.seconds.count,count,0,second,,The summary count of garbage collection cycles.,0,kueue,go_gc_duration_seconds_count,, +kueue.go.gc.duration.seconds.quantile,gauge,0,second,,A summary of the pause duration of garbage collection cycles.,0,kueue,go_gc_duration_seconds,, +kueue.go.gc.duration.seconds.sum,count,0,second,,The sum of the pause duration of garbage collection cycles.,0,kueue,go_gc_duration_seconds_sum,, +kueue.go.gc.gogc.percent,gauge,0,percent,,Kueue go gc gogc percent metric.,0,kueue,go_gc_gogc_percent,, +kueue.go.gc.gomemlimit.bytes,gauge,0,byte,,Kueue go gc gomemlimit bytes 
metric.,0,kueue,go_gc_gomemlimit_bytes,, +kueue.go.goroutines,gauge,0,,,Number of goroutines that currently exist.,0,kueue,go_goroutines,, +kueue.go.info,gauge,0,,,Information about the Go environment.,0,kueue,go_info,, +kueue.go.memstats.alloc_bytes,gauge,0,,,Number of bytes allocated in the heap and still in use.,0,kueue,go_memstats_alloc_bytes,, +kueue.go.memstats.buck_hash.sys_bytes,gauge,0,,,Number of bytes used by the profiling bucket hash table.,0,kueue,go_memstats_buck_hash_sys_bytes,, +kueue.go.memstats.frees,gauge,0,,,Total number of heap object frees.,0,kueue,go_memstats_frees,, +kueue.go.memstats.gc.sys_bytes,gauge,0,,,Number of bytes used for garbage collection system metadata.,0,kueue,go_memstats_gc_sys_bytes,, +kueue.go.memstats.heap_alloc.bytes,gauge,0,byte,,Number of heap bytes allocated and still in use.,0,kueue,go_memstats_heap_alloc_bytes,, +kueue.go.memstats.heap_idle.bytes,gauge,0,byte,,Number of heap bytes waiting to be used.,0,kueue,go_memstats_heap_idle_bytes,, +kueue.go.memstats.heap_inuse.bytes,gauge,0,byte,,Number of heap bytes that are in use.,0,kueue,go_memstats_heap_inuse_bytes,, +kueue.go.memstats.heap_objects,gauge,0,,,Number of currently allocated heap objects.,0,kueue,go_memstats_heap_objects,, +kueue.go.memstats.heap_released.bytes,gauge,0,byte,,Number of heap bytes released to the OS.,0,kueue,go_memstats_heap_released_bytes,, +kueue.go.memstats.heap_sys.bytes,gauge,0,byte,,Number of heap bytes obtained from the system.,0,kueue,go_memstats_heap_sys_bytes,, +kueue.go.memstats.mallocs,gauge,0,,,Total number of heap object allocations.,0,kueue,go_memstats_mallocs,, +kueue.go.memstats.mcache_inuse.bytes,gauge,0,byte,,Number of bytes in use by mcache structures.,0,kueue,go_memstats_mcache_inuse_bytes,, +kueue.go.memstats.mcache_sys.bytes,gauge,0,byte,,Number of bytes used for mcache structures obtained from the system.,0,kueue,go_memstats_mcache_sys_bytes,, +kueue.go.memstats.mspan_inuse.bytes,gauge,0,byte,,Number of bytes in use by mspan structures.,0,kueue,go_memstats_mspan_inuse_bytes,,
+kueue.go.memstats.mspan_sys.bytes,gauge,0,byte,,Number of bytes used for mspan structures obtained from the system.,0,kueue,go_memstats_mspan_sys_bytes,, +kueue.go.memstats.next_gc.bytes,gauge,0,byte,,Number of heap bytes at which the next garbage collection will take place.,0,kueue,go_memstats_next_gc_bytes,, +kueue.go.memstats.other_sys.bytes,gauge,0,byte,,Number of bytes used for other system allocations.,0,kueue,go_memstats_other_sys_bytes,, +kueue.go.memstats.stack_inuse.bytes,gauge,0,byte,,Number of bytes in use by the stack allocator.,0,kueue,go_memstats_stack_inuse_bytes,, +kueue.go.memstats.stack_sys.bytes,gauge,0,byte,,Number of bytes obtained from the system for the stack allocator.,0,kueue,go_memstats_stack_sys_bytes,, +kueue.go.memstats.sys.bytes,gauge,0,byte,,Number of bytes obtained from the system.,0,kueue,go_memstats_sys_bytes,, +kueue.go.memstats.time_since_last_gc.seconds,gauge,0,second,,Time elapsed since the last garbage collection.,0,kueue,go_memstats_time_since_last_gc_seconds,, +kueue.go.threads,gauge,0,,,Number of OS threads created.,0,kueue,go_threads,, +kueue.leader_election.master_status,gauge,0,,,Whether the reporting replica holds the relevant leader-election lease (1) or is a backup (0).,0,kueue,leader_election_master_status,, +kueue.local_queue.admission.wait_time.seconds.bucket,count,0,second,,"The time between workload creation or requeueing and admission, per 'kueue_local_queue'.",0,kueue,local_queue_admission_wait_time_seconds_bucket,, +kueue.local_queue.admission.wait_time.seconds.count,count,0,second,,"The time between workload creation or requeueing and admission, per 'kueue_local_queue'.",0,kueue,local_queue_admission_wait_time_seconds_count,, +kueue.local_queue.admission.wait_time.seconds.sum,count,0,second,,"The time between workload creation or requeueing and admission, per 'kueue_local_queue'.",0,kueue,local_queue_admission_wait_time_seconds_sum,, +kueue.local_queue.admission_checks.wait_time.seconds.bucket,count,0,second,,"The time between quota reservation and admission for a workload, per
'kueue_local_queue'.",0,kueue,local_queue_admission_checks_wait_time_seconds_bucket,, +kueue.local_queue.admission_checks.wait_time.seconds.count,count,0,second,,"The time between quota reservation and admission for a workload, per 'kueue_local_queue'.",0,kueue,local_queue_admission_checks_wait_time_seconds_count,, +kueue.local_queue.admission_checks.wait_time.seconds.sum,count,0,second,,"The time between quota reservation and admission for a workload, per 'kueue_local_queue'.",0,kueue,local_queue_admission_checks_wait_time_seconds_sum,, +kueue.local_queue.admitted.active_workloads,gauge,0,,,"The number of admitted Workloads that are active, per 'kueue_local_queue'.",0,kueue,local_queue_admitted_active_workloads,, +kueue.local_queue.admitted.workloads.count,count,0,,,"The total number of admitted workloads, per 'kueue_local_queue'.",0,kueue,local_queue_admitted_workloads_count,, +kueue.local_queue.admitted_until_ready.wait_time.seconds.bucket,count,0,second,,"The time between workload admission and the workload becoming ready, per 'kueue_local_queue'.",0,kueue,local_queue_admitted_until_ready_wait_time_seconds_bucket,, +kueue.local_queue.admitted_until_ready.wait_time.seconds.count,count,0,second,,"The time between workload admission and the workload becoming ready, per 'kueue_local_queue'.",0,kueue,local_queue_admitted_until_ready_wait_time_seconds_count,, +kueue.local_queue.admitted_until_ready.wait_time.seconds.sum,count,0,second,,"The time between workload admission and the workload becoming ready, per 'kueue_local_queue'.",0,kueue,local_queue_admitted_until_ready_wait_time_seconds_sum,, +kueue.local_queue.evicted_workloads.count,count,0,,,"The number of evicted workloads per local queue, tagged by namespace, reason, underlying cause, priority class, and replica role.",0,kueue,local_queue_evicted_workloads_count,, +kueue.local_queue.finished_workloads,gauge,0,,,"The number of finished workloads, per 'kueue_local_queue'.",0,kueue,local_queue_finished_workloads,, 
+kueue.local_queue.finished_workloads.count,count,0,,,"The total number of finished workloads, per 'kueue_local_queue'.",0,kueue,local_queue_finished_workloads_count,, +kueue.local_queue.pending_workloads,gauge,0,,,"The number of pending workloads, per 'kueue_local_queue' and 'status'. The 'status' value can be 'active' for workloads in the admission queue or 'inadmissible' for workloads with a failed admission attempt that are not retried until cluster conditions that could make them admissible change.",0,kueue,local_queue_pending_workloads,,"kueue_local_queue:gpu,namespace:team-a,status:active,replica_role:leader" +kueue.local_queue.quota_reserved.wait_time.seconds.bucket,count,0,second,,"The time from when a workload is created or requeued until it receives a quota reservation, per 'kueue_local_queue'.",0,kueue,local_queue_quota_reserved_wait_time_seconds_bucket,, +kueue.local_queue.quota_reserved.wait_time.seconds.count,count,0,second,,"The time from when a workload is created or requeued until it receives a quota reservation, per 'kueue_local_queue'.",0,kueue,local_queue_quota_reserved_wait_time_seconds_count,, +kueue.local_queue.quota_reserved.wait_time.seconds.sum,count,0,second,,"The time from when a workload is created or requeued until it receives a quota reservation, per 'kueue_local_queue'.",0,kueue,local_queue_quota_reserved_wait_time_seconds_sum,, +kueue.local_queue.quota_reserved.workloads.count,count,0,,,"The total number of quota-reserved workloads, per 'kueue_local_queue'.",0,kueue,local_queue_quota_reserved_workloads_count,, +kueue.local_queue.ready_wait_time.seconds.bucket,count,0,second,,"The time between workload creation or requeueing and the workload becoming ready, per 'kueue_local_queue'.",0,kueue,local_queue_ready_wait_time_seconds_bucket,, +kueue.local_queue.ready_wait_time.seconds.count,count,0,second,,"The time between workload creation or requeueing and the workload becoming ready, per 
'kueue_local_queue'.",0,kueue,local_queue_ready_wait_time_seconds_count,, +kueue.local_queue.ready_wait_time.seconds.sum,count,0,second,,"The time between workload creation or requeueing and the workload becoming ready, per 'kueue_local_queue'.",0,kueue,local_queue_ready_wait_time_seconds_sum,, +kueue.local_queue.reserving.active_workloads,gauge,0,,,"The number of Workloads that are reserving quota, per 'kueue_local_queue'.",0,kueue,local_queue_reserving_active_workloads,, +kueue.local_queue.resource_reservation.cpu,gauge,0,,,"Reports the LocalQueue's total resource reservation across all flavors.",0,kueue,local_queue_resource_reservation_cpu,,"kueue_local_queue:gpu,namespace:team-a,flavor:on-demand,replica_role:leader" +kueue.local_queue.resource_reservation.gpu,gauge,0,,,"Reports the LocalQueue's total resource reservation across all flavors.",0,kueue,local_queue_resource_reservation_gpu,,"kueue_local_queue:gpu,namespace:team-a,flavor:on-demand,replica_role:leader" +kueue.local_queue.resource_reservation.memory,gauge,0,,,"Reports the LocalQueue's total resource reservation across all flavors.",0,kueue,local_queue_resource_reservation_memory,,"kueue_local_queue:gpu,namespace:team-a,flavor:on-demand,replica_role:leader" +kueue.local_queue.resource_reservation.other,gauge,0,,,"Reports the LocalQueue's total resource reservation across all flavors.",0,kueue,local_queue_resource_reservation_other,,"kueue_local_queue:gpu,namespace:team-a,flavor:on-demand,replica_role:leader" +kueue.local_queue.resource_usage.cpu,gauge,0,,,"Reports the LocalQueue's total resource usage across all flavors.",0,kueue,local_queue_resource_usage_cpu,,"kueue_local_queue:gpu,namespace:team-a,flavor:on-demand,replica_role:leader" +kueue.local_queue.resource_usage.gpu,gauge,0,,,"Reports the LocalQueue's total resource usage across all flavors.",0,kueue,local_queue_resource_usage_gpu,,"kueue_local_queue:gpu,namespace:team-a,flavor:on-demand,replica_role:leader" 
+kueue.local_queue.resource_usage.memory,gauge,0,,,"Reports the LocalQueue's total resource usage across all flavors.",0,kueue,local_queue_resource_usage_memory,,"kueue_local_queue:gpu,namespace:team-a,flavor:on-demand,replica_role:leader" +kueue.local_queue.resource_usage.other,gauge,0,,,"Reports the LocalQueue's total resource usage across all flavors.",0,kueue,local_queue_resource_usage_other,,"kueue_local_queue:gpu,namespace:team-a,flavor:on-demand,replica_role:leader" +kueue.local_queue.status,gauge,0,,,"Reports the 'active' status of each 'kueue_local_queue'. Possible values are 'True', 'False', and 'Unknown'. For a LocalQueue, only one status has a value of 1.",0,kueue,local_queue_status,, +kueue.pending_workloads,gauge,0,,,"The number of pending workloads, per 'kueue_cluster_queue' and 'status'. The 'status' value can be 'active' for workloads in the admission queue or 'inadmissible' for workloads with a failed admission attempt that are not retried until cluster conditions that could make them admissible change.",0,kueue,pending_workloads,,"kueue_cluster_queue:default,status:inadmissible,replica_role:leader" +kueue.pods_ready_to_evicted_time.seconds.bucket,count,0,second,,"Bucket counts for the time between the workload's pods becoming ready and workload eviction, tagged by cluster queue, reason, underlying cause, and replica role.",0,kueue,pods_ready_to_evicted_time_seconds_bucket,, +kueue.pods_ready_to_evicted_time.seconds.count,count,0,second,,"Count of observations for the time between the workload's pods becoming ready and workload eviction, tagged by cluster queue, reason, underlying cause, and replica role.",0,kueue,pods_ready_to_evicted_time_seconds_count,, +kueue.pods_ready_to_evicted_time.seconds.sum,count,0,second,,"Total time between the workload's pods becoming ready and workload eviction, tagged by cluster queue, reason, underlying cause, and replica role.",0,kueue,pods_ready_to_evicted_time_seconds_sum,, 
+kueue.preempted_workloads.count,count,0,,,"The number of preempted workloads per preempting cluster queue, tagged by preemption reason and replica role.",0,kueue,preempted_workloads_count,, +kueue.process.cpu.seconds,gauge,0,second,,Total user and system CPU time spent by the process in seconds.,0,kueue,process_cpu_seconds,, +kueue.process.max_fds,gauge,0,,,Maximum number of open file descriptors allowed for the process.,0,kueue,process_max_fds,, +kueue.process.network_receive.bytes,gauge,0,byte,,Number of bytes received by the process over the network.,0,kueue,process_network_receive_bytes,, +kueue.process.network_transmit.bytes,gauge,0,byte,,Number of bytes sent by the process over the network.,0,kueue,process_network_transmit_bytes,, +kueue.process.open_fds,gauge,0,,,Number of file descriptors currently open by the process.,0,kueue,process_open_fds,, +kueue.process.resident_memory.bytes,gauge,0,byte,,Resident memory size of the process in bytes.,0,kueue,process_resident_memory_bytes,, +kueue.process.uptime.seconds,gauge,0,second,,Time elapsed since the process started.,0,kueue,process_uptime_seconds,, +kueue.process.virtual_memory.bytes,gauge,0,byte,,Virtual memory size of the process in bytes.,0,kueue,process_virtual_memory_bytes,, +kueue.process.virtual_memory.max_bytes,gauge,0,,,Maximum amount of virtual memory available to the process in bytes.,0,kueue,process_virtual_memory_max_bytes,, +kueue.quota_reserved.wait_time.seconds.bucket,count,0,second,,"The time from when a workload is created or requeued until it receives a quota reservation, per 'kueue_cluster_queue'.",0,kueue,quota_reserved_wait_time_seconds_bucket,, +kueue.quota_reserved.wait_time.seconds.count,count,0,second,,"The time from when a workload is created or requeued until it receives a quota reservation, per 'kueue_cluster_queue'.",0,kueue,quota_reserved_wait_time_seconds_count,, +kueue.quota_reserved.wait_time.seconds.sum,count,0,second,,"The time from when a workload is created or requeued until it receives a quota reservation, per 'kueue_cluster_queue'.",0,kueue,quota_reserved_wait_time_seconds_sum,,
+kueue.quota_reserved.workloads.count,count,0,,,"The total number of quota-reserved workloads, per 'kueue_cluster_queue'.",0,kueue,quota_reserved_workloads_count,, +kueue.ready_wait_time.seconds.bucket,count,0,second,,"The time between workload creation or requeueing and the workload becoming ready, per 'kueue_cluster_queue'.",0,kueue,ready_wait_time_seconds_bucket,, +kueue.ready_wait_time.seconds.count,count,0,second,,"The time between workload creation or requeueing and the workload becoming ready, per 'kueue_cluster_queue'.",0,kueue,ready_wait_time_seconds_count,, +kueue.ready_wait_time.seconds.sum,count,0,second,,"The time between workload creation or requeueing and the workload becoming ready, per 'kueue_cluster_queue'.",0,kueue,ready_wait_time_seconds_sum,, +kueue.replaced_workload_slices.count,count,0,,,"The number of replaced workload slices, per 'kueue_cluster_queue'.",0,kueue,replaced_workload_slices_count,, +kueue.reserving.active_workloads,gauge,0,,,"The number of Workloads that are reserving quota, per 'kueue_cluster_queue'.",0,kueue,reserving_active_workloads,, +kueue.resource_flavor.quota_reserved_workloads,gauge,0,,,"The number of workloads with a quota reservation, per 'kueue_cluster_queue' and 'kueue_resource_flavor'.",0,kueue,resource_flavor_quota_reserved_workloads,,"kueue_cluster_queue:default,kueue_resource_flavor:on-demand" +kueue.workload.creation_latency.seconds.bucket,count,0,second,,"The time between job creation and workload creation, per 'job_kind'. Entries are only recorded for objects with generation 1.",0,kueue,workload_creation_latency_seconds_bucket,, +kueue.workload.creation_latency.seconds.count,count,0,second,,"The time between job creation and workload creation, per 'job_kind'. Entries are only recorded for objects with generation 1.",0,kueue,workload_creation_latency_seconds_count,, +kueue.workload.creation_latency.seconds.sum,count,0,second,,"The time between job creation and workload creation, per 'job_kind'. 
Entries are only recorded for objects with generation 1.",0,kueue,workload_creation_latency_seconds_sum,, +kueue.workload.eviction_latency.seconds.bucket,count,0,second,,"Bucket counts for the time from workload eviction until the workload returns to pending and releases quota, tagged by cluster queue, reason, and replica role.",0,kueue,workload_eviction_latency_seconds_bucket,, +kueue.workload.eviction_latency.seconds.count,count,0,second,,"Count of observations for the time from workload eviction until the workload returns to pending and releases quota, tagged by cluster queue, reason, and replica role.",0,kueue,workload_eviction_latency_seconds_count,, +kueue.workload.eviction_latency.seconds.sum,count,0,second,,"Total time from workload eviction until the workload returns to pending and releases quota, tagged by cluster queue, reason, and replica role.",0,kueue,workload_eviction_latency_seconds_sum,, +kueue.workqueue.adds,gauge,0,,,Total number of adds handled by the workqueue.,0,kueue,workqueue_adds,, +kueue.workqueue.depth,gauge,0,,,Current depth of the workqueue.,0,kueue,workqueue_depth,, +kueue.workqueue.longest_running_processor.seconds,gauge,0,second,,Number of seconds the longest running processor for the workqueue has been running.,0,kueue,workqueue_longest_running_processor_seconds,, +kueue.workqueue.queue_duration.seconds,gauge,0,second,,Time in seconds an item stays in the workqueue before being requested.,0,kueue,workqueue_queue_duration_seconds,, +kueue.workqueue.retries,gauge,0,,,Total number of retries handled by the workqueue.,0,kueue,workqueue_retries,, +kueue.workqueue.unfinished_work.seconds,gauge,0,second,,Number of seconds of in-progress work that has not yet been observed by the work duration metric.,0,kueue,workqueue_unfinished_work_seconds,, +kueue.workqueue.work_duration.seconds,gauge,0,second,,Time in seconds spent processing an item from the workqueue.,0,kueue,workqueue_work_duration_seconds,, diff --git a/kueue/pyproject.toml b/kueue/pyproject.toml new file mode 100644 index 0000000000000..486327b61716c --- /dev/null +++ b/kueue/pyproject.toml @@ -0,0 +1,61 @@ +[build-system] 
+requires = [ + "hatchling>=0.13.0", +] +build-backend = "hatchling.build" + +[project] +name = "datadog-kueue" +description = "The Kueue check" +readme = "README.md" +license = "BSD-3-Clause" +requires-python = ">=3.12" +keywords = [ + "datadog", + "datadog agent", + "datadog check", + "kueue", +] +authors = [ + { name = "Datadog", email = "packages@datadoghq.com" }, +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: BSD License", + "Private :: Do Not Upload", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: System :: Monitoring", +] +dependencies = [ + "datadog-checks-base>=37.33.0", +] +dynamic = [ + "version", +] + +[project.optional-dependencies] +deps = [] + +[project.urls] +Source = "https://github.com/DataDog/integrations-core" + +[tool.hatch.version] +path = "datadog_checks/kueue/__about__.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/datadog_checks", + "/tests", + "/manifest.json", +] + +[tool.hatch.build.targets.wheel] +include = [ + "/datadog_checks/kueue", +] +dev-mode-dirs = [ + ".", +] diff --git a/kueue/tests/__init__.py b/kueue/tests/__init__.py new file mode 100644 index 0000000000000..75c6647cb9233 --- /dev/null +++ b/kueue/tests/__init__.py @@ -0,0 +1,3 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) diff --git a/kueue/tests/common.py b/kueue/tests/common.py new file mode 100644 index 0000000000000..16af033f8fffb --- /dev/null +++ b/kueue/tests/common.py @@ -0,0 +1,68 @@ +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import os + +from datadog_checks.dev import get_here + +HERE = get_here() + + +def get_fixture_path(filename): + return os.path.join(HERE, 'fixtures', filename) + + +MOCKED_INSTANCE = { + 'openmetrics_endpoint': 'http://localhost:8080/metrics', + 'tags': ['test:tag'], +} + +# Tags defined in the YAML files for the e2e tests +_cluster_queue_tags = ['kueue_cluster_queue:cluster-queue', 'replica_role:leader'] +_cluster_queue_flavor_tags = [*_cluster_queue_tags, 'kueue_resource_flavor:default-flavor'] +_local_queue_tags = ['kueue_local_queue:user-queue', 'namespace:default', 'replica_role:leader'] +_local_queue_flavor_tags = [*_local_queue_tags, 'kueue_resource_flavor:default-flavor'] + +# Keys: metrics we assert in both unit (mock metrics.txt) and e2e (live cluster). +# Values: tags that must appear on at least one series for that metric (empty = metric presence only, no tags checked). +EXPECTED_METRIC_TAGS = { + 'kueue.build_info': [], + 'kueue.go.goroutines': [], + 'kueue.go.info': ['go_version:go1.26.3'], + 'kueue.cluster_queue.info': ['kueue_cluster_queue:cluster-queue'], + 'kueue.cluster_queue.status': ['kueue_cluster_queue:cluster-queue'], + 'kueue.cluster_queue.nominal_quota.cpu': _cluster_queue_flavor_tags, + 'kueue.cluster_queue.nominal_quota.memory': _cluster_queue_flavor_tags, + 'kueue.cluster_queue.resource_pending.cpu': _cluster_queue_tags, + 'kueue.cluster_queue.resource_pending.memory': _cluster_queue_tags, + 'kueue.cluster_queue.resource_reservation.cpu': _cluster_queue_flavor_tags, + 'kueue.cluster_queue.resource_reservation.memory': _cluster_queue_flavor_tags, + 'kueue.cluster_queue.resource_usage.cpu': _cluster_queue_flavor_tags, + 'kueue.cluster_queue.resource_usage.memory': _cluster_queue_flavor_tags, + 'kueue.local_queue.status': [], + 'kueue.admitted.active_workloads': _cluster_queue_tags, + 'kueue.local_queue.admitted.active_workloads': 
_local_queue_tags, + 'kueue.pending_workloads': [*_cluster_queue_tags, 'status:inadmissible'], + 'kueue.local_queue.pending_workloads': [*_local_queue_tags, 'status:inadmissible'], + 'kueue.local_queue.resource_reservation.cpu': _local_queue_flavor_tags, + 'kueue.local_queue.resource_reservation.memory': _local_queue_flavor_tags, + 'kueue.local_queue.resource_usage.cpu': _local_queue_flavor_tags, + 'kueue.local_queue.resource_usage.memory': _local_queue_flavor_tags, + 'kueue.controller.runtime.active_workers': [], + 'kueue.process.uptime.seconds': [], + 'kueue.workqueue.depth': [], +} + +# Same metrics as EXPECTED_METRIC_TAGS keys (single source of truth for unit + e2e). +UNIT_E2E_METRICS = tuple(EXPECTED_METRIC_TAGS) + +# Extra Datadog metric names covered by tests/fixtures/metrics.txt but not required on the e2e cluster. +FIXTURE_ONLY_METRICS = ( + 'kueue.cluster_queue.resource_usage.gpu', + 'kueue.cluster_queue.resource_usage.other', + 'kueue.cluster_queue.resource_pending.gpu', + 'kueue.resource_flavor.quota_reserved_workloads', +) + +# All metrics for unit test_check presence + instance tag assertions. +UNIT_METRICS = (*UNIT_E2E_METRICS, *FIXTURE_ONLY_METRICS) diff --git a/kueue/tests/conftest.py b/kueue/tests/conftest.py new file mode 100644 index 0000000000000..df93128847dc2 --- /dev/null +++ b/kueue/tests/conftest.py @@ -0,0 +1,171 @@ +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import os +import time +from contextlib import ExitStack + +import pytest + +from datadog_checks.dev import get_here +from datadog_checks.dev.kind import kind_run +from datadog_checks.dev.kube_port_forward import port_forward +from datadog_checks.dev.subprocess import run_command + +from .common import MOCKED_INSTANCE + +HERE = get_here() +KUEUE_VERSION = os.environ.get('KUEUE_VERSION', 'v0.18.0') +KUEUE_NAMESPACE = 'kueue-system' # hardcoded in the Kueue manifests + + +def wait_for_controller(): + run_command( + [ + 'kubectl', + 'rollout', + 'status', + 'deployment/kueue-controller-manager', + '-n', + KUEUE_NAMESPACE, + '--timeout=300s', + ] + ) + run_command( + [ + 'kubectl', + 'wait', + 'deployment/kueue-controller-manager', + '--for=condition=Available', + '-n', + KUEUE_NAMESPACE, + '--timeout=300s', + ] + ) + + +def setup_kueue(): + run_command( + [ + 'kubectl', + 'apply', + '--server-side', + '-f', + f'https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml', + ] + ) + + # Ensure the controller is ready + wait_for_controller() + + run_command(['kubectl', 'apply', '-f', os.path.join(HERE, 'kind', 'kueue-config.yaml')]) + # Restart the controller to pick up the new config + run_command(['kubectl', 'rollout', 'restart', 'deployment/kueue-controller-manager', '-n', KUEUE_NAMESPACE]) + wait_for_controller() + + run_command(['kubectl', 'apply', '-f', os.path.join(HERE, 'kind', 'metrics-reader.yaml')]) + # The deployment can be `Available` before the webhook server is actually serving, so wait until the + # webhook service has ready endpoints before applying resources that go through the mutating webhooks. 
+ run_command( + [ + 'kubectl', + 'wait', + '--for=jsonpath={.subsets[*].addresses[*].ip}', + 'endpoints/kueue-webhook-service', + '-n', + KUEUE_NAMESPACE, + '--timeout=300s', + ] + ) + apply_queue_manifests() + run_command(['kubectl', 'wait', 'clusterqueue/cluster-queue', '--for=condition=Active', '--timeout=300s']) + run_command( + ['kubectl', 'wait', 'localqueue/user-queue', '-n', 'default', '--for=condition=Active', '--timeout=300s'] + ) + run_command(['kubectl', 'apply', '-f', os.path.join(HERE, 'kind', 'workloads.yaml')]) + wait_for_job_workload_condition('scheduled-workload', 'Admitted=True') + wait_for_job_workload_condition('unschedulable-workload', 'QuotaReserved=False') + + +def apply_queue_manifests(): + # The webhook can still reject calls for a short window after its endpoints become ready (cert + # propagation), so retry the apply a few times before giving up. + queue_manifest = os.path.join(HERE, 'kind', 'queue.yaml') + last_error = None + for _ in range(10): + try: + run_command(['kubectl', 'apply', '-f', queue_manifest], check=True) + return + except Exception as e: + last_error = e + time.sleep(5) + raise RuntimeError(f'Failed to apply queue manifests after retries: {last_error}') + + +def wait_for_job_workload_condition(job_name: str, condition: str) -> None: + job_uid = run_command( + ['kubectl', 'get', 'job', job_name, '-n', 'default', '-o', 'jsonpath={.metadata.uid}'], capture=True + ).stdout.strip() + workload_name = '' + for _ in range(10): + workload_name = run_command( + [ + 'kubectl', + 'get', + 'workloads.kueue.x-k8s.io', + '-n', + 'default', + '-l', + f'kueue.x-k8s.io/job-uid={job_uid}', + '-o', + 'jsonpath={.items[0].metadata.name}', + ], + capture=True, + ).stdout.strip() + if workload_name: + break + time.sleep(1) + if not workload_name: + raise RuntimeError(f'Failed to find Kueue Workload for Job {job_name}') + run_command( + [ + 'kubectl', + 'wait', + f'workload/{workload_name}', + '-n', + 'default', + 
f'--for=condition={condition}', + '--timeout=300s', + ] + ) + + +def get_service_account_token(): + result = run_command( + ['kubectl', 'create', 'token', 'kueue-metrics-reader', '-n', 'default'], + capture=True, + ) + return result.stdout.strip() + + +@pytest.fixture(scope='session') +def dd_environment(): + kind_config = os.path.join(HERE, 'kind', 'kind-config.yaml') + with kind_run(conditions=[setup_kueue], kind_config=kind_config, sleep=10) as kubeconfig, ExitStack() as stack: + kueue_host, kueue_port = stack.enter_context( + port_forward(kubeconfig, 'kueue-system', 8443, 'service', 'kueue-controller-manager-metrics-service') + ) + instances = [ + { + 'openmetrics_endpoint': f'https://{kueue_host}:{kueue_port}/metrics', + 'tls_verify': False, + 'extra_headers': {'Authorization': f'Bearer {get_service_account_token()}'}, + } + ] + + yield {'instances': instances} + + +@pytest.fixture +def instance(): + return MOCKED_INSTANCE.copy() diff --git a/kueue/tests/fixtures/metrics.txt b/kueue/tests/fixtures/metrics.txt new file mode 100644 index 0000000000000..3ce478a6672f1 --- /dev/null +++ b/kueue/tests/fixtures/metrics.txt @@ -0,0 +1,619 @@ +# HELP certwatcher_read_certificate Kueue certwatcher read certificate metric. +# TYPE certwatcher_read_certificate gauge +certwatcher_read_certificate{replica_role="leader"} 1 + +# HELP certwatcher_read_certificate_errors Kueue certwatcher read certificate errors metric. +# TYPE certwatcher_read_certificate_errors gauge +certwatcher_read_certificate_errors{replica_role="leader"} 1 + +# HELP controller_runtime_active_workers Kueue controller runtime active workers metric. +# TYPE controller_runtime_active_workers gauge +controller_runtime_active_workers{replica_role="leader"} 1 + +# HELP controller_runtime_conversion_webhook_panics Kueue controller runtime conversion webhook panics metric. 
+# TYPE controller_runtime_conversion_webhook_panics gauge +controller_runtime_conversion_webhook_panics{replica_role="leader"} 1 + +# HELP controller_runtime_max_concurrent_reconciles Kueue controller runtime max concurrent reconciles metric. +# TYPE controller_runtime_max_concurrent_reconciles gauge +controller_runtime_max_concurrent_reconciles{replica_role="leader"} 1 + +# HELP controller_runtime_reconcile Kueue controller runtime reconcile metric. +# TYPE controller_runtime_reconcile gauge +controller_runtime_reconcile{replica_role="leader"} 1 + +# HELP controller_runtime_reconcile_errors Kueue controller runtime reconcile errors metric. +# TYPE controller_runtime_reconcile_errors gauge +controller_runtime_reconcile_errors{replica_role="leader"} 1 + +# HELP controller_runtime_reconcile_panics Kueue controller runtime reconcile panics metric. +# TYPE controller_runtime_reconcile_panics gauge +controller_runtime_reconcile_panics{replica_role="leader"} 1 + +# HELP controller_runtime_reconcile_time_seconds Kueue controller runtime reconcile time seconds metric. +# TYPE controller_runtime_reconcile_time_seconds gauge +controller_runtime_reconcile_time_seconds{replica_role="leader"} 1 + +# HELP controller_runtime_reconcile_timeouts Kueue controller runtime reconcile timeouts metric. +# TYPE controller_runtime_reconcile_timeouts gauge +controller_runtime_reconcile_timeouts{replica_role="leader"} 1 + +# HELP controller_runtime_terminal_reconcile_errors Kueue controller runtime terminal reconcile errors metric. +# TYPE controller_runtime_terminal_reconcile_errors gauge +controller_runtime_terminal_reconcile_errors{replica_role="leader"} 1 + +# HELP controller_runtime_webhook_latency_seconds Kueue controller runtime webhook latency seconds metric. +# TYPE controller_runtime_webhook_latency_seconds gauge +controller_runtime_webhook_latency_seconds{replica_role="leader"} 1 + +# HELP controller_runtime_webhook_panics Kueue controller runtime webhook panics metric. 
+# TYPE controller_runtime_webhook_panics gauge +controller_runtime_webhook_panics{replica_role="leader"} 1 + +# HELP controller_runtime_webhook_requests Kueue controller runtime webhook requests metric. +# TYPE controller_runtime_webhook_requests gauge +controller_runtime_webhook_requests{replica_role="leader"} 1 + +# HELP controller_runtime_webhook_requests_in_flight Kueue controller runtime webhook requests in flight metric. +# TYPE controller_runtime_webhook_requests_in_flight gauge +controller_runtime_webhook_requests_in_flight{replica_role="leader"} 1 + +# HELP go_gc_duration_seconds Kueue go gc duration seconds metric. +# TYPE go_gc_duration_seconds summary +go_gc_duration_seconds{replica_role="leader",quantile="0"} 1 +go_gc_duration_seconds{replica_role="leader",quantile="0.25"} 1 +go_gc_duration_seconds{replica_role="leader",quantile="0.5"} 1 +go_gc_duration_seconds{replica_role="leader",quantile="0.75"} 1 +go_gc_duration_seconds{replica_role="leader",quantile="1"} 1 +go_gc_duration_seconds_sum{replica_role="leader"} 1 +go_gc_duration_seconds_count{replica_role="leader"} 1 + +# HELP go_gc_gogc_percent Kueue go gc gogc percent metric. +# TYPE go_gc_gogc_percent gauge +go_gc_gogc_percent{replica_role="leader"} 1 + +# HELP go_gc_gomemlimit_bytes Kueue go gc gomemlimit bytes metric. +# TYPE go_gc_gomemlimit_bytes gauge +go_gc_gomemlimit_bytes{replica_role="leader"} 1 + +# HELP go_goroutines Kueue go goroutines metric. +# TYPE go_goroutines gauge +go_goroutines{replica_role="leader"} 1 + +# HELP go_info Kueue go info metric. +# TYPE go_info gauge +go_info{replica_role="leader",version="go1.26.3"} 1 + +# HELP go_memstats_alloc_bytes Kueue go memstats alloc bytes metric. +# TYPE go_memstats_alloc_bytes gauge +go_memstats_alloc_bytes{replica_role="leader"} 1 + +# HELP go_memstats_buck_hash_sys_bytes Kueue go memstats buck hash sys bytes metric. 
+# TYPE go_memstats_buck_hash_sys_bytes gauge +go_memstats_buck_hash_sys_bytes{replica_role="leader"} 1 + +# HELP go_memstats_frees Kueue go memstats frees metric. +# TYPE go_memstats_frees gauge +go_memstats_frees{replica_role="leader"} 1 + +# HELP go_memstats_gc_sys_bytes Kueue go memstats gc sys bytes metric. +# TYPE go_memstats_gc_sys_bytes gauge +go_memstats_gc_sys_bytes{replica_role="leader"} 1 + +# HELP go_memstats_heap_alloc_bytes Kueue go memstats heap alloc bytes metric. +# TYPE go_memstats_heap_alloc_bytes gauge +go_memstats_heap_alloc_bytes{replica_role="leader"} 1 + +# HELP go_memstats_heap_idle_bytes Kueue go memstats heap idle bytes metric. +# TYPE go_memstats_heap_idle_bytes gauge +go_memstats_heap_idle_bytes{replica_role="leader"} 1 + +# HELP go_memstats_heap_inuse_bytes Kueue go memstats heap inuse bytes metric. +# TYPE go_memstats_heap_inuse_bytes gauge +go_memstats_heap_inuse_bytes{replica_role="leader"} 1 + +# HELP go_memstats_heap_objects Kueue go memstats heap objects metric. +# TYPE go_memstats_heap_objects gauge +go_memstats_heap_objects{replica_role="leader"} 1 + +# HELP go_memstats_heap_released_bytes Kueue go memstats heap released bytes metric. +# TYPE go_memstats_heap_released_bytes gauge +go_memstats_heap_released_bytes{replica_role="leader"} 1 + +# HELP go_memstats_heap_sys_bytes Kueue go memstats heap sys bytes metric. +# TYPE go_memstats_heap_sys_bytes gauge +go_memstats_heap_sys_bytes{replica_role="leader"} 1 + +# HELP go_memstats_last_gc_time_seconds Kueue go memstats last gc time seconds metric. +# TYPE go_memstats_last_gc_time_seconds gauge +go_memstats_last_gc_time_seconds{replica_role="leader"} 1 + +# HELP go_memstats_mallocs Kueue go memstats mallocs metric. +# TYPE go_memstats_mallocs gauge +go_memstats_mallocs{replica_role="leader"} 1 + +# HELP go_memstats_mcache_inuse_bytes Kueue go memstats mcache inuse bytes metric. 
+# TYPE go_memstats_mcache_inuse_bytes gauge +go_memstats_mcache_inuse_bytes{replica_role="leader"} 1 + +# HELP go_memstats_mcache_sys_bytes Kueue go memstats mcache sys bytes metric. +# TYPE go_memstats_mcache_sys_bytes gauge +go_memstats_mcache_sys_bytes{replica_role="leader"} 1 + +# HELP go_memstats_mspan_inuse_bytes Kueue go memstats mspan inuse bytes metric. +# TYPE go_memstats_mspan_inuse_bytes gauge +go_memstats_mspan_inuse_bytes{replica_role="leader"} 1 + +# HELP go_memstats_mspan_sys_bytes Kueue go memstats mspan sys bytes metric. +# TYPE go_memstats_mspan_sys_bytes gauge +go_memstats_mspan_sys_bytes{replica_role="leader"} 1 + +# HELP go_memstats_next_gc_bytes Kueue go memstats next gc bytes metric. +# TYPE go_memstats_next_gc_bytes gauge +go_memstats_next_gc_bytes{replica_role="leader"} 1 + +# HELP go_memstats_other_sys_bytes Kueue go memstats other sys bytes metric. +# TYPE go_memstats_other_sys_bytes gauge +go_memstats_other_sys_bytes{replica_role="leader"} 1 + +# HELP go_memstats_stack_inuse_bytes Kueue go memstats stack inuse bytes metric. +# TYPE go_memstats_stack_inuse_bytes gauge +go_memstats_stack_inuse_bytes{replica_role="leader"} 1 + +# HELP go_memstats_stack_sys_bytes Kueue go memstats stack sys bytes metric. +# TYPE go_memstats_stack_sys_bytes gauge +go_memstats_stack_sys_bytes{replica_role="leader"} 1 + +# HELP go_memstats_sys_bytes Kueue go memstats sys bytes metric. +# TYPE go_memstats_sys_bytes gauge +go_memstats_sys_bytes{replica_role="leader"} 1 + +# HELP go_threads Kueue go threads metric. +# TYPE go_threads gauge +go_threads{replica_role="leader"} 1 + +# HELP kueue_admission_attempt_duration_seconds The latency of an admission attempt. The label 'result' can have the following values:- 'success' means that at least one workload was admitted.,- 'inadmissible' means that no workload was admitted. 
+# TYPE kueue_admission_attempt_duration_seconds histogram +kueue_admission_attempt_duration_seconds_bucket{replica_role="leader",result="success",le="1"} 1 +kueue_admission_attempt_duration_seconds_bucket{replica_role="leader",result="success",le="+Inf"} 1 +kueue_admission_attempt_duration_seconds_sum{replica_role="leader",result="success"} 0.5 +kueue_admission_attempt_duration_seconds_count{replica_role="leader",result="success"} 1 + +# HELP kueue_admission_attempts_total The total number of attempts to admit workloads.Each admission attempt might try to admit more than one workload. The label 'result' can have the following values:- 'success' means that at least one workload was admitted.,- 'inadmissible' means that no workload was admitted. +# TYPE kueue_admission_attempts_total counter +kueue_admission_attempts_total{replica_role="leader",priority_class="default-priority",result="success"} 1 + +# HELP kueue_admission_checks_wait_time_seconds The time from when a workload got the quota reservation until admission, per 'cluster_queue' +# TYPE kueue_admission_checks_wait_time_seconds histogram +kueue_admission_checks_wait_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority",le="1"} 1 +kueue_admission_checks_wait_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority",le="+Inf"} 1 +kueue_admission_checks_wait_time_seconds_sum{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 0.5 +kueue_admission_checks_wait_time_seconds_count{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_admission_cycle_preemption_skips The number of Workloads in the ClusterQueue that got preemption candidates but had to be skipped because other ClusterQueues needed the same resources in the same cycle +# TYPE kueue_admission_cycle_preemption_skips gauge 
+kueue_admission_cycle_preemption_skips{replica_role="leader"} 1 + +# HELP kueue_admission_wait_time_seconds The time between a workload was created or requeued until admission, per 'cluster_queue' +# TYPE kueue_admission_wait_time_seconds histogram +kueue_admission_wait_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority",le="1"} 1 +kueue_admission_wait_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority",le="+Inf"} 1 +kueue_admission_wait_time_seconds_sum{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 0.5 +kueue_admission_wait_time_seconds_count{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_admitted_until_ready_wait_time_seconds The time between a workload was admitted until ready, per 'cluster_queue' +# TYPE kueue_admitted_until_ready_wait_time_seconds histogram +kueue_admitted_until_ready_wait_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority",le="1"} 1 +kueue_admitted_until_ready_wait_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority",le="+Inf"} 1 +kueue_admitted_until_ready_wait_time_seconds_sum{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 0.5 +kueue_admitted_until_ready_wait_time_seconds_count{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_admitted_active_workloads The number of admitted Workloads that are active, per 'cluster_queue' +# TYPE kueue_admitted_active_workloads gauge +kueue_admitted_active_workloads{replica_role="leader",cluster_queue="cluster-queue"} 1 + +# HELP kueue_admitted_workloads_total The total number of admitted workloads per 'cluster_queue' +# TYPE kueue_admitted_workloads_total counter 
+kueue_admitted_workloads_total{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_build_info Kueue build information. 1 labeled by git version, git commit, build date, go version, compiler, platform +# TYPE kueue_build_info gauge +kueue_build_info{replica_role="leader",git_version="v0.11.0",git_commit="abc123",build_date="2026-01-01",go_version="go1.24",compiler="gc",platform="linux/amd64"} 1 + +# HELP kueue_cluster_queue_info Reports ClusterQueue hierarchy information. The metric has value 1 and can be joined using labels. +# TYPE kueue_cluster_queue_info gauge +kueue_cluster_queue_info{replica_role="leader",cluster_queue="cluster-queue"} 1 + +# HELP kueue_cluster_queue_status Reports 'cluster_queue' with its 'status' (with possible values 'pending', 'active' or 'terminated').For a ClusterQueue, the metric only reports a value of 1 for one of the statuses. +# TYPE kueue_cluster_queue_status gauge +kueue_cluster_queue_status{replica_role="leader",cluster_queue="cluster-queue",status="inadmissible"} 1 + +# HELP kueue_cluster_queue_weighted_share Reports a value that representing the maximum of the ratios of usage above nominalquota to the lendable resources in the cohort, among all the resources provided bythe ClusterQueue, and divided by the weight.If zero, it means that the usage of the ClusterQueue is below the nominal quota.If the ClusterQueue has a weight of zero and is borrowing, this will return NaN. +# TYPE kueue_cluster_queue_weighted_share gauge +kueue_cluster_queue_weighted_share{replica_role="leader",cluster_queue="cluster-queue"} 1 + +# HELP kueue_cohort_info Reports Cohort hierarchy information. The metric has value 1 and can be joined using labels. 
+# TYPE kueue_cohort_info gauge +kueue_cohort_info{replica_role="leader",cohort="default"} 1 + +# HELP kueue_cohort_subtree_admitted_active_workloads The number of admitted Workloads that are active, per cohort's subtree +# TYPE kueue_cohort_subtree_admitted_active_workloads gauge +kueue_cohort_subtree_admitted_active_workloads{replica_role="leader",cluster_queue="cluster-queue",cohort="default"} 1 + +# HELP kueue_cohort_subtree_admitted_workloads_total The total number of admitted workloads per cohort's subtree +# TYPE kueue_cohort_subtree_admitted_workloads_total counter +kueue_cohort_subtree_admitted_workloads_total{replica_role="leader",cluster_queue="cluster-queue",cohort="default",priority_class="default-priority"} 1 + +# HELP kueue_cohort_weighted_share Reports a value that representing the maximum of the ratios of usage above nominalquota to the lendable resources in the Cohort, among all the resources provided bythe Cohort, and divided by the weight.If zero, it means that the usage of the Cohort is below the nominal quota.If the Cohort has a weight of zero and is borrowing, this will return NaN. 
+# TYPE kueue_cohort_weighted_share gauge +kueue_cohort_weighted_share{replica_role="leader",cohort="default"} 1 + +# HELP kueue_evicted_workloads_total The number of evicted workloads per 'cluster_queue', The label 'reason' can have the following values:- "Preempted" means that the workload was evicted in order to free resources for a workload with a higher priority or reclamation of nominal quota.- "PodsReadyTimeout" means that the eviction took place due to a PodsReady timeout.- "AdmissionCheck" means that the workload was evicted because at least one admission check transitioned to False.- "ClusterQueueStopped" means that the workload was evicted because the ClusterQueue is stopped.- "LocalQueueStopped" means that the workload was evicted because the LocalQueue is stopped.- "NodeFailures" means that the workload was evicted due to node failures when using TopologyAwareScheduling.- "Deactivated" means that the workload was evicted because spec.active is set to false. The label 'underlying_cause' can have the following values:- "" means that the value in 'reason' label is the root cause for eviction.- "AdmissionCheck" means that the workload was evicted by Kueue due to a rejected admission check.- "MaximumExecutionTimeExceeded" means that the workload was evicted by Kueue due to maximum execution time exceeded.- "RequeuingLimitExceeded" means that the workload was evicted by Kueue due to requeuing limit exceeded. 
+# TYPE kueue_evicted_workloads_total counter +kueue_evicted_workloads_total{replica_role="leader",cluster_queue="cluster-queue",reason="Preempted",underlying_cause=""} 1 + +# HELP kueue_evicted_workloads_once_total The number of unique workload evictions per 'cluster_queue', The label 'reason' can have the following values:- "Preempted" means that the workload was evicted in order to free resources for a workload with a higher priority or reclamation of nominal quota.- "PodsReadyTimeout" means that the eviction took place due to a PodsReady timeout.- "AdmissionCheck" means that the workload was evicted because at least one admission check transitioned to False.- "ClusterQueueStopped" means that the workload was evicted because the ClusterQueue is stopped.- "LocalQueueStopped" means that the workload was evicted because the LocalQueue is stopped.- "NodeFailures" means that the workload was evicted due to node failures when using TopologyAwareScheduling.- "Deactivated" means that the workload was evicted because spec.active is set to false. The label 'underlying_cause' can have the following values:- "" means that the value in 'reason' label is the root cause for eviction.- "WaitForStart" means that the pods have not been ready since admission, or the workload is not admitted.- "WaitForRecovery" means that the Pods were ready since the workload admission, but some pod has failed.- "AdmissionCheck" means that the workload was evicted by Kueue due to a rejected admission check.- "MaximumExecutionTimeExceeded" means that the workload was evicted by Kueue due to maximum execution time exceeded.- "RequeuingLimitExceeded" means that the workload was evicted by Kueue due to requeuing limit exceeded. +# TYPE kueue_evicted_workloads_once_total counter +kueue_evicted_workloads_once_total{replica_role="leader",cluster_queue="cluster-queue",reason="Preempted",underlying_cause=""} 1 + +# HELP kueue_finished_workloads The number of finished workloads per 'cluster_queue'. 
+# TYPE kueue_finished_workloads gauge +kueue_finished_workloads{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_pending_workloads The number of pending workloads, per 'cluster_queue' and 'status'.'status' can have the following values:- "active" means that the workloads are in the admission queue.- "inadmissible" means there was a failed admission attempt for these workloads and they won't be retried until cluster conditions, which could make this workload admissible, change +# TYPE kueue_pending_workloads gauge +kueue_pending_workloads{replica_role="leader",cluster_queue="cluster-queue",status="inadmissible"} 1 + +# HELP kueue_pods_ready_to_evicted_time_seconds The number of seconds between a workload's pods being ready and eviction workloads per 'cluster_queue', The label 'reason' can have the following values:- "Preempted" means that the workload was evicted in order to free resources for a workload with a higher priority or reclamation of nominal quota.- "PodsReadyTimeout" means that the eviction took place due to a PodsReady timeout.- "AdmissionCheck" means that the workload was evicted because at least one admission check transitioned to False.- "ClusterQueueStopped" means that the workload was evicted because the ClusterQueue is stopped.- "LocalQueueStopped" means that the workload was evicted because the LocalQueue is stopped.- "NodeFailures" means that the workload was evicted due to node failures when using TopologyAwareScheduling.- "Deactivated" means that the workload was evicted because spec.active is set to false. 
The label 'underlying_cause' can have the following values:- "" means that the value in 'reason' label is the root cause for eviction.- "AdmissionCheck" means that the workload was evicted by Kueue due to a rejected admission check.- "MaximumExecutionTimeExceeded" means that the workload was evicted by Kueue due to maximum execution time exceeded.- "RequeuingLimitExceeded" means that the workload was evicted by Kueue due to requeuing limit exceeded. +# TYPE kueue_pods_ready_to_evicted_time_seconds histogram +kueue_pods_ready_to_evicted_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",reason="Preempted",underlying_cause="",le="1"} 1 +kueue_pods_ready_to_evicted_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",reason="Preempted",underlying_cause="",le="+Inf"} 1 +kueue_pods_ready_to_evicted_time_seconds_sum{replica_role="leader",cluster_queue="cluster-queue",reason="Preempted",underlying_cause=""} 0.5 +kueue_pods_ready_to_evicted_time_seconds_count{replica_role="leader",cluster_queue="cluster-queue",reason="Preempted",underlying_cause=""} 1 + +# HELP kueue_preempted_workloads_total The number of preempted workloads per 'preempting_cluster_queue', The label 'reason' can have the following values:- "InClusterQueue" means that the workload was preempted by a workload in the same ClusterQueue.- "InCohortReclamation" means that the workload was preempted by a workload in the same cohort due to reclamation of nominal quota.- "InCohortFairSharing" means that the workload was preempted by a workload in the same cohort Fair Sharing.- "InCohortReclaimWhileBorrowing" means that the workload was preempted by a workload in the same cohort due to reclamation of nominal quota while borrowing. 
+# TYPE kueue_preempted_workloads_total counter +kueue_preempted_workloads_total{replica_role="leader",reason="Preempted",underlying_cause="",preempting_cluster_queue="cluster-queue"} 1 + +# HELP kueue_quota_reserved_wait_time_seconds The time between a workload was created or requeued until it got quota reservation, per 'cluster_queue' +# TYPE kueue_quota_reserved_wait_time_seconds histogram +kueue_quota_reserved_wait_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority",le="1"} 1 +kueue_quota_reserved_wait_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority",le="+Inf"} 1 +kueue_quota_reserved_wait_time_seconds_sum{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 0.5 +kueue_quota_reserved_wait_time_seconds_count{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_quota_reserved_workloads_total The total number of quota reserved workloads per 'cluster_queue' +# TYPE kueue_quota_reserved_workloads_total counter +kueue_quota_reserved_workloads_total{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_ready_wait_time_seconds The time between a workload was created or requeued until ready, per 'cluster_queue' +# TYPE kueue_ready_wait_time_seconds histogram +kueue_ready_wait_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority",le="1"} 1 +kueue_ready_wait_time_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority",le="+Inf"} 1 +kueue_ready_wait_time_seconds_sum{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 0.5 +kueue_ready_wait_time_seconds_count{replica_role="leader",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_replaced_workload_slices_total The 
number of replaced workload slices per 'cluster_queue' +# TYPE kueue_replaced_workload_slices_total counter +kueue_replaced_workload_slices_total{replica_role="leader",cluster_queue="cluster-queue"} 1 + +# HELP kueue_resource_flavor_quota_reserved_workloads Kueue kueue resource flavor quota reserved workloads metric. +# TYPE kueue_resource_flavor_quota_reserved_workloads gauge +kueue_resource_flavor_quota_reserved_workloads{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="cpu",priority_class="default-priority"} 1 + +# HELP kueue_reserving_active_workloads The number of Workloads that are reserving quota, per 'cluster_queue' +# TYPE kueue_reserving_active_workloads gauge +kueue_reserving_active_workloads{replica_role="leader",cluster_queue="cluster-queue"} 1 + +# HELP kueue_workload_creation_latency_seconds The time between a job was created until its workload was created, per 'job_kind'. Entries are only recorded for objects with generation 1. +# TYPE kueue_workload_creation_latency_seconds histogram +kueue_workload_creation_latency_seconds_bucket{replica_role="leader",job_kind="Job",le="1"} 1 +kueue_workload_creation_latency_seconds_bucket{replica_role="leader",job_kind="Job",le="+Inf"} 1 +kueue_workload_creation_latency_seconds_sum{replica_role="leader",job_kind="Job"} 0.5 +kueue_workload_creation_latency_seconds_count{replica_role="leader",job_kind="Job"} 1 + +# HELP kueue_workload_eviction_latency_seconds The time from workload eviction (WorkloadEvicted condition becomes True) until the workload returns to Pending (quota released).Observed on status transition from admitted or quota-reserved to pending while WorkloadEvicted remains True.Each matching update observes one latency sample (seconds) into this histogram; Prometheus aggregates samples across workloads.Uses the eviction condition LastTransitionTime on the updated object as the start time; cluster_queue is taken from status.admission.cluster_queue on the pre-update 
object when set and non-empty (otherwise no sample is recorded for that update). The label 'reason' can have the following values:- "Preempted" means that the workload was evicted in order to free resources for a workload with a higher priority or reclamation of nominal quota.- "PodsReadyTimeout" means that the eviction took place due to a PodsReady timeout.- "AdmissionCheck" means that the workload was evicted because at least one admission check transitioned to False.- "ClusterQueueStopped" means that the workload was evicted because the ClusterQueue is stopped.- "LocalQueueStopped" means that the workload was evicted because the LocalQueue is stopped.- "NodeFailures" means that the workload was evicted due to node failures when using TopologyAwareScheduling.- "Deactivated" means that the workload was evicted because spec.active is set to false. +# TYPE kueue_workload_eviction_latency_seconds histogram +kueue_workload_eviction_latency_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",reason="Preempted",underlying_cause="",le="1"} 1 +kueue_workload_eviction_latency_seconds_bucket{replica_role="leader",cluster_queue="cluster-queue",reason="Preempted",underlying_cause="",le="+Inf"} 1 +kueue_workload_eviction_latency_seconds_sum{replica_role="leader",cluster_queue="cluster-queue",reason="Preempted",underlying_cause=""} 0.5 +kueue_workload_eviction_latency_seconds_count{replica_role="leader",cluster_queue="cluster-queue",reason="Preempted",underlying_cause=""} 1 + +# HELP leader_election_master_status Kueue leader election master status metric. +# TYPE leader_election_master_status gauge +leader_election_master_status{replica_role="leader",status="inadmissible"} 1 + +# HELP process_cpu_seconds Kueue process cpu seconds metric. +# TYPE process_cpu_seconds gauge +process_cpu_seconds{replica_role="leader"} 1 + +# HELP process_max_fds Kueue process max fds metric. 
+# TYPE process_max_fds gauge +process_max_fds{replica_role="leader"} 1 + +# HELP process_network_receive_bytes Kueue process network receive bytes metric. +# TYPE process_network_receive_bytes gauge +process_network_receive_bytes{replica_role="leader"} 1 + +# HELP process_network_transmit_bytes Kueue process network transmit bytes metric. +# TYPE process_network_transmit_bytes gauge +process_network_transmit_bytes{replica_role="leader"} 1 + +# HELP process_open_fds Kueue process open fds metric. +# TYPE process_open_fds gauge +process_open_fds{replica_role="leader"} 1 + +# HELP process_resident_memory_bytes Kueue process resident memory bytes metric. +# TYPE process_resident_memory_bytes gauge +process_resident_memory_bytes{replica_role="leader"} 1 + +# HELP process_start_time_seconds Kueue process start time seconds metric. +# TYPE process_start_time_seconds gauge +process_start_time_seconds{replica_role="leader"} 1 + +# HELP process_virtual_memory_bytes Kueue process virtual memory bytes metric. +# TYPE process_virtual_memory_bytes gauge +process_virtual_memory_bytes{replica_role="leader"} 1 + +# HELP process_virtual_memory_max_bytes Kueue process virtual memory max bytes metric. +# TYPE process_virtual_memory_max_bytes gauge +process_virtual_memory_max_bytes{replica_role="leader"} 1 + +# HELP workqueue_adds Kueue workqueue adds metric. +# TYPE workqueue_adds gauge +workqueue_adds{replica_role="leader"} 1 + +# HELP workqueue_depth Kueue workqueue depth metric. +# TYPE workqueue_depth gauge +workqueue_depth{replica_role="leader"} 1 + +# HELP workqueue_longest_running_processor_seconds Kueue workqueue longest running processor seconds metric. +# TYPE workqueue_longest_running_processor_seconds gauge +workqueue_longest_running_processor_seconds{replica_role="leader"} 1 + +# HELP workqueue_queue_duration_seconds Kueue workqueue queue duration seconds metric. 
+# TYPE workqueue_queue_duration_seconds gauge +workqueue_queue_duration_seconds{replica_role="leader"} 1 + +# HELP workqueue_retries Kueue workqueue retries metric. +# TYPE workqueue_retries gauge +workqueue_retries{replica_role="leader"} 1 + +# HELP workqueue_unfinished_work_seconds Kueue workqueue unfinished work seconds metric. +# TYPE workqueue_unfinished_work_seconds gauge +workqueue_unfinished_work_seconds{replica_role="leader",cluster_queue="cluster-queue"} 1 + +# HELP workqueue_work_duration_seconds Kueue workqueue work duration seconds metric. +# TYPE workqueue_work_duration_seconds gauge +workqueue_work_duration_seconds{replica_role="leader"} 1 + +# HELP kueue_local_queue_admission_checks_wait_time_seconds The time from when a workload got the quota reservation until admission, per 'local_queue' +# TYPE kueue_local_queue_admission_checks_wait_time_seconds histogram +kueue_local_queue_admission_checks_wait_time_seconds_bucket{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority",le="1"} 1 +kueue_local_queue_admission_checks_wait_time_seconds_bucket{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority",le="+Inf"} 1 +kueue_local_queue_admission_checks_wait_time_seconds_sum{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 0.5 +kueue_local_queue_admission_checks_wait_time_seconds_count{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_local_queue_admission_wait_time_seconds The time between a workload was created or requeued until admission, per 'local_queue' +# TYPE kueue_local_queue_admission_wait_time_seconds histogram 
+kueue_local_queue_admission_wait_time_seconds_bucket{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority",le="1"} 1 +kueue_local_queue_admission_wait_time_seconds_bucket{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority",le="+Inf"} 1 +kueue_local_queue_admission_wait_time_seconds_sum{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 0.5 +kueue_local_queue_admission_wait_time_seconds_count{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_local_queue_admitted_until_ready_wait_time_seconds The time between a workload was admitted until ready, per 'local_queue' +# TYPE kueue_local_queue_admitted_until_ready_wait_time_seconds histogram +kueue_local_queue_admitted_until_ready_wait_time_seconds_bucket{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority",le="1"} 1 +kueue_local_queue_admitted_until_ready_wait_time_seconds_bucket{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority",le="+Inf"} 1 +kueue_local_queue_admitted_until_ready_wait_time_seconds_sum{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 0.5 +kueue_local_queue_admitted_until_ready_wait_time_seconds_count{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_local_queue_admitted_active_workloads The number of admitted Workloads that are active, per 'localQueue' +# TYPE kueue_local_queue_admitted_active_workloads gauge 
+kueue_local_queue_admitted_active_workloads{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue"} 1 + +# HELP kueue_local_queue_admitted_workloads_total The total number of admitted workloads per 'local_queue' +# TYPE kueue_local_queue_admitted_workloads_total counter +kueue_local_queue_admitted_workloads_total{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_local_queue_evicted_workloads_total The number of evicted workloads per 'local_queue', The label 'reason' can have the following values:- "Preempted" means that the workload was evicted in order to free resources for a workload with a higher priority or reclamation of nominal quota.- "PodsReadyTimeout" means that the eviction took place due to a PodsReady timeout.- "AdmissionCheck" means that the workload was evicted because at least one admission check transitioned to False.- "ClusterQueueStopped" means that the workload was evicted because the ClusterQueue is stopped.- "LocalQueueStopped" means that the workload was evicted because the LocalQueue is stopped.- "NodeFailures" means that the workload was evicted due to node failures when using TopologyAwareScheduling.- "Deactivated" means that the workload was evicted because spec.active is set to false. The label 'underlying_cause' can have the following values:- "" means that the value in 'reason' label is the root cause for eviction.- "AdmissionCheck" means that the workload was evicted by Kueue due to a rejected admission check.- "MaximumExecutionTimeExceeded" means that the workload was evicted by Kueue due to maximum execution time exceeded.- "RequeuingLimitExceeded" means that the workload was evicted by Kueue due to requeuing limit exceeded. 
+# TYPE kueue_local_queue_evicted_workloads_total counter +kueue_local_queue_evicted_workloads_total{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",reason="Preempted",underlying_cause=""} 1 + +# HELP kueue_local_queue_finished_workloads The number of finished workloads, per 'local_queue'. +# TYPE kueue_local_queue_finished_workloads gauge +kueue_local_queue_finished_workloads{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_local_queue_finished_workloads_total The total number of finished workloads per 'local_queue' +# TYPE kueue_local_queue_finished_workloads_total counter +kueue_local_queue_finished_workloads_total{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_local_queue_pending_workloads The number of pending workloads, per 'local_queue' and 'status'.'status' can have the following values:- "active" means that the workloads are in the admission queue.- "inadmissible" means there was a failed admission attempt for these workloads and they won't be retried until cluster conditions, which could make this workload admissible, change +# TYPE kueue_local_queue_pending_workloads gauge +kueue_local_queue_pending_workloads{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",status="inadmissible"} 1 + +# HELP kueue_local_queue_quota_reserved_wait_time_seconds The time between a workload was created or requeued until it got quota reservation, per 'local_queue' +# TYPE kueue_local_queue_quota_reserved_wait_time_seconds histogram +kueue_local_queue_quota_reserved_wait_time_seconds_bucket{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority",le="1"} 1 
+kueue_local_queue_quota_reserved_wait_time_seconds_bucket{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority",le="+Inf"} 1 +kueue_local_queue_quota_reserved_wait_time_seconds_sum{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 0.5 +kueue_local_queue_quota_reserved_wait_time_seconds_count{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_local_queue_quota_reserved_workloads_total The total number of quota reserved workloads per 'local_queue' +# TYPE kueue_local_queue_quota_reserved_workloads_total counter +kueue_local_queue_quota_reserved_workloads_total{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_local_queue_ready_wait_time_seconds The time between a workload was created or requeued until ready, per 'local_queue' +# TYPE kueue_local_queue_ready_wait_time_seconds histogram +kueue_local_queue_ready_wait_time_seconds_bucket{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority",le="1"} 1 +kueue_local_queue_ready_wait_time_seconds_bucket{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority",le="+Inf"} 1 +kueue_local_queue_ready_wait_time_seconds_sum{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 0.5 +kueue_local_queue_ready_wait_time_seconds_count{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue",priority_class="default-priority"} 1 + +# HELP kueue_local_queue_reserving_active_workloads The number of Workloads that are reserving quota, per 'localQueue' +# TYPE 
kueue_local_queue_reserving_active_workloads gauge +kueue_local_queue_reserving_active_workloads{replica_role="leader",name="user-queue",namespace="default",cluster_queue="cluster-queue"} 1 + +# HELP kueue_local_queue_status Reports 'localQueue' with its 'active' status (with possible values 'True', 'False', or 'Unknown').For a LocalQueue, the metric only reports a value of 1 for one of the statuses. +# TYPE kueue_local_queue_status gauge +kueue_local_queue_status{replica_role="leader",name="user-queue",namespace="default",status="active"} 1 + +# HELP kueue_cluster_queue_borrowing_limit Reports the cluster_queue's resource borrowing limit within all the flavors +# TYPE kueue_cluster_queue_borrowing_limit gauge +kueue_cluster_queue_borrowing_limit{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="cpu"} 1 + +# HELP kueue_cluster_queue_borrowing_limit Reports the cluster_queue's resource borrowing limit within all the flavors +# TYPE kueue_cluster_queue_borrowing_limit gauge +kueue_cluster_queue_borrowing_limit{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="memory"} 1 + +# HELP kueue_cluster_queue_borrowing_limit Reports the cluster_queue's resource borrowing limit within all the flavors +# TYPE kueue_cluster_queue_borrowing_limit gauge +kueue_cluster_queue_borrowing_limit{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="nvidia.com/gpu"} 1 + +# HELP kueue_cluster_queue_borrowing_limit Reports the cluster_queue's resource borrowing limit within all the flavors +# TYPE kueue_cluster_queue_borrowing_limit gauge +kueue_cluster_queue_borrowing_limit{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="example.com/fpga"} 1 + +# HELP kueue_cluster_queue_lending_limit Reports the cluster_queue's resource lending limit within all the flavors +# TYPE kueue_cluster_queue_lending_limit gauge 
+kueue_cluster_queue_lending_limit{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="cpu"} 1 + +# HELP kueue_cluster_queue_lending_limit Reports the cluster_queue's resource lending limit within all the flavors +# TYPE kueue_cluster_queue_lending_limit gauge +kueue_cluster_queue_lending_limit{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="memory"} 1 + +# HELP kueue_cluster_queue_lending_limit Reports the cluster_queue's resource lending limit within all the flavors +# TYPE kueue_cluster_queue_lending_limit gauge +kueue_cluster_queue_lending_limit{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="nvidia.com/gpu"} 1 + +# HELP kueue_cluster_queue_lending_limit Reports the cluster_queue's resource lending limit within all the flavors +# TYPE kueue_cluster_queue_lending_limit gauge +kueue_cluster_queue_lending_limit{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="example.com/fpga"} 1 + +# HELP kueue_cluster_queue_nominal_quota Reports the cluster_queue's resource nominal quota within all the flavors +# TYPE kueue_cluster_queue_nominal_quota gauge +kueue_cluster_queue_nominal_quota{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="cpu"} 1 + +# HELP kueue_cluster_queue_nominal_quota Reports the cluster_queue's resource nominal quota within all the flavors +# TYPE kueue_cluster_queue_nominal_quota gauge +kueue_cluster_queue_nominal_quota{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="memory"} 1 + +# HELP kueue_cluster_queue_nominal_quota Reports the cluster_queue's resource nominal quota within all the flavors +# TYPE kueue_cluster_queue_nominal_quota gauge +kueue_cluster_queue_nominal_quota{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="nvidia.com/gpu"} 1 + +# HELP kueue_cluster_queue_nominal_quota Reports the 
cluster_queue's resource nominal quota within all the flavors +# TYPE kueue_cluster_queue_nominal_quota gauge +kueue_cluster_queue_nominal_quota{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="example.com/fpga"} 1 + +# HELP kueue_cluster_queue_resource_pending Reports the cluster_queue's total pending resource requests. Unlike resource_reservation, pending workloads have not yet been assigned to flavors. +# TYPE kueue_cluster_queue_resource_pending gauge +kueue_cluster_queue_resource_pending{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="cpu"} 1 + +# HELP kueue_cluster_queue_resource_pending Reports the cluster_queue's total pending resource requests. Unlike resource_reservation, pending workloads have not yet been assigned to flavors. +# TYPE kueue_cluster_queue_resource_pending gauge +kueue_cluster_queue_resource_pending{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="memory"} 1 + +# HELP kueue_cluster_queue_resource_pending Reports the cluster_queue's total pending resource requests. Unlike resource_reservation, pending workloads have not yet been assigned to flavors. +# TYPE kueue_cluster_queue_resource_pending gauge +kueue_cluster_queue_resource_pending{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="nvidia.com/gpu"} 1 + +# HELP kueue_cluster_queue_resource_pending Reports the cluster_queue's total pending resource requests. Unlike resource_reservation, pending workloads have not yet been assigned to flavors. 
+# TYPE kueue_cluster_queue_resource_pending gauge +kueue_cluster_queue_resource_pending{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="example.com/fpga"} 1 + +# HELP kueue_cluster_queue_resource_reservation Reports the cluster_queue's total resource reservation within all the flavors +# TYPE kueue_cluster_queue_resource_reservation gauge +kueue_cluster_queue_resource_reservation{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="cpu"} 1 + +# HELP kueue_cluster_queue_resource_reservation Reports the cluster_queue's total resource reservation within all the flavors +# TYPE kueue_cluster_queue_resource_reservation gauge +kueue_cluster_queue_resource_reservation{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="memory"} 1 + +# HELP kueue_cluster_queue_resource_reservation Reports the cluster_queue's total resource reservation within all the flavors +# TYPE kueue_cluster_queue_resource_reservation gauge +kueue_cluster_queue_resource_reservation{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="nvidia.com/gpu"} 1 + +# HELP kueue_cluster_queue_resource_reservation Reports the cluster_queue's total resource reservation within all the flavors +# TYPE kueue_cluster_queue_resource_reservation gauge +kueue_cluster_queue_resource_reservation{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="example.com/fpga"} 1 + +# HELP kueue_cluster_queue_resource_usage Reports the cluster_queue's total resource usage within all the flavors +# TYPE kueue_cluster_queue_resource_usage gauge +kueue_cluster_queue_resource_usage{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="cpu"} 1 + +# HELP kueue_cluster_queue_resource_usage Reports the cluster_queue's total resource usage within all the flavors +# TYPE kueue_cluster_queue_resource_usage gauge 
+kueue_cluster_queue_resource_usage{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="memory"} 1 + +# HELP kueue_cluster_queue_resource_usage Reports the cluster_queue's total resource usage within all the flavors +# TYPE kueue_cluster_queue_resource_usage gauge +kueue_cluster_queue_resource_usage{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="nvidia.com/gpu"} 1 + +# HELP kueue_cluster_queue_resource_usage Reports the cluster_queue's total resource usage within all the flavors +# TYPE kueue_cluster_queue_resource_usage gauge +kueue_cluster_queue_resource_usage{replica_role="leader",cluster_queue="cluster-queue",flavor="default-flavor",resource="example.com/fpga"} 1 + +# HELP kueue_cohort_subtree_quota Reports the cohort's nominal quota aggregated within the cohort's subtree. The values are reported per resource and flavor +# TYPE kueue_cohort_subtree_quota gauge +kueue_cohort_subtree_quota{replica_role="leader",cohort="default",flavor="default-flavor",resource="cpu"} 1 + +# HELP kueue_cohort_subtree_quota Reports the cohort's nominal quota aggregated within the cohort's subtree. The values are reported per resource and flavor +# TYPE kueue_cohort_subtree_quota gauge +kueue_cohort_subtree_quota{replica_role="leader",cohort="default",flavor="default-flavor",resource="memory"} 1 + +# HELP kueue_cohort_subtree_quota Reports the cohort's nominal quota aggregated within the cohort's subtree. The values are reported per resource and flavor +# TYPE kueue_cohort_subtree_quota gauge +kueue_cohort_subtree_quota{replica_role="leader",cohort="default",flavor="default-flavor",resource="nvidia.com/gpu"} 1 + +# HELP kueue_cohort_subtree_quota Reports the cohort's nominal quota aggregated within the cohort's subtree. 
The values are reported per resource and flavor +# TYPE kueue_cohort_subtree_quota gauge +kueue_cohort_subtree_quota{replica_role="leader",cohort="default",flavor="default-flavor",resource="example.com/fpga"} 1 + +# HELP kueue_cohort_subtree_resource_reservations Reports the cohort's resource reservations aggregated within the cohort's subtree. The values are reported per resource and flavor +# TYPE kueue_cohort_subtree_resource_reservations gauge +kueue_cohort_subtree_resource_reservations{replica_role="leader",cohort="default",flavor="default-flavor",resource="cpu"} 1 + +# HELP kueue_cohort_subtree_resource_reservations Reports the cohort's resource reservations aggregated within the cohort's subtree. The values are reported per resource and flavor +# TYPE kueue_cohort_subtree_resource_reservations gauge +kueue_cohort_subtree_resource_reservations{replica_role="leader",cohort="default",flavor="default-flavor",resource="memory"} 1 + +# HELP kueue_cohort_subtree_resource_reservations Reports the cohort's resource reservations aggregated within the cohort's subtree. The values are reported per resource and flavor +# TYPE kueue_cohort_subtree_resource_reservations gauge +kueue_cohort_subtree_resource_reservations{replica_role="leader",cohort="default",flavor="default-flavor",resource="nvidia.com/gpu"} 1 + +# HELP kueue_cohort_subtree_resource_reservations Reports the cohort's resource reservations aggregated within the cohort's subtree. 
The values are reported per resource and flavor +# TYPE kueue_cohort_subtree_resource_reservations gauge +kueue_cohort_subtree_resource_reservations{replica_role="leader",cohort="default",flavor="default-flavor",resource="example.com/fpga"} 1 + +# HELP kueue_local_queue_resource_reservation Reports the localQueue's total resource reservation within all the flavors +# TYPE kueue_local_queue_resource_reservation gauge +kueue_local_queue_resource_reservation{replica_role="leader",name="user-queue",namespace="default",flavor="default-flavor",resource="cpu"} 1 + +# HELP kueue_local_queue_resource_reservation Reports the localQueue's total resource reservation within all the flavors +# TYPE kueue_local_queue_resource_reservation gauge +kueue_local_queue_resource_reservation{replica_role="leader",name="user-queue",namespace="default",flavor="default-flavor",resource="memory"} 1 + +# HELP kueue_local_queue_resource_reservation Reports the localQueue's total resource reservation within all the flavors +# TYPE kueue_local_queue_resource_reservation gauge +kueue_local_queue_resource_reservation{replica_role="leader",name="user-queue",namespace="default",flavor="default-flavor",resource="nvidia.com/gpu"} 1 + +# HELP kueue_local_queue_resource_reservation Reports the localQueue's total resource reservation within all the flavors +# TYPE kueue_local_queue_resource_reservation gauge +kueue_local_queue_resource_reservation{replica_role="leader",name="user-queue",namespace="default",flavor="default-flavor",resource="example.com/fpga"} 1 + +# HELP kueue_local_queue_resource_usage Reports the localQueue's total resource usage within all the flavors +# TYPE kueue_local_queue_resource_usage gauge +kueue_local_queue_resource_usage{replica_role="leader",name="user-queue",namespace="default",flavor="default-flavor",resource="cpu"} 1 + +# HELP kueue_local_queue_resource_usage Reports the localQueue's total resource usage within all the flavors +# TYPE kueue_local_queue_resource_usage gauge 
+kueue_local_queue_resource_usage{replica_role="leader",name="user-queue",namespace="default",flavor="default-flavor",resource="memory"} 1 + +# HELP kueue_local_queue_resource_usage Reports the localQueue's total resource usage within all the flavors +# TYPE kueue_local_queue_resource_usage gauge +kueue_local_queue_resource_usage{replica_role="leader",name="user-queue",namespace="default",flavor="default-flavor",resource="nvidia.com/gpu"} 1 + +# HELP kueue_local_queue_resource_usage Reports the localQueue's total resource usage within all the flavors +# TYPE kueue_local_queue_resource_usage gauge +kueue_local_queue_resource_usage{replica_role="leader",name="user-queue",namespace="default",flavor="default-flavor",resource="example.com/fpga"} 1 diff --git a/kueue/tests/kind/kind-config.yaml b/kueue/tests/kind/kind-config.yaml new file mode 100644 index 0000000000000..2aa5449a65fce --- /dev/null +++ b/kueue/tests/kind/kind-config.yaml @@ -0,0 +1,12 @@ +apiVersion: kind.x-k8s.io/v1alpha4 +kind: Cluster +networking: + # Use non-default CIDRs to avoid colliding with the host environment's + # Kubernetes service/pod networks, which otherwise hijacks in-cluster + # traffic to the API server and breaks Kueue's webhook cert bootstrap. 
+ apiServerAddress: 127.0.0.1 + serviceSubnet: 10.245.0.0/16 + podSubnet: 10.246.0.0/16 +nodes: +- role: control-plane + image: kindest/node:v1.34.8@sha256:02722c2dedddcfc00febf5d27fbeb9b7b2c14294c82109ff4a85d89ac9ba3256 diff --git a/kueue/tests/kind/kueue-config.yaml b/kueue/tests/kind/kueue-config.yaml new file mode 100644 index 0000000000000..b51d5a14e4d73 --- /dev/null +++ b/kueue/tests/kind/kueue-config.yaml @@ -0,0 +1,50 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: kueue-manager-config + namespace: kueue-system +data: + controller_manager_config.yaml: | + apiVersion: config.kueue.x-k8s.io/v1beta2 + kind: Configuration + health: + healthProbeBindAddress: :8081 + metrics: + bindAddress: :8443 + enableClusterQueueResources: true + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io + controller: + groupKindConcurrency: + Job.batch: 5 + Pod: 5 + Workload.kueue.x-k8s.io: 5 + LocalQueue.kueue.x-k8s.io: 1 + Cohort.kueue.x-k8s.io: 1 + ClusterQueue.kueue.x-k8s.io: 1 + ResourceFlavor.kueue.x-k8s.io: 1 + clientConnection: + qps: 50 + burst: 100 + integrations: + frameworks: + - "batch/job" + - "kubeflow.org/mpijob" + - "ray.io/rayjob" + - "ray.io/raycluster" + - "ray.io/rayservice" + - "jobset.x-k8s.io/jobset" + - "kubeflow.org/paddlejob" + - "kubeflow.org/pytorchjob" + - "kubeflow.org/tfjob" + - "kubeflow.org/xgboostjob" + - "kubeflow.org/jaxjob" + - "workload.codeflare.dev/appwrapper" + - "trainer.kubeflow.org/trainjob" + - "pod" + - "deployment" + - "statefulset" + - "leaderworkerset.x-k8s.io/leaderworkerset" diff --git a/kueue/tests/kind/metrics-reader.yaml b/kueue/tests/kind/metrics-reader.yaml new file mode 100644 index 0000000000000..3868102a84b62 --- /dev/null +++ b/kueue/tests/kind/metrics-reader.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kueue-metrics-reader + namespace: default +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + 
name: kueue-metrics-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kueue-metrics-reader +subjects: +- kind: ServiceAccount + name: kueue-metrics-reader + namespace: default diff --git a/kueue/tests/kind/queue.yaml b/kueue/tests/kind/queue.yaml new file mode 100644 index 0000000000000..c37c29b7e472e --- /dev/null +++ b/kueue/tests/kind/queue.yaml @@ -0,0 +1,30 @@ +apiVersion: kueue.x-k8s.io/v1beta2 +kind: ResourceFlavor +metadata: + name: default-flavor +--- +apiVersion: kueue.x-k8s.io/v1beta2 +kind: ClusterQueue +metadata: + name: cluster-queue +spec: + namespaceSelector: {} + resourceGroups: + - coveredResources: + - cpu + - memory + flavors: + - name: default-flavor + resources: + - name: cpu + nominalQuota: 1 + - name: memory + nominalQuota: 1Gi +--- +apiVersion: kueue.x-k8s.io/v1beta2 +kind: LocalQueue +metadata: + name: user-queue + namespace: default +spec: + clusterQueue: cluster-queue diff --git a/kueue/tests/kind/workloads.yaml b/kueue/tests/kind/workloads.yaml new file mode 100644 index 0000000000000..036211b1c44ab --- /dev/null +++ b/kueue/tests/kind/workloads.yaml @@ -0,0 +1,45 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: scheduled-workload + namespace: default + labels: + kueue.x-k8s.io/queue-name: user-queue +spec: + suspend: true + parallelism: 1 + completions: 1 + template: + spec: + restartPolicy: Never + containers: + - name: workload + image: alpine:3.19.1 + command: ['sh', '-c', 'sleep 3600'] + resources: + requests: + cpu: 100m + memory: 128Mi +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: unschedulable-workload + namespace: default + labels: + kueue.x-k8s.io/queue-name: user-queue +spec: + suspend: true + parallelism: 1 + completions: 1 + template: + spec: + restartPolicy: Never + containers: + - name: workload + image: alpine:3.19.1 + command: ['sh', '-c', 'sleep 3600'] + resources: + requests: + cpu: '2' + memory: 2Gi diff --git a/kueue/tests/test_e2e.py b/kueue/tests/test_e2e.py new 
file mode 100644 index 0000000000000..95c89fedfc686 --- /dev/null +++ b/kueue/tests/test_e2e.py @@ -0,0 +1,21 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +import pytest + +from datadog_checks.dev.utils import get_metadata_metrics + +from .common import EXPECTED_METRIC_TAGS + + +@pytest.mark.e2e +def test_e2e(dd_agent_check): + aggregator = dd_agent_check() + + aggregator.assert_metrics_using_metadata(get_metadata_metrics(), check_submission_type=True) + + for metric, tags in EXPECTED_METRIC_TAGS.items(): + aggregator.assert_metric(metric, at_least=1) + for tag in tags: + aggregator.assert_metric_has_tag(metric, tag) diff --git a/kueue/tests/test_unit.py b/kueue/tests/test_unit.py new file mode 100644 index 0000000000000..adfa51eb0d530 --- /dev/null +++ b/kueue/tests/test_unit.py @@ -0,0 +1,97 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +import pytest + +from datadog_checks.dev.utils import get_metadata_metrics +from datadog_checks.kueue import KueueCheck +from datadog_checks.kueue.check import OTHER_RESOURCE_NAME, RESOURCE_NAME_MAP +from datadog_checks.kueue.metrics import LOCAL_QUEUE_METRIC_MAP, METRIC_MAP, RESOURCE_METRIC_MAP + +from .common import EXPECTED_METRIC_TAGS, UNIT_METRICS, get_fixture_path + +pytestmark = pytest.mark.unit + + +def test_mapped_metrics_are_in_metadata(): + mapped_metrics = set() + mapped_metrics.update(_metadata_metric_name(metric_name) for metric_name in METRIC_MAP.values()) + mapped_metrics.update(_metadata_metric_name(metric_name) for metric_name in LOCAL_QUEUE_METRIC_MAP.values()) + + resource_names = {OTHER_RESOURCE_NAME, *RESOURCE_NAME_MAP.values()} + for metric_name in RESOURCE_METRIC_MAP.values(): + mapped_metrics.update( + _metadata_metric_name(f'{metric_name}.{resource_name}') for resource_name in resource_names + ) + + metadata_metrics = {_base_metric_name(metric) for 
metric in get_metadata_metrics()} + assert mapped_metrics.issubset(metadata_metrics) + + +def _metadata_metric_name(metric_name): + if isinstance(metric_name, dict): + metric_name = metric_name['name'] + + return f'kueue.{metric_name}' + + +def _base_metric_name(metric_name): + for suffix in ('.bucket', '.count', '.sum'): + if metric_name.endswith(suffix): + return metric_name[: -len(suffix)] + + return metric_name + + +def test_check(dd_run_check, aggregator, instance, mock_http_response): + mock_http_response(file_path=get_fixture_path('metrics.txt')) + + check = KueueCheck('kueue', {}, [instance]) + dd_run_check(check) + + for metric in UNIT_METRICS: + aggregator.assert_metric(metric) + aggregator.assert_metric_has_tag(metric, 'test:tag') + + for metric, tags in EXPECTED_METRIC_TAGS.items(): + for tag in tags: + aggregator.assert_metric_has_tag(metric, tag) + aggregator.assert_metrics_using_metadata( + get_metadata_metrics(), + check_submission_type=True, + check_symmetric_inclusion=True, + ) + + +def test_resource_name_map(dd_run_check, aggregator, instance, mock_http_response): + mock_http_response(file_path=get_fixture_path('metrics.txt')) + instance = { + **instance, + 'resource_name_map': { + 'example.com/fpga': 'fpga', + 'nvidia.com/gpu': 'custom_gpu', + }, + } + + check = KueueCheck('kueue', {}, [instance]) + dd_run_check(check) + + aggregator.assert_metric('kueue.cluster_queue.resource_usage.cpu') + aggregator.assert_metric('kueue.cluster_queue.resource_usage.gpu') + aggregator.assert_metric('kueue.cluster_queue.resource_usage.memory') + aggregator.assert_metric('kueue.cluster_queue.resource_usage.fpga') + aggregator.assert_metric('kueue.cluster_queue.resource_usage.custom_gpu', count=0) + aggregator.assert_metric_has_tag('kueue.cluster_queue.resource_usage.fpga', 'test:tag') + aggregator.assert_metric_has_tag('kueue.cluster_queue.resource_usage.fpga', 'kueue_cluster_queue:cluster-queue') + 
aggregator.assert_metric_has_tag('kueue.cluster_queue.resource_usage.fpga', 'kueue_resource_flavor:default-flavor') + aggregator.assert_metric_has_tag('kueue.cluster_queue.resource_usage.fpga', 'replica_role:leader') + + +def test_empty_instance(dd_run_check): + with pytest.raises( + Exception, + match='openmetrics_endpoint\\n Field required', + ): + check = KueueCheck('kueue', {}, [{}]) + dd_run_check(check) From 2f66fc4a71e0549d6bd53c983ff2ab11b83385f0 Mon Sep 17 00:00:00 2001 From: Lucia Date: Fri, 19 Jun 2026 10:51:51 +0200 Subject: [PATCH 2/4] [Release] Bumped ddev version to 17.0.1 (#24108) --- ddev/CHANGELOG.md | 7 +++++++ ddev/changelog.d/23987.fixed | 1 - ddev/changelog.d/24099.fixed | 1 - 3 files changed, 7 insertions(+), 2 deletions(-) delete mode 100644 ddev/changelog.d/23987.fixed delete mode 100644 ddev/changelog.d/24099.fixed diff --git a/ddev/CHANGELOG.md b/ddev/CHANGELOG.md index 3c20d8b5bc731..dbd1654266e33 100644 --- a/ddev/CHANGELOG.md +++ b/ddev/CHANGELOG.md @@ -2,6 +2,13 @@ +## 17.0.1 / 2026-06-19 + +***Fixed***: + +* Gate `ddev release branch create` and `update-build-agent-yaml.yml` on the matching `DataDog/datadog-agent` branch existing so neither writer can produce a release-branch pointer to a missing upstream branch. ([#23987](https://github.com/DataDog/integrations-core/pull/23987)) +* Skip code coverage gate validation for extras repos, which do not enforce a required per-integration coverage threshold. ([#24099](https://github.com/DataDog/integrations-core/pull/24099)) + ## 17.0.0 / 2026-06-16 ***Changed***: diff --git a/ddev/changelog.d/23987.fixed b/ddev/changelog.d/23987.fixed deleted file mode 100644 index 85ff8c7d32545..0000000000000 --- a/ddev/changelog.d/23987.fixed +++ /dev/null @@ -1 +0,0 @@ -Gate `ddev release branch create` and `update-build-agent-yaml.yml` on the matching `DataDog/datadog-agent` branch existing so neither writer can produce a release-branch pointer to a missing upstream branch. 
diff --git a/ddev/changelog.d/24099.fixed b/ddev/changelog.d/24099.fixed deleted file mode 100644 index 890e7774162f0..0000000000000 --- a/ddev/changelog.d/24099.fixed +++ /dev/null @@ -1 +0,0 @@ -Skip code coverage gate validation for extras repos, which do not enforce a required per-integration coverage threshold. From 54edd6e439985c67464d5c03ffbb8ff366e947dc Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Fri, 19 Jun 2026 12:31:39 +0200 Subject: [PATCH 3/4] Remove black, use ruff for generated config_models (#23588) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Drop black as a direct dependency, use ruff format for generated config models - Remove explicit `black==23.12.1` from `datadog_checks_dev` - Replace `apply_black` calls in the model consumer with `ruff format -`, using the repo's centralized `[tool.ruff]` configuration - Drop the now-unused `code_formatter` plumbing through `ModelConsumer` / `build_model_file` / `validate models` - Drop the [tool.black] block from `ddev/pyproject.toml` and the matching python-version-bump logic in `update_py_config.py` (with test fixture) - Update README badge from black to ruff The root `[tool.black]` section is kept (with an explanatory comment) because `datamodel-code-generator` reads it transitively through its own internal formatter, and removing it changes line-length to 88 which breaks our list[...] -> tuple[..., ...] line-by-line transform. * Add changelog entries * Make ruff a hard dep of datadog_checks_dev[cli], invoke via python -m ruff CI installs ddev with `pip install -e ./datadog_checks_dev[cli]` and never adds ruff to PATH. The previous helper called `shutil.which('ruff')` and silently returned the input unchanged when ruff was missing — leaving long lines and missing wraps in 401 generated config-model files, surfacing as "not in sync" in the validate workflow. 
- Declare `ruff>=0.11` in `datadog_checks_dev[cli]` so the package is always installed alongside the model generator. - Switch the helper to `sys.executable -m ruff` so the in-venv package is used regardless of PATH. - Raise loudly on missing ruff or non-zero ruff exit instead of silently degrading, so any future regression fails the workflow with a clear error. * Pin ruff to 0.11.10 to match ddev's hatch lint env * Remove [tool.black] from root pyproject.toml and update _fix_types Make `_fix_types` operate on the joined document (as UTF-8 bytes) instead of line by line, so the bracket-tracking pass works regardless of how datamodel-code-generator's internal formatter wrapped `list[...]`. Place the `, ...` sentinel right after the last non-whitespace byte before the closing `]`, so output stays on the previous content line even when the parser pre-wrapped the closing bracket onto its own line. With those changes the generator no longer relies on `[tool.black]` existing in the repo, so the section and its accompanying comment are removed from `pyproject.toml`. The black-related comment near the config_models lint exclusion and the black badge in `README.md` go too. Four config_models files (kafka_actions, win32_event_log, yarn x2) regenerate with different — but semantically identical — wrapping. They were the only ones whose pre-wrapped form was sensitive to the change in upstream line-length default; future regens are stable. * Add changelogs for regenerated config_models in kafka_actions, win32_event_log, yarn * Address PR review: docs, error-path hint, focused _fix_types tests - Replace remaining "code style - black" references in developer docs (`docs/developer/index.md` badge, `docs/developer/guidelines/style.md` style section, link reference) with ruff equivalents. - Update the stale `ddev test postgres -l` example output in `docs/developer/testing.md` to drop `black==22.12.0` and reflect the current lint env contents (`ruff==0.11.10`, `pydantic==2.11.5`). 
- Move the "ruff is not installed" install hint in `format_with_ruff` from the `FileNotFoundError` branch to the `CalledProcessError` branch and gate it on `"No module named 'ruff'"` in stderr — the previous layout was effectively dead code because `sys.executable` always resolves, so missing-ruff surfaces as a non-zero exit, not a missing binary. - Add `tests/tooling/configuration/consumers/model/test_fix_types.py` with focused coverage for the `_fix_types` post-processing pass: the multiline-wrapped `list[Literal[...]]` regression case the PR was written to fix, dict and nested-list translations, unicode in descriptions, and verbatim pass-through when no `list[`/`dict[` is present. * Address PR review (round 2): code_formatter robustness + direct tests code_formatter.py: - Guard `_resolve_ruff_config` with `if root_str:` so an unset `get_root()` (returns '') doesn't fall into the `Path('').is_dir()` branch (which is True — it resolves to CWD), and unit tests actually walk back to the repo pyproject.toml as the docstring claims. - Replace the loose `'[tool.ruff' in text` substring with a line-anchored scan that only matches actual TOML table headers (`[tool.ruff]` or `[tool.ruff.…]`), so a comment or string value can't false-positive. - Surface argv (via `shlex.join`), stderr, and stdout in the error message for non-missing-package failures, so a future ruff config change emitting actionable output is debuggable from the message alone. Tests: - New `test_code_formatter.py` (17 tests): direct coverage for `format_with_ruff` (line wrapping, quote-style preservation, short passthrough, missing-ruff hint, full-context error on other failures) and `_resolve_ruff_config` (root path success, fallback walk on empty root, fallback walk when root has no `[tool.ruff]`, returns None when nothing is found, parametrized header recognition for `_has_ruff_section`). 
- `test_update_py_config.py`: add explicit content assertions on the rewritten `ddev/pyproject.toml` — no `[tool.black]` block survives, `[tool.ruff].target-version` is updated to the new pinned version, the old token is gone. Captures the actual contract instead of relying on the success-counter being 9. * Relax ddev's datadog-checks-dev pin to span the dcd 39 release gap This PR bumps `datadog-checks-dev` to 39 (`black` is dropped from `[cli]` extras, so per semver it's a major). The current `~=38.0` constraint in ddev's `pyproject.toml` would block any GitHub Action that installs both packages from the local repo — `pip install -e ./ddev` would fail to resolve the local dcd 39 against ddev's pin. Relax the pin to `>=38.0,<40` for the duration of the gap between this PR landing and the next ddev release. The release PR for ddev MUST tighten this back to `~=39.0`. * Apply ruff format to new helper and tests * Restore relaxed datadog-checks-dev pin for transition period --- README.md | 8 +- datadog_checks_dev/changelog.d/23588.changed | 1 + .../dev/tooling/commands/validate/models.py | 4 +- .../consumers/model/code_formatter.py | 89 +++++++++ .../consumers/model/model_consumer.py | 18 +- .../consumers/model/model_file.py | 66 ++++--- datadog_checks_dev/pyproject.toml | 8 +- .../consumers/model/test_code_formatter.py | 176 ++++++++++++++++++ .../consumers/model/test_fix_types.py | 104 +++++++++++ ddev/README.md | 2 +- ddev/changelog.d/23588.changed | 1 + ddev/pyproject.toml | 17 +- .../ddev/cli/meta/scripts/update_py_config.py | 10 - ddev/tests/cli/meta/scripts/conftest.py | 5 +- .../cli/meta/scripts/test_update_py_config.py | 16 +- docs/developer/.snippets/links.txt | 2 +- docs/developer/guidelines/style.md | 46 ++--- docs/developer/index.md | 2 +- docs/developer/testing.md | 6 +- kafka_actions/changelog.d/23588.fixed | 1 + .../kafka_actions/config_models/instance.py | 21 ++- pyproject.toml | 15 -- win32_event_log/changelog.d/23588.fixed | 1 + 
.../win32_event_log/config_models/instance.py | 11 +- yarn/changelog.d/23588.fixed | 1 + .../yarn/config_models/instance.py | 13 +- .../yarn/config_models/shared.py | 12 +- 27 files changed, 522 insertions(+), 134 deletions(-) create mode 100644 datadog_checks_dev/changelog.d/23588.changed create mode 100644 datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/code_formatter.py create mode 100644 datadog_checks_dev/tests/tooling/configuration/consumers/model/test_code_formatter.py create mode 100644 datadog_checks_dev/tests/tooling/configuration/consumers/model/test_fix_types.py create mode 100644 ddev/changelog.d/23588.changed create mode 100644 kafka_actions/changelog.d/23588.fixed create mode 100644 win32_event_log/changelog.d/23588.fixed create mode 100644 yarn/changelog.d/23588.fixed diff --git a/README.md b/README.md index bf828e8b81e7d..e843088ff463d 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ | --- | --- | | CI/CD | [![CI - Test][1]][2] | | Docs | [![Docs - Release][19]][20] | -| Meta | [![Hatch project][26]][27] [![Linting - Ruff][24]][25] [![Code style - black][21]][22] [![Typing - Mypy][28]][29] [![License - BSD-3-Clause][30]][31] | +| Meta | [![Hatch project][26]][27] [![Linting - Ruff][24]][25] [![Typing - Mypy][28]][29] [![License - BSD-3-Clause][30]][31] | This repository contains open source integrations that Datadog officially develops and supports. 
To add a new integration, please see the [Integrations Extras][5] repository and the @@ -43,10 +43,8 @@ For more information on integrations, please reference our [documentation][11] a [16]: https://github.com/DataDog/integrations-core/blob/ea2dfbf1e8859333af4c8db50553eb72a3b466f9/requirements-agent-release.txt [19]: https://github.com/DataDog/integrations-core/workflows/docs/badge.svg [20]: https://github.com/DataDog/integrations-core/actions?workflow=docs -[21]: https://img.shields.io/badge/code%20style-black-000000.svg -[22]: https://github.com/ambv/black -[24]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v0.json -[25]: https://github.com/charliermarsh/ruff +[24]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json +[25]: https://github.com/astral-sh/ruff [26]: https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg [27]: https://github.com/pypa/hatch [28]: https://img.shields.io/badge/typing-Mypy-blue.svg diff --git a/datadog_checks_dev/changelog.d/23588.changed b/datadog_checks_dev/changelog.d/23588.changed new file mode 100644 index 0000000000000..79cc7800688a6 --- /dev/null +++ b/datadog_checks_dev/changelog.d/23588.changed @@ -0,0 +1 @@ +Stop declaring `black` as a direct dependency. The `apply_black` calls used to format auto-generated config-model files now go through `ruff format`, using the repo's centralized `[tool.ruff]` configuration. 
diff --git a/datadog_checks_dev/datadog_checks/dev/tooling/commands/validate/models.py b/datadog_checks_dev/datadog_checks/dev/tooling/commands/validate/models.py index 7b670c18d45aa..8b1fd90734d59 100644 --- a/datadog_checks_dev/datadog_checks/dev/tooling/commands/validate/models.py +++ b/datadog_checks_dev/datadog_checks/dev/tooling/commands/validate/models.py @@ -95,8 +95,6 @@ def models(ctx, check, sync, verbose): license_header_lines = get_license_header().splitlines(True) + ['\n', '\n'] documentation_header_lines = get_config_models_documentation().splitlines(True) + ['\n'] - code_formatter = ModelConsumer.create_code_formatter() - if is_core_check: checks = checks.difference(INTEGRATIONS_WITHOUT_MODELS) @@ -135,7 +133,7 @@ def models(ctx, check, sync, verbose): if not sync and not dir_exists(models_location) and not is_core_check: continue - model_consumer = ModelConsumer(spec.data, code_formatter) + model_consumer = ModelConsumer(spec.data) # So formatters see config files with chdir(root): diff --git a/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/code_formatter.py b/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/code_formatter.py new file mode 100644 index 0000000000000..300bd95c0cd50 --- /dev/null +++ b/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/code_formatter.py @@ -0,0 +1,89 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import shlex +import subprocess +import sys +from pathlib import Path + +from datadog_checks.dev.tooling.constants import get_root + + +def format_with_ruff(source: str) -> str: + """Format Python source via ``ruff format -`` (stdin/stdout). + + Replaces the line-wrapping role previously played by black on auto-generated + config_models files. Uses the repo's centralized ruff configuration so the + output matches the rest of the codebase. 
Invokes ruff through the active + interpreter (``python -m ruff``) so the package installed alongside + ``datadog_checks_dev[cli]`` is always picked up, regardless of PATH. + """ + args = [sys.executable, '-m', 'ruff', 'format', '--quiet', '--stdin-filename=model.py'] + config_path = _resolve_ruff_config() + if config_path is not None: + args.extend(['--config', str(config_path)]) + else: + args.extend(['--isolated', '--config', "format.quote-style='preserve'", '--line-length=120']) + args.append('-') + + try: + result = subprocess.run( + args, + input=source, + capture_output=True, + text=True, + check=True, + ) + except subprocess.CalledProcessError as e: + # `python -m ruff` exits non-zero when the ruff package is missing, + # surfacing as ModuleNotFoundError on stderr. Promote that to a + # clearer install hint; otherwise propagate the underlying error + # with enough context to reproduce the failure manually. + stderr = e.stderr or '' + if "No module named 'ruff'" in stderr: + raise RuntimeError( + "Cannot format auto-generated config models: the `ruff` package is not installed in the active " + "interpreter. Reinstall `datadog_checks_dev[cli]` (or run `pip install ruff`) and retry." + ) from e + details = [f'{shlex.join(args)} failed', f'stderr: {stderr.strip()}'] + if e.stdout: + details.append(f'stdout: {e.stdout.strip()}') + raise RuntimeError( + '`ruff format` failed while formatting auto-generated config models. ' + '; '.join(details) + ) from e + return result.stdout + + +def _resolve_ruff_config() -> Path | None: + """Locate the repo pyproject.toml that holds the central ruff configuration. + + Prefer the path reported by ``get_root`` (set by ddev commands). Fall back + to walking up from this module so unit tests, which never call ``set_root``, + still pick up the same configuration as model regeneration. 
+ """ + root_str = get_root() + if root_str: + root = Path(root_str) + if root.is_dir(): + candidate = root / 'pyproject.toml' + if _has_ruff_section(candidate): + return candidate + + for parent in Path(__file__).resolve().parents: + candidate = parent / 'pyproject.toml' + if _has_ruff_section(candidate): + return candidate + return None + + +def _has_ruff_section(pyproject: Path) -> bool: + if not pyproject.is_file(): + return False + try: + text = pyproject.read_text() + except OSError: + return False + return any( + stripped == '[tool.ruff]' or stripped.startswith('[tool.ruff.') + for stripped in (line.strip() for line in text.splitlines()) + ) diff --git a/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/model_consumer.py b/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/model_consumer.py index 8889707753ad8..304ee64e3fa9a 100644 --- a/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/model_consumer.py +++ b/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/model_consumer.py @@ -2,20 +2,19 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) import warnings -from pathlib import Path from typing import Dict, List, Tuple import yaml from datamodel_code_generator import DataModelType -from datamodel_code_generator.format import CodeFormatter, PythonVersion +from datamodel_code_generator.format import PythonVersion from datamodel_code_generator.model import get_data_model_types from datamodel_code_generator.parser import LiteralType from datamodel_code_generator.parser.openapi import OpenAPIParser +from datadog_checks.dev.tooling.configuration.consumers.model.code_formatter import format_with_ruff from datadog_checks.dev.tooling.configuration.consumers.model.model_file import build_model_file from datadog_checks.dev.tooling.configuration.consumers.model.model_info import ModelInfo from 
datadog_checks.dev.tooling.configuration.consumers.openapi_document import build_openapi_document -from datadog_checks.dev.tooling.constants import get_root PYTHON_VERSION = PythonVersion.PY_39 @@ -32,9 +31,8 @@ class ModelConsumer: - def __init__(self, spec: dict, code_formatter: CodeFormatter = None): + def __init__(self, spec: dict): self.spec = spec - self.code_formatter = code_formatter or self.create_code_formatter() def render(self) -> Dict[str, Dict[str, str]]: """ @@ -137,7 +135,6 @@ def _process_section(self, section) -> (List[Tuple[str, str]], dict, ModelInfo): model_id, section_name, model_info, - self.code_formatter, ) # instance.py or shared.py model_files[model_file_name] = (model_file_contents, errors) @@ -207,11 +204,6 @@ def _merge_instances(self, section: dict, errors: List[str]) -> dict: return new_section - @staticmethod - def create_code_formatter(): - path = Path(get_root()) - return CodeFormatter(PYTHON_VERSION, settings_path=path if path.is_dir() else None) - def _build_deprecation_file(self, deprecation_data): file_needs_formatting = False deprecations_file_lines = [] @@ -226,7 +218,7 @@ def _build_deprecation_file(self, deprecation_data): deprecations_file_lines.append('') deprecations_file_contents = '\n'.join(deprecations_file_lines) if file_needs_formatting: - deprecations_file_contents = self.code_formatter.apply_black(deprecations_file_contents) + deprecations_file_contents = format_with_ruff(deprecations_file_contents) return deprecations_file_contents @staticmethod @@ -255,5 +247,5 @@ def _build_defaults_file(self, model_info: ModelInfo): model_info.defaults_file_lines.append('') defaults_file_contents = '\n'.join(model_info.defaults_file_lines) if model_info.defaults_file_needs_value_normalization: - defaults_file_contents = self.code_formatter.apply_black(defaults_file_contents) + defaults_file_contents = format_with_ruff(defaults_file_contents) return defaults_file_contents diff --git 
a/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/model_file.py b/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/model_file.py index 86617b1f8be17..c763cfbedb39c 100644 --- a/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/model_file.py +++ b/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model/model_file.py @@ -1,8 +1,7 @@ # (C) Datadog, Inc. 2021-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) -from datamodel_code_generator.format import CodeFormatter - +from datadog_checks.dev.tooling.configuration.consumers.model.code_formatter import format_with_ruff from datadog_checks.dev.tooling.configuration.consumers.model.model_info import ModelInfo @@ -11,14 +10,12 @@ def build_model_file( model_id: str, section_name: str, model_info: ModelInfo, - code_formatter: CodeFormatter, ): """ :param parsed_document: OpenApi parsed document :param model_id: instance or shared :param section_name: init or instances :param model_info: Information to build the model file - :param code_formatter: """ # Whether or not there are options with default values options_with_defaults = len(model_info.defaults_file_lines) > 0 @@ -54,7 +51,7 @@ def build_model_file( model_file_lines.append('') model_file_contents = '\n'.join(model_file_lines) if any(len(line) > 120 for line in model_file_lines): - model_file_contents = code_formatter.apply_black(model_file_contents) + model_file_contents = format_with_ruff(model_file_contents) return model_file_contents @@ -109,27 +106,44 @@ def _add_imports(model_file_lines, need_defaults, need_deprecations): def _fix_types(model_file_lines): - for i, line in enumerate(model_file_lines): - line = model_file_lines[i] = line.replace('dict[', 'MappingProxyType[') - if 'list[' not in line: - continue - - buffer = bytearray() - containers = [] - - for char in line: - if char == '[': - if buffer[-4:] == b'list': - 
containers.append(True) - buffer[-4:] = b'tuple' - else: - containers.append(False) - elif char == ']' and containers.pop(): - buffer.extend(b', ...') - - buffer.append(ord(char)) - - model_file_lines[i] = buffer.decode('utf-8') + # Operate on the joined document (as UTF-8 bytes) so the bracket-tracking + # pass below works even when the upstream parser pre-wraps `list[...]` + # across multiple lines. Iterating bytes keeps the algorithm safe for + # non-ASCII content (descriptions, examples) since `[`, `]`, and `list` + # are all single-byte ASCII while UTF-8 continuation bytes never collide + # with them. + content = '\n'.join(model_file_lines).replace('dict[', 'MappingProxyType[') + if 'list[' not in content: + model_file_lines[:] = content.split('\n') + return + + encoded = content.encode('utf-8') + buffer = bytearray() + containers = [] + open_bracket = ord(b'[') + close_bracket = ord(b']') + whitespace = (ord(b' '), ord(b'\t'), ord(b'\n')) + + for byte in encoded: + if byte == open_bracket: + if buffer[-4:] == b'list': + containers.append(True) + buffer[-4:] = b'tuple' + else: + containers.append(False) + elif byte == close_bracket and containers and containers.pop(): + # Insert `, ...` after the last non-whitespace byte already in the + # buffer so the sentinel sits on the same line as the previous + # content (`tuple[X], ...` style) even when the parser wrapped the + # closing `]` onto its own line. + insert_at = len(buffer) + while insert_at > 0 and buffer[insert_at - 1] in whitespace: + insert_at -= 1 + buffer[insert_at:insert_at] = b', ...' 
+ + buffer.append(byte) + + model_file_lines[:] = buffer.decode('utf-8').split('\n') def _add_secure_fields_constant(model_file_lines, require_trusted_providers): diff --git a/datadog_checks_dev/pyproject.toml b/datadog_checks_dev/pyproject.toml index ddbb76eb893ba..c2c38e2bfce8b 100644 --- a/datadog_checks_dev/pyproject.toml +++ b/datadog_checks_dev/pyproject.toml @@ -52,7 +52,6 @@ cli = [ "aiomultiprocess", "atomicwrites", "beautifulsoup4==4.12.3", - "black==23.12.1", # TODO Remove once https://github.com/koxudaxi/datamodel-code-generator/issues/1821 is fixed "build>=0.7.0", "click~=8.1.6", "codespell", @@ -69,6 +68,13 @@ cli = [ "platformdirs>=2.0.0a3", "pydantic>=2.0.2", "pysmi==1.6.2", + # ruff is invoked as a subprocess by the model generator to format + # auto-generated config_models files (see datadog_checks/dev/tooling/ + # configuration/consumers/model/code_formatter.py). Keep this pin in sync + # with the version used by ddev's hatch lint env (currently `ruff==0.11.10` + # in ddev/src/ddev/plugin/external/hatch/environment_collector.py) until + # the commands here are migrated into ddev itself. + "ruff==0.11.10", "securesystemslib[crypto]==0.28.0", "semver>=2.13.0", "tabulate>=0.8.9", diff --git a/datadog_checks_dev/tests/tooling/configuration/consumers/model/test_code_formatter.py b/datadog_checks_dev/tests/tooling/configuration/consumers/model/test_code_formatter.py new file mode 100644 index 0000000000000..4c8eabc0ded4b --- /dev/null +++ b/datadog_checks_dev/tests/tooling/configuration/consumers/model/test_code_formatter.py @@ -0,0 +1,176 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +"""Direct tests for `format_with_ruff` and `_resolve_ruff_config`. + +The helper is exercised end-to-end by `ddev validate models -s`, but only +when generated content crosses 120 characters or normalization is needed. 
+These tests pin the behavior the validate flow relies on so a future +refactor or library bump can't silently break it. +""" + +from pathlib import Path + +import pytest + +from datadog_checks.dev.tooling.configuration.consumers.model import code_formatter +from datadog_checks.dev.tooling.configuration.consumers.model.code_formatter import ( + _has_ruff_section, + _resolve_ruff_config, + format_with_ruff, +) + +# --- format_with_ruff ------------------------------------------------------- + +LONG_DICT_LITERAL = ( + "x = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, " + "'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, " + "'r': 18, 's': 19, 't': 20}\n" +) + + +def test_format_with_ruff_wraps_lines_longer_than_120(): + formatted = format_with_ruff(LONG_DICT_LITERAL) + # The original is a single line well over 120 chars; ruff must split it + # into multiple lines, and every output line must be <= 120 chars. + assert formatted.count('\n') > 1 + for line in formatted.splitlines(): + assert len(line) <= 120, f'unexpected long line: {line!r}' + + +def test_format_with_ruff_preserves_quote_style(): + # Repo-wide ruff config sets format.quote-style=preserve (matching the + # legacy black skip-string-normalization). Single quotes must survive. 
+ formatted = format_with_ruff(LONG_DICT_LITERAL) + assert "'a'" in formatted and '"a"' not in formatted + + +def test_format_with_ruff_short_input_passes_through_unchanged(): + source = "x = 1\n" + assert format_with_ruff(source) == source + + +def test_format_with_ruff_surfaces_install_hint_when_ruff_module_missing(monkeypatch): + """If `python -m ruff` exits with `No module named 'ruff'` the helper must + surface the actionable reinstall hint instead of the raw stderr.""" + import subprocess + + def fake_run(args, **kwargs): + raise subprocess.CalledProcessError( + returncode=1, + cmd=args, + output='', + stderr="/usr/bin/python: No module named 'ruff'\n", + ) + + monkeypatch.setattr(code_formatter.subprocess, 'run', fake_run) + with pytest.raises(RuntimeError, match='`ruff` package is not installed'): + format_with_ruff('x = 1\n') + + +def test_format_with_ruff_includes_argv_and_streams_on_other_failures(monkeypatch): + """For non-missing-package failures the error message must include + enough context (argv, stderr, stdout) to reproduce the failure.""" + import subprocess + + def fake_run(args, **kwargs): + raise subprocess.CalledProcessError( + returncode=2, + cmd=args, + output='partial output', + stderr='unexpected ruff error', + ) + + monkeypatch.setattr(code_formatter.subprocess, 'run', fake_run) + with pytest.raises(RuntimeError) as excinfo: + format_with_ruff('x = 1\n') + + msg = str(excinfo.value) + assert 'unexpected ruff error' in msg + assert 'partial output' in msg + assert 'ruff' in msg and 'format' in msg # argv components surface + + +# --- _resolve_ruff_config --------------------------------------------------- + + +def _write_pyproject(directory: Path, body: str) -> Path: + pyproject = directory / 'pyproject.toml' + pyproject.write_text(body) + return pyproject + + +def test_resolve_uses_root_when_get_root_points_to_a_pyproject_with_ruff(monkeypatch, tmp_path): + pyproject = _write_pyproject( + tmp_path, + '[tool.ruff]\nline-length = 120\n', + 
) + monkeypatch.setattr(code_formatter, 'get_root', lambda: str(tmp_path)) + + assert _resolve_ruff_config() == pyproject + + +def test_resolve_falls_back_to_module_walk_when_get_root_is_empty(monkeypatch): + """Unit tests don't call `set_root`, so `get_root()` returns ''. The + helper must walk up from the module path and find the repo pyproject.toml, + not probe the test runner's CWD.""" + monkeypatch.setattr(code_formatter, 'get_root', lambda: '') + + resolved = _resolve_ruff_config() + + assert resolved is not None + assert resolved.name == 'pyproject.toml' + # The module lives at /datadog_checks_dev/datadog_checks/dev/tooling/ + # configuration/consumers/model/code_formatter.py, so the resolved config + # must be one of its ancestor pyproject.toml files. + module_file = Path(code_formatter.__file__).resolve() + assert resolved in {parent / 'pyproject.toml' for parent in module_file.parents} + # Sanity-check the file actually has the central ruff settings. + assert _has_ruff_section(resolved) + + +def test_resolve_falls_back_when_root_pyproject_has_no_ruff_section(monkeypatch, tmp_path): + """If `get_root()` points at a directory whose pyproject.toml has no + `[tool.ruff]`, the helper must fall through to the module-path walk + rather than returning a config that ruff would ignore.""" + _write_pyproject(tmp_path, '[project]\nname = "x"\n') + monkeypatch.setattr(code_formatter, 'get_root', lambda: str(tmp_path)) + + resolved = _resolve_ruff_config() + + assert resolved is not None + assert resolved != tmp_path / 'pyproject.toml' + + +def test_resolve_returns_none_when_nothing_can_be_found(monkeypatch, tmp_path): + monkeypatch.setattr(code_formatter, 'get_root', lambda: '') + # Repoint __file__ to a tree that has no pyproject.toml above it. 
+ inner = tmp_path / 'a' / 'b' / 'c' + inner.mkdir(parents=True) + monkeypatch.setattr(code_formatter, '__file__', str(inner / 'code_formatter.py')) + + assert _resolve_ruff_config() is None + + +# --- _has_ruff_section ------------------------------------------------------ + + +@pytest.mark.parametrize( + ('body', 'expected'), + [ + ('[tool.ruff]\nline-length = 120\n', True), + ('[tool.ruff.lint]\nselect = ["E"]\n', True), + ('[tool.ruff.format]\nquote-style = "preserve"\n', True), + ('[project]\nname = "x"\n', False), + ('# Migrate from [tool.ruff.lint.format] later\n', False), # comment, not header + ('description = "explains [tool.ruff] elsewhere"\n', False), # value, not header + ('', False), + ], +) +def test_has_ruff_section_only_matches_actual_table_headers(tmp_path, body, expected): + pyproject = _write_pyproject(tmp_path, body) + assert _has_ruff_section(pyproject) is expected + + +def test_has_ruff_section_returns_false_for_missing_file(tmp_path): + assert _has_ruff_section(tmp_path / 'does-not-exist.toml') is False diff --git a/datadog_checks_dev/tests/tooling/configuration/consumers/model/test_fix_types.py b/datadog_checks_dev/tests/tooling/configuration/consumers/model/test_fix_types.py new file mode 100644 index 0000000000000..1bcebca2a8aa7 --- /dev/null +++ b/datadog_checks_dev/tests/tooling/configuration/consumers/model/test_fix_types.py @@ -0,0 +1,104 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +"""Focused tests for the `_fix_types` post-processing pass. + +`_fix_types` is responsible for translating mutable container annotations +emitted by the parser (`list[X]`, `dict[K, V]`) into their immutable +equivalents (`tuple[X, ...]`, `MappingProxyType[K, V]`). 
The bracket-tracking +walker that adds the `, ...` ellipsis must work whether the annotation lives +on a single line or has been wrapped across many lines by the upstream +formatter; the latter case is what made dropping the `[tool.black]` config +block from `pyproject.toml` viable. +""" + +import pytest + +from datadog_checks.dev.tooling.configuration.consumers.model.model_file import _fix_types + + +def _run(source: str) -> str: + lines = source.split('\n') + _fix_types(lines) + return '\n'.join(lines) + + +def test_dict_is_replaced_with_mapping_proxy_type(): + assert _run('headers: dict[str, str]') == 'headers: MappingProxyType[str, str]' + + +def test_list_on_single_line_becomes_variable_length_tuple(): + assert _run('tags: list[str]') == 'tags: tuple[str, ...]' + + +def test_nested_lists_each_get_their_own_ellipsis(): + assert _run('matrix: list[list[int]]') == 'matrix: tuple[tuple[int, ...], ...]' + + +def test_non_list_non_dict_input_is_returned_unchanged_verbatim(): + # Lines that contain neither `list[` nor `dict[` are left alone byte-for-byte. + source = "vals: Optional[Mapping[str, int]] = compute(arg)" + assert _run(source) == source + + +def test_list_of_literal_wrapped_across_multiple_lines(): + """Regression: when the upstream parser pre-wraps `list[Literal[...]]` + across lines (because black's default 88-char line-length is exceeded by + the inner Literal), the per-line walker that used to live here would + silently drop the `, ...` sentinel and emit `tuple[Literal[...]]` + (single-element tuple) — a real type-contract change. 
Whole-document + bracket tracking preserves the variable-length tuple.""" + source = ( + " states: Optional[\n" + " list[\n" + " Literal[\n" + " 'ALL',\n" + " 'NEW',\n" + " 'NEW_SAVING',\n" + " 'SUBMITTED',\n" + " 'ACCEPTED',\n" + " 'RUNNING',\n" + " 'FINISHED',\n" + " 'FAILED',\n" + " 'KILLED',\n" + " ]\n" + " ]\n" + " ] = None" + ) + out = _run(source) + # The `list` opener becomes `tuple` and gets a `, ...` sentinel before its + # matching `]`; the inner Literal brackets are untouched. + assert 'list[' not in out + assert 'tuple[' in out + # The sentinel lands on the same line as the inner `]` of the Literal + # (immediately after the last non-whitespace byte), not on a line of its own. + assert ' ], ...\n ]\n ] = None' in out + + +def test_unicode_inside_descriptions_does_not_break_walker(): + """The walker iterates UTF-8 bytes so multi-byte sequences in field + descriptions, examples, etc. don't trip the `byte must be in range(0, 256)` + error the original char-iterating implementation would raise.""" + source = "label: Optional[list[str]] = Field(\n None, description='unicode: ✓ — résumé · 日本語'\n)" + out = _run(source) + assert 'tuple[str, ...]' in out + assert '✓' in out + assert '日本語' in out + + +@pytest.mark.parametrize( + ('input_line', 'expected_line'), + [ + # Nested list inside Optional, single line. + ('Optional[list[int]]', 'Optional[tuple[int, ...]]'), + # list and dict combined on the same line. 
+ ('list[dict[str, int]]', 'tuple[MappingProxyType[str, int], ...]'), + ], +) +def test_simple_combinations(input_line: str, expected_line: str): + assert _run(input_line) == expected_line + + +def test_no_list_no_dict_input_is_returned_unchanged(): + source = "name: str = Field('default', examples=['x'])" + assert _run(source) == source diff --git a/ddev/README.md b/ddev/README.md index 97e899d86ca44..7b49f6f304f8e 100644 --- a/ddev/README.md +++ b/ddev/README.md @@ -3,7 +3,7 @@ | | | | --- | --- | | Package | [![PyPI - Version](https://img.shields.io/pypi/v/ddev.svg?logo=pypi&label=PyPI&logoColor=gold)](https://pypi.org/project/ddev/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/ddev.svg?logo=python&label=Python&logoColor=gold)](https://pypi.org/project/ddev/) | -| Meta | [![code style - black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![types - Mypy](https://img.shields.io/badge/types-Mypy-blue.svg)](https://github.com/python/mypy) [![imports - isort](https://img.shields.io/badge/imports-isort-ef8336.svg)](https://github.com/pycqa/isort) [![License - MIT](https://img.shields.io/badge/license-BSD--3--Clause-9400d3.svg)](https://spdx.org/licenses/) | +| Meta | [![linting - Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![types - Mypy](https://img.shields.io/badge/types-Mypy-blue.svg)](https://github.com/python/mypy) [![imports - isort](https://img.shields.io/badge/imports-isort-ef8336.svg)](https://github.com/pycqa/isort) [![License - MIT](https://img.shields.io/badge/license-BSD--3--Clause-9400d3.svg)](https://spdx.org/licenses/) | ----- diff --git a/ddev/changelog.d/23588.changed b/ddev/changelog.d/23588.changed new file mode 100644 index 0000000000000..78653680b50be --- /dev/null +++ b/ddev/changelog.d/23588.changed @@ -0,0 +1 @@ +Drop the `[tool.black]` block from `ddev/pyproject.toml` 
and the matching code path in `ddev meta scripts update-python-config` (formatting is fully handled by ruff). diff --git a/ddev/pyproject.toml b/ddev/pyproject.toml index cbb148a00091e..1d5c35bfbec0c 100644 --- a/ddev/pyproject.toml +++ b/ddev/pyproject.toml @@ -30,7 +30,13 @@ dependencies = [ "click~=8.1.6", "coverage", "datadog-api-client==2.20.0", - "datadog-checks-dev[cli]~=39.0", + # TEMPORARY: relaxed from the usual `~=38.0` pin to span the gap between + # this PR landing (which bumps datadog-checks-dev to 39 because black is + # dropped from its [cli] extras) and the next ddev release. Without this, + # GitHub Actions that install ddev from the local repo can't resolve dcd + # 39 against ddev's tighter pin. The PR that releases ddev MUST tighten + # this back to `~=39.0` (see https://github.com/DataDog/integrations-core/pull/23588). + "datadog-checks-dev[cli]>=38.0,<40", "hatch>=1.13.0", "httpx", "jsonpointer", @@ -86,15 +92,6 @@ scripts = ["ddev"] asyncio_mode = "auto" norecursedirs = ["src/ddev/cli/create/templates"] -# Keep Black configuration to generate models through validate -# Switch to Ruff after it provides a Python API -[tool.black] -include = '\.pyi?$' -line-length = 120 -skip-string-normalization = true -target-version = ["py313"] -extend-exclude = "src/ddev/_version.py" - [tool.ruff] extend = "../pyproject.toml" exclude = [ diff --git a/ddev/src/ddev/cli/meta/scripts/update_py_config.py b/ddev/src/ddev/cli/meta/scripts/update_py_config.py index 437dc5f05dbaf..102e470dbd89f 100644 --- a/ddev/src/ddev/cli/meta/scripts/update_py_config.py +++ b/ddev/src/ddev/cli/meta/scripts/update_py_config.py @@ -131,16 +131,6 @@ def update_ddev_pyproject_file(app: Application, new_version: str, old_version: new_version = f"py{new_version.replace('.', '')}" old_version = f"py{old_version.replace('.', '')}" - if black_config := config.get('tool', {}).get('black', {}): - target_version = black_config.get('target-version', []) - - for index, version in 
enumerate(target_version): - if version == old_version: - target_version[index] = new_version - tracker.success() - changed = True - break - if ruff_config := config.get('tool', {}).get('ruff', {}): if ruff_config.get('target-version') == old_version: ruff_config['target-version'] = new_version diff --git a/ddev/tests/cli/meta/scripts/conftest.py b/ddev/tests/cli/meta/scripts/conftest.py index c4df63a4a9d63..1130bf3cb13e4 100644 --- a/ddev/tests/cli/meta/scripts/conftest.py +++ b/ddev/tests/cli/meta/scripts/conftest.py @@ -123,10 +123,7 @@ def fake_repo(tmp_path_factory, config_file, local_repo, ddev, mocker): write_file( repo_path / 'ddev', 'pyproject.toml', - f"""[tool.black] -target-version = ["py{OLD_PYTHON_VERSION.replace('.', '')}"] - -[tool.ruff] + f"""[tool.ruff] target-version = "py{OLD_PYTHON_VERSION.replace('.', '')}" """, ) diff --git a/ddev/tests/cli/meta/scripts/test_update_py_config.py b/ddev/tests/cli/meta/scripts/test_update_py_config.py index 6c60257ee7e80..0943b8e2e9aa2 100644 --- a/ddev/tests/cli/meta/scripts/test_update_py_config.py +++ b/ddev/tests/cli/meta/scripts/test_update_py_config.py @@ -14,7 +14,7 @@ def test_update_py_config(fake_repo, ddev): result = ddev('meta', 'scripts', 'update-python-config', NEW_PYTHON_VERSION) assert result.exit_code == 0, result.output - assert result.output.endswith('Python upgrades\n\nPassed: 10\n') + assert result.output.endswith('Python upgrades\n\nPassed: 9\n') contents = constant_file.read_text() assert f'PYTHON_VERSION = {OLD_PYTHON_VERSION!r}' not in contents @@ -56,3 +56,17 @@ def test_update_py_config(fake_repo, ddev): contents = template_file.read_text() assert f'Programming Language :: Python :: {OLD_PYTHON_VERSION}' not in contents assert f'Programming Language :: Python :: {NEW_PYTHON_VERSION}' in contents + + # Explicit content assertions on the rewritten ddev/pyproject.toml: this + # captures the actual contract `update_ddev_pyproject_file` upholds today + # (no `[tool.black]` block survives, 
and `[tool.ruff].target-version` is + # the only target-version key bumped). Counter-only assertions above can + # mask regressions where some unrelated tracker increment compensates for + # a missed rewrite. + ddev_pyproject = fake_repo.path / 'ddev' / 'pyproject.toml' + new_target_token = f"py{NEW_PYTHON_VERSION.replace('.', '')}" + old_target_token = f"py{OLD_PYTHON_VERSION.replace('.', '')}" + contents = ddev_pyproject.read_text() + assert '[tool.black]' not in contents + assert f'target-version = "{new_target_token}"' in contents + assert f'target-version = "{old_target_token}"' not in contents diff --git a/docs/developer/.snippets/links.txt b/docs/developer/.snippets/links.txt index 48502a9038fb6..d35f578bd9dbc 100644 --- a/docs/developer/.snippets/links.txt +++ b/docs/developer/.snippets/links.txt @@ -11,7 +11,6 @@ [azp-templates-linux]: https://github.com/DataDog/integrations-core/blob/master/.azure-pipelines/templates/test-single-linux.yml [azp-templates-setup]: https://github.com/DataDog/integrations-core/blob/master/.azure-pipelines/templates/set-up-integrations.yml [azp-templates-windows]: https://github.com/DataDog/integrations-core/blob/master/.azure-pipelines/templates/test-single-windows.yml -[black-github]: https://github.com/psf/black [click-github]: https://github.com/pallets/click [config-spec-example-consumer]: https://github.com/DataDog/integrations-core/blob/master/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/example.py [config-spec-model-consumer]: https://github.com/DataDog/integrations-core/blob/master/datadog_checks_dev/datadog_checks/dev/tooling/configuration/consumers/model.py @@ -116,6 +115,7 @@ [python-markdown-extensions]: https://github.com/Python-Markdown/markdown/wiki/Third-Party-Extensions [python-pdb]: https://docs.python.org/3/library/pdb.html [requests-github]: https://github.com/psf/requests +[ruff-github]: https://github.com/astral-sh/ruff [requests-unixsocket-pypi]: 
https://pypi.org/project/requests-unixsocket/ [release-base.yml]: https://github.com/DataDog/integrations-core/blob/master/.github/workflows/release-base.yml [release-dev.yml]: https://github.com/DataDog/integrations-core/blob/master/.github/workflows/release-dev.yml diff --git a/docs/developer/guidelines/style.md b/docs/developer/guidelines/style.md index 909b33c0ffe45..179aad3e1faa2 100644 --- a/docs/developer/guidelines/style.md +++ b/docs/developer/guidelines/style.md @@ -4,45 +4,27 @@ These are all the checkers used by our [style enforcement](../ddev/plugins.md#style). -## [black][black-github] +## [ruff][ruff-github] -An opinionated formatter, like JavaScript's [prettier][prettier-github] and Golang's [gofmt][gofmt-docs]. +An extremely fast Python linter and formatter that subsumes the roles previously played by black, isort, flake8, and several flake8 plugins. The repo's centralized configuration lives under `[tool.ruff]` in the root `pyproject.toml`; per-integration `hatch lint` envs install a pinned ruff version. -## [isort][isort-github] +### Formatting -A tool to sort imports lexicographically, by section, and by type. We use the 5 standard sections: `__future__`, stdlib, third party, first party, and local. +`ruff format` is the formatter (a drop-in for black). Quote style is set to `preserve` so existing single-quoted strings are kept as-is. Run via `ddev test -fs <INTEGRATION>`. -`datadog_checks` is configured as a first party namespace. +### Linting -## [flake8][flake8-github] +`ruff check` enforces the rule sets selected in `[tool.ruff.lint]`: -An easy-to-use wrapper around [pycodestyle][pycodestyle-github] and [pyflakes][pyflakes-github]. We select everything it provides and only ignore a few things to give precedence to other tools.
+- `E`, `W` — pycodestyle errors and warnings +- `F` — pyflakes +- `B` — flake8-bugbear (likely bugs and design problems) +- `C` — mccabe complexity +- `G` — flake8-logging-format (consistent logging format) +- `I` — isort (`datadog_checks` is configured as a first-party namespace) +- `TID252` — flake8-tidy-imports (no relative imports of parent modules) -### [bugbear][flake8-bugbear-github] - -A `flake8` plugin for finding likely bugs and design problems in programs. We enable: - -- `B001`: Do not use bare `except:`, it also catches unexpected events like memory errors, interrupts, system exit, and so on. Prefer `except Exception:`. -- `B003`: Assigning to `os.environ` doesn't clear the environment. Subprocesses are going to see outdated variables, in disagreement with the current process. Use `os.environ.clear()` or the `env=` argument to Popen. -- `B006`: Do not use mutable data structures for argument defaults. All calls reuse one instance of that data structure, persisting changes between them. -- `B007`: Loop control variable not used within the loop body. If this is intended, start the name with an underscore. -- `B301`: Python 3 does not include `.iter*` methods on dictionaries. The default behavior is to return iterables. Simply remove the `iter` prefix from the method. For Python 2 compatibility, also prefer the Python 3 equivalent if you expect that the size of the dict to be small and bounded. The performance regression on Python 2 will be negligible and the code is going to be the clearest. Alternatively, use `six.iter*`. -- `B305`: `.next()` is not a thing on Python 3. Use the `next()` builtin. For Python 2 compatibility, use `six.next()`. -- `B306`: `BaseException.message` has been deprecated as of Python 2.6 and is removed in Python 3. Use `str(e)` to access the user-readable message. Use `e.args` to access arguments passed to the exception. -- `B902`: Invalid first argument used for method. 
Use `self` for instance methods, and `cls` for class methods. - -### [logging-format][flake8-logging-format-github] - -A `flake8` plugin for ensuring a consistent logging format. We enable: - -- `G001`: Logging statements should not use `string.format()` for their first argument -- `G002`: Logging statements should not use `%` formatting for their first argument -- `G003`: Logging statements should not use `+` concatenation for their first argument -- `G004`: Logging statements should not use `f"..."` for their first argument (only in Python 3.6+) -- `G010`: Logging statements should not use `warn` (use `warning` instead) -- `G100`: Logging statements should not use `extra` arguments unless whitelisted -- `G201`: Logging statements should not use `error(..., exc_info=True)` (use `exception(...)` instead) -- `G202`: Logging statements should not use redundant `exc_info=True` in `exception` +Run via `ddev test -ls <INTEGRATION>`. Use `--fix` to auto-apply fixes where ruff can. ## [Mypy][mypy-github] diff --git a/docs/developer/index.md b/docs/developer/index.md index 161ac6ca58803..87266fe6af68a 100644 --- a/docs/developer/index.md +++ b/docs/developer/index.md @@ -4,7 +4,7 @@ [![GitHub contributors](https://img.shields.io/github/contributors/DataDog/integrations-core)](https://github.com/DataDog/integrations-core) [![Downloads](https://pepy.tech/badge/datadog-checks-dev)](https://pepy.tech/project/datadog-checks-dev) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/datadog-checks-dev)](https://pypi.org/project/datadog-checks-dev) -[![Code style - black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Linting - Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![License - BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-9400d3.svg)](https://choosealicense.com/licenses/bsd-3-clause) ----- diff --git
a/docs/developer/testing.md b/docs/developer/testing.md index 1169f2bb46667..061ecbaf8b156 100644 --- a/docs/developer/testing.md +++ b/docs/developer/testing.md @@ -16,9 +16,9 @@ $ ddev test postgres -l ┏━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ Name ┃ Type ┃ Features ┃ Dependencies ┃ Environment variables ┃ Scripts ┃ ┡━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ -│ lint │ virtual │ │ black==22.12.0 │ │ all │ -│ │ │ │ pydantic==2.7.3 │ │ fmt │ -│ │ │ │ ruff==0.0.257 │ │ style │ +│ lint │ virtual │ │ ruff==0.11.10 │ │ all │ +│ │ │ │ pydantic==2.11.5 │ │ fmt │ +│ │ │ │ pip │ │ style │ ├────────┼─────────┼──────────┼─────────────────┼─────────────────────────┼───────────┤ │ latest │ virtual │ deps │ │ POSTGRES_VERSION=latest │ benchmark │ │ │ │ │ │ │ test │ diff --git a/kafka_actions/changelog.d/23588.fixed b/kafka_actions/changelog.d/23588.fixed new file mode 100644 index 0000000000000..f9abf12736f81 --- /dev/null +++ b/kafka_actions/changelog.d/23588.fixed @@ -0,0 +1 @@ +Reformat auto-generated `config_models` files following the migration of the model formatter from black to ruff. No behavior or type-contract change. 
diff --git a/kafka_actions/datadog_checks/kafka_actions/config_models/instance.py b/kafka_actions/datadog_checks/kafka_actions/config_models/instance.py index 2c4fa36ed500b..ac656dd818ecd 100644 --- a/kafka_actions/datadog_checks/kafka_actions/config_models/instance.py +++ b/kafka_actions/datadog_checks/kafka_actions/config_models/instance.py @@ -92,7 +92,9 @@ class ProduceMessage(BaseModel): examples=['MTIzNDU='], ) partition: Optional[int] = Field( - -1, description='Specific partition (-1 for automatic partitioning)', examples=[-1] + -1, + description='Specific partition (-1 for automatic partitioning)', + examples=[-1], ) topic: str = Field(..., description='Topic to produce to', examples=['test-topic']) value: str = Field( @@ -141,7 +143,9 @@ class ReadMessages(BaseModel): examples=[10], ) partition: Optional[int] = Field( - -1, description='Specific partition to read from (-1 for all partitions)', examples=[-1] + -1, + description='Specific partition to read from (-1 for all partitions)', + examples=[-1], ) start_offset: Optional[int] = Field( -1, @@ -217,7 +221,10 @@ class UpdateConsumerGroupOffsets(BaseModel): ..., description='List of topic-partition-offset tuples to update', examples=[ - [{'offset': 1000, 'partition': 0, 'topic': 'orders'}, {'offset': 1500, 'partition': 1, 'topic': 'orders'}] + [ + {'offset': 1000, 'partition': 0, 'topic': 'orders'}, + {'offset': 1500, 'partition': 1, 'topic': 'orders'}, + ] ], ) @@ -234,10 +241,14 @@ class UpdateTopicConfig(BaseModel): examples=[{'max.message.bytes': '2097152', 'retention.ms': '1209600000'}], ) delete_configs: Optional[tuple[str, ...]] = Field( - None, description='Configuration keys to reset to defaults', examples=[['retention.bytes', 'compression.type']] + None, + description='Configuration keys to reset to defaults', + examples=[['retention.bytes', 'compression.type']], ) num_partitions: Optional[int] = Field( - None, description='New partition count (can only increase, cannot decrease)', 
examples=[12] + None, + description='New partition count (can only increase, cannot decrease)', + examples=[12], ) topic: str = Field(..., description='Topic name to update', examples=['orders']) diff --git a/pyproject.toml b/pyproject.toml index 0e419fec84120..c897094bded8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,3 @@ -# NOTE: You have to use single-quoted strings in TOML for regular expressions. -# It's the equivalent of r-strings in Python. Multiline strings are treated as -# verbose regular expressions by Black. Use [ ] to denote a significant space -# character. - -# Keep Black configuration to generate models through validate -# Switch to Ruff after it provides a Python API -[tool.black] -include = '\.pyi?$' -line-length = 120 -skip-string-normalization = true -target-version = ["py313"] - - [tool.mypy] plugins = "pydantic.mypy" # Follows imports and type-check imported modules. @@ -54,7 +40,6 @@ exclude = [ "**/datadog_checks/dev/tooling/signing.py", "**/datadog_checks/*/vendor/*", # Avoid linting or formatting autogenerated files from ddev validate models - # They are formatted with Black and Ruff does not yet offer an API to format them "**/config_models/instance.py", "**/config_models/shared.py", "**/config_models/defaults.py", diff --git a/win32_event_log/changelog.d/23588.fixed b/win32_event_log/changelog.d/23588.fixed new file mode 100644 index 0000000000000..f9abf12736f81 --- /dev/null +++ b/win32_event_log/changelog.d/23588.fixed @@ -0,0 +1 @@ +Reformat auto-generated `config_models` files following the migration of the model formatter from black to ruff. No behavior or type-contract change. 
diff --git a/win32_event_log/datadog_checks/win32_event_log/config_models/instance.py b/win32_event_log/datadog_checks/win32_event_log/config_models/instance.py index f4f6789ab4c22..fd03ea3eaa7cc 100644 --- a/win32_event_log/datadog_checks/win32_event_log/config_models/instance.py +++ b/win32_event_log/datadog_checks/win32_event_log/config_models/instance.py @@ -28,7 +28,16 @@ class Filters(BaseModel): id: Optional[tuple[int, ...]] = None source: Optional[tuple[str, ...]] = None type: Optional[ - tuple[Literal['success', 'error', 'warning', 'information', 'success audit', 'failure audit'], ...] + tuple[ + Literal[ + 'success', + 'error', + 'warning', + 'information', + 'success audit', + 'failure audit', + ], ... + ] ] = None diff --git a/yarn/changelog.d/23588.fixed b/yarn/changelog.d/23588.fixed new file mode 100644 index 0000000000000..f9abf12736f81 --- /dev/null +++ b/yarn/changelog.d/23588.fixed @@ -0,0 +1 @@ +Reformat auto-generated `config_models` files following the migration of the model formatter from black to ruff. No behavior or type-contract change. diff --git a/yarn/datadog_checks/yarn/config_models/instance.py b/yarn/datadog_checks/yarn/config_models/instance.py index 87d0c60057d2b..90acca2a4bc69 100644 --- a/yarn/datadog_checks/yarn/config_models/instance.py +++ b/yarn/datadog_checks/yarn/config_models/instance.py @@ -73,7 +73,18 @@ class InstanceConfig(BaseModel): collect_apps_all_states: Optional[bool] = None collect_apps_states_list: Optional[ tuple[ - Literal['ALL', 'NEW', 'NEW_SAVING', 'SUBMITTED', 'ACCEPTED', 'RUNNING', 'FINISHED', 'FAILED', 'KILLED'], ... 
+ Literal[ + 'ALL', + 'NEW', + 'NEW_SAVING', + 'SUBMITTED', + 'ACCEPTED', + 'RUNNING', + 'FINISHED', + 'FAILED', + 'KILLED', + ], + ..., ] ] = None collect_node_metrics: Optional[bool] = None diff --git a/yarn/datadog_checks/yarn/config_models/shared.py b/yarn/datadog_checks/yarn/config_models/shared.py index 79e25562b8366..4426500b93321 100644 --- a/yarn/datadog_checks/yarn/config_models/shared.py +++ b/yarn/datadog_checks/yarn/config_models/shared.py @@ -39,7 +39,17 @@ class SharedConfig(BaseModel): collect_apps_all_states: Optional[bool] = None collect_apps_states_list: Optional[ tuple[ - Literal['ALL', 'NEW', 'NEW_SAVING', 'SUBMITTED', 'ACCEPTED', 'RUNNING', 'FINISHED', 'FAILED', 'KILLED'], ... + Literal[ + 'ALL', + 'NEW', + 'NEW_SAVING', + 'SUBMITTED', + 'ACCEPTED', + 'RUNNING', + 'FINISHED', + 'FAILED', + 'KILLED', + ], ... ] ] = None proxy: Optional[Proxy] = None From a034114605d3f30d1365040e05e5cf38256dedc6 Mon Sep 17 00:00:00 2001 From: Lucia Date: Fri, 19 Jun 2026 14:10:02 +0200 Subject: [PATCH 4/4] Add AI config block to ddev config model (#23894) * Add ai block to ddev config model for Anthropic API key and flow dirs. Co-Authored-By: Claude Sonnet 4.6 * Add changelog for ddev AI config * Cover ddev AI config display * Fix changelog filename to match PR #23894. Co-Authored-By: Claude Sonnet 4.6 * Move get_anthropic_api_key to top of model.py and add precedence test Move get_anthropic_api_key() next to get_github_token() so all module-level env-var helpers are grouped before any class definitions. Add a test verifying config-file value takes precedence over DD_ANTHROPIC_API_KEY env var. 
Co-Authored-By: Claude Sonnet 4.6 * refactor(ddev/config): replace hardcoded scrub_config with glob-based _scrub_path helper - Extract _scrub_path to scrub arbitrary nested config paths using dot-notation globs - Replace per-field scrub_config logic with a SCRUBBED_GLOBS-driven loop Rationale: makes adding new sensitive fields trivial without touching scrubbing logic This commit made by [/dd:git:commit:quick](https://github.com/DataDog/claude-marketplace/tree/main/dd/commands/git/commit/quick.md) * Add unit tests for scrub_config traversal semantics Co-Authored-By: Claude Sonnet 4.6 * Use app.config.ai.anthropic_api_key in dynamicd command * Mention DD_ANTHROPIC_API_KEY in missing-key error message Co-Authored-By: Claude Sonnet 4.6 --------- Co-authored-by: Claude Sonnet 4.6 --- ddev/changelog.d/23894.added | 1 + .../ddev/cli/meta/scripts/_dynamicd/cli.py | 13 +- ddev/src/ddev/config/model.py | 76 ++++++++ ddev/src/ddev/config/utils.py | 45 +++-- ddev/tests/cli/config/test_show.py | 13 ++ ddev/tests/cli/meta/scripts/test_dynamicd.py | 37 ++++ ddev/tests/config/test_model.py | 169 +++++++++++++++++- ddev/tests/config/test_utils.py | 82 +++++++++ 8 files changed, 411 insertions(+), 25 deletions(-) create mode 100644 ddev/changelog.d/23894.added create mode 100644 ddev/tests/config/test_utils.py diff --git a/ddev/changelog.d/23894.added b/ddev/changelog.d/23894.added new file mode 100644 index 0000000000000..b2bbe88f404b6 --- /dev/null +++ b/ddev/changelog.d/23894.added @@ -0,0 +1 @@ +Add an AI configuration block for ddev. 
diff --git a/ddev/src/ddev/cli/meta/scripts/_dynamicd/cli.py b/ddev/src/ddev/cli/meta/scripts/_dynamicd/cli.py index cb0b4a2e4be73..845f7e485e48f 100644 --- a/ddev/src/ddev/cli/meta/scripts/_dynamicd/cli.py +++ b/ddev/src/ddev/cli/meta/scripts/_dynamicd/cli.py @@ -5,7 +5,6 @@ from __future__ import annotations -import os from typing import TYPE_CHECKING import click @@ -78,19 +77,15 @@ def _get_api_keys(app: Application) -> tuple[str, str]: Returns (llm_api_key, dd_api_key) or aborts if not configured. """ - # Get LLM API key from config or environment variable - llm_api_key = app.config.raw_data.get("dynamicd", {}).get("llm_api_key") - if not llm_api_key: - llm_api_key = os.environ.get("ANTHROPIC_API_KEY") + llm_api_key = app.config.ai.anthropic_api_key if not llm_api_key: app.display_error( - "LLM API key not configured. Either:\n" - " 1. Set env var: export ANTHROPIC_API_KEY=\n" - " 2. Or run: ddev config set dynamicd.llm_api_key " + "Anthropic API key not configured. Either:\n" + " 1. Set env var: export ANTHROPIC_API_KEY= (or DD_ANTHROPIC_API_KEY)\n" + " 2. 
Or run: ddev config set ai.anthropic_api_key " ) app.abort() - # Get Datadog API key from org config dd_api_key = app.config.org.config.get("api_key") if not dd_api_key: app.display_error( diff --git a/ddev/src/ddev/config/model.py b/ddev/src/ddev/config/model.py index 493cc089a4938..5a1c9ebe986b5 100644 --- a/ddev/src/ddev/config/model.py +++ b/ddev/src/ddev/config/model.py @@ -19,6 +19,10 @@ def get_github_token(): return os.environ.get('DD_GITHUB_TOKEN', '') or os.environ.get('GH_TOKEN', '') or os.environ.get('GITHUB_TOKEN', '') +def get_anthropic_api_key(): + return os.environ.get('DD_ANTHROPIC_API_KEY', '') or os.environ.get('ANTHROPIC_API_KEY', '') + + class ConfigurationError(Exception): def __init__(self, *args, location): self.location = location @@ -71,6 +75,7 @@ def __init__(self, *args, **kwargs): self._field_pypi = FIELD_TO_PARSE self._field_trello = FIELD_TO_PARSE self._field_terminal = FIELD_TO_PARSE + self._field_ai = FIELD_TO_PARSE self._field_upgrade_check = FIELD_TO_PARSE @property @@ -331,6 +336,27 @@ def terminal(self, value): self.raw_data['terminal'] = value self._field_terminal = FIELD_TO_PARSE + @property + def ai(self): + if self._field_ai is FIELD_TO_PARSE: + if 'ai' in self.raw_data: + ai = self.raw_data['ai'] + if not isinstance(ai, dict): + self.raise_error('must be a table') + + self._field_ai = AIConfig(ai, ('ai',)) + else: + ai = {} + self.raw_data['ai'] = ai + self._field_ai = AIConfig(ai, ('ai',)) + + return self._field_ai + + @ai.setter + def ai(self, value): + self.raw_data['ai'] = value + self._field_ai = FIELD_TO_PARSE + class RepoConfig(LazilyParsedConfig): def __init__(self, *args, **kwargs): @@ -780,3 +806,53 @@ def spinner(self): def spinner(self, value): self.raw_data['spinner'] = value self._field_spinner = FIELD_TO_PARSE + + +class AIConfig(LazilyParsedConfig): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._field_anthropic_api_key = FIELD_TO_PARSE + self._field_flow_dirs = 
FIELD_TO_PARSE + + @property + def anthropic_api_key(self): + if self._field_anthropic_api_key is FIELD_TO_PARSE: + if 'anthropic_api_key' in self.raw_data: + key = self.raw_data['anthropic_api_key'] + if not isinstance(key, str): + self.raise_error('must be a string') + + self._field_anthropic_api_key = key + else: + self._field_anthropic_api_key = get_anthropic_api_key() + + return self._field_anthropic_api_key + + @anthropic_api_key.setter + def anthropic_api_key(self, value): + self.raw_data['anthropic_api_key'] = value + self._field_anthropic_api_key = FIELD_TO_PARSE + + @property + def flow_dirs(self): + if self._field_flow_dirs is FIELD_TO_PARSE: + if 'flow_dirs' in self.raw_data: + flow_dirs = self.raw_data['flow_dirs'] + if not isinstance(flow_dirs, list): + self.raise_error('must be an array') + + for i, entry in enumerate(flow_dirs): + if not isinstance(entry, str): + self.raise_error('must be a string', extra_steps=(str(i),)) + + self._field_flow_dirs = flow_dirs + else: + self._field_flow_dirs = self.raw_data['flow_dirs'] = [] + + return self._field_flow_dirs + + @flow_dirs.setter + def flow_dirs(self, value): + self.raw_data['flow_dirs'] = value + self._field_flow_dirs = FIELD_TO_PARSE diff --git a/ddev/src/ddev/config/utils.py b/ddev/src/ddev/config/utils.py index fbe211fc5d7f0..d799e98d712c0 100644 --- a/ddev/src/ddev/config/utils.py +++ b/ddev/src/ddev/config/utils.py @@ -7,7 +7,14 @@ from ddev.utils.fs import Path SCRUBBED_VALUE = '*****' -SCRUBBED_GLOBS = ('github.token', 'pypi.auth', 'trello.token', 'orgs.*.api_key', 'orgs.*.app_key') +SCRUBBED_GLOBS = ( + 'github.token', + 'pypi.auth', + 'trello.token', + 'orgs.*.api_key', + 'orgs.*.app_key', + 'ai.anthropic_api_key', +) def save_toml_document(document: TOMLDocument, path: Path): @@ -23,17 +30,25 @@ def load_toml_data(path: Path) -> dict: return tomlkit.loads(path.read_text()) -def scrub_config(config: dict): - if 'token' in config.get('github', {}): - config['github']['token'] = SCRUBBED_VALUE 
- - if 'auth' in config.get('pypi', {}): - config['pypi']['auth'] = SCRUBBED_VALUE - - if 'token' in config.get('trello', {}): - config['trello']['token'] = SCRUBBED_VALUE - - for data in config.get('orgs', {}).values(): - for key in ('api_key', 'app_key'): - if key in data: - data[key] = SCRUBBED_VALUE +def _scrub_path(config: dict, path: str) -> None: + parts = path.split('.') + nodes = [config] + for part in parts[:-1]: + next_nodes: list[dict] = [] + for node in nodes: + if not isinstance(node, dict): + continue + if part == '*': + next_nodes.extend(node.values()) + elif part in node: + next_nodes.append(node[part]) + nodes = next_nodes + leaf = parts[-1] + for node in nodes: + if isinstance(node, dict) and leaf in node: + node[leaf] = SCRUBBED_VALUE + + +def scrub_config(config: dict) -> None: + for glob in SCRUBBED_GLOBS: + _scrub_path(config, glob) diff --git a/ddev/tests/cli/config/test_show.py b/ddev/tests/cli/config/test_show.py index c33ec3fe92f95..71d5703cc5920 100644 --- a/ddev/tests/cli/config/test_show.py +++ b/ddev/tests/cli/config/test_show.py @@ -57,6 +57,10 @@ waiting = "bold magenta" debug = "bold" spinner = "simpleDotsScrolling" + +[ai] +flow_dirs = [] +anthropic_api_key = "*****" """ EXPECTED_NON_SCRUBBED_OUTPUT = f""" @@ -105,6 +109,10 @@ waiting = "bold magenta" debug = "bold" spinner = "simpleDotsScrolling" + +[ai] +flow_dirs = [] +anthropic_api_key = "sk-test" """ @@ -114,6 +122,7 @@ def valid_config_file(config_file): config_file.model.orgs['default']['api_key'] = 'foo' config_file.model.orgs['default']['app_key'] = 'bar' config_file.model.github = {'user': '', 'token': ''} + config_file.model.ai.anthropic_api_key = 'sk-test' config_file.save() @@ -182,6 +191,10 @@ def build_expected_output_with_line_sources(expected: str, config_file: ConfigFi 42: 'GlobalConfig:43', 43: 'GlobalConfig:44', 44: 'GlobalConfig:45', + # Blank line + 46: 'GlobalConfig:47', + 47: 'GlobalConfig:48', + 48: 'GlobalConfig:49', } # Add a blank line at the end to 
match the expected output diff --git a/ddev/tests/cli/meta/scripts/test_dynamicd.py b/ddev/tests/cli/meta/scripts/test_dynamicd.py index af73ffa9616ac..f260d515bdc9c 100644 --- a/ddev/tests/cli/meta/scripts/test_dynamicd.py +++ b/ddev/tests/cli/meta/scripts/test_dynamicd.py @@ -7,6 +7,8 @@ and tag values from real integration dashboards. """ +from unittest.mock import MagicMock + import pytest from ddev.repo.core import Repository @@ -141,6 +143,41 @@ def test_to_prompt_context_generates_string(self, real_repo): assert 'redis' in prompt.lower() +class TestGetApiKeys: + """Tests for _get_api_keys helper.""" + + def _make_app(self, anthropic_api_key: str, dd_api_key: str) -> MagicMock: + app = MagicMock() + app.config.ai.anthropic_api_key = anthropic_api_key + app.config.org.config = {"api_key": dd_api_key} + return app + + def test_returns_both_keys_when_configured(self): + from ddev.cli.meta.scripts._dynamicd.cli import _get_api_keys + + app = self._make_app("sk-ant-test", "dd-key-123") + llm_key, dd_key = _get_api_keys(app) + + assert llm_key == "sk-ant-test" + assert dd_key == "dd-key-123" + + def test_aborts_when_llm_key_missing(self): + from ddev.cli.meta.scripts._dynamicd.cli import _get_api_keys + + app = self._make_app("", "dd-key-123") + _get_api_keys(app) + + app.abort.assert_called_once() + + def test_aborts_when_dd_key_missing(self): + from ddev.cli.meta.scripts._dynamicd.cli import _get_api_keys + + app = self._make_app("sk-ant-test", "") + _get_api_keys(app) + + app.abort.assert_called_once() + + class TestReadMetrics: """Tests for _read_metrics function.""" diff --git a/ddev/tests/config/test_model.py b/ddev/tests/config/test_model.py index 6170b4cd4abac..010a9b5207fbf 100644 --- a/ddev/tests/config/test_model.py +++ b/ddev/tests/config/test_model.py @@ -5,7 +5,12 @@ import pytest -from ddev.config.model import ConfigurationError, RootConfig, get_github_token, get_github_user +from ddev.config.model import ( + ConfigurationError, + RootConfig, + 
get_github_token, + get_github_user, +) def test_default(): @@ -55,6 +60,9 @@ def test_default(): 'spinner': 'simpleDotsScrolling', }, }, + 'ai': { + 'flow_dirs': [], + }, } @@ -1474,3 +1482,162 @@ def test_github_config_with_environment_variables(self, monkeypatch): # raw_data should still be empty assert config.raw_data['github'] == {} + + +class TestAI: + def test_default(self, monkeypatch): + monkeypatch.delenv('DD_ANTHROPIC_API_KEY', raising=False) + monkeypatch.delenv('ANTHROPIC_API_KEY', raising=False) + config = RootConfig({}) + + assert config.ai.anthropic_api_key == config.ai.anthropic_api_key == '' + assert config.ai.flow_dirs == config.ai.flow_dirs == [] + assert config.raw_data == {'ai': {'flow_dirs': []}} + + def test_not_table(self, helpers): + config = RootConfig({'ai': 9000}) + + with pytest.raises( + ConfigurationError, + match=helpers.dedent( + """ + Error parsing config: + ai + must be a table""" + ), + ): + _ = config.ai + + def test_set_lazy_error(self, helpers): + config = RootConfig({}) + + config.ai = 9000 + assert config.raw_data == {'ai': 9000} + + with pytest.raises( + ConfigurationError, + match=helpers.dedent( + """ + Error parsing config: + ai + must be a table""" + ), + ): + _ = config.ai + + def test_anthropic_api_key_from_config(self): + config = RootConfig({'ai': {'anthropic_api_key': 'sk-test'}}) + + assert config.ai.anthropic_api_key == 'sk-test' + assert config.raw_data == {'ai': {'anthropic_api_key': 'sk-test'}} + + def test_anthropic_api_key_dd_env_var(self, monkeypatch): + monkeypatch.setenv('DD_ANTHROPIC_API_KEY', 'dd-key') + monkeypatch.delenv('ANTHROPIC_API_KEY', raising=False) + config = RootConfig({}) + + assert config.ai.anthropic_api_key == 'dd-key' + assert config.raw_data == {'ai': {}} + + def test_anthropic_api_key_env_var(self, monkeypatch): + monkeypatch.delenv('DD_ANTHROPIC_API_KEY', raising=False) + monkeypatch.setenv('ANTHROPIC_API_KEY', 'anth-key') + config = RootConfig({}) + + assert 
config.ai.anthropic_api_key == 'anth-key' + assert config.raw_data == {'ai': {}} + + def test_anthropic_api_key_config_takes_precedence_over_env(self, monkeypatch): + monkeypatch.setenv('DD_ANTHROPIC_API_KEY', 'env-key') + config = RootConfig({'ai': {'anthropic_api_key': 'config-key'}}) + + assert config.ai.anthropic_api_key == 'config-key' + + def test_anthropic_api_key_dd_takes_precedence(self, monkeypatch): + monkeypatch.setenv('DD_ANTHROPIC_API_KEY', 'dd-key') + monkeypatch.setenv('ANTHROPIC_API_KEY', 'anth-key') + config = RootConfig({}) + + assert config.ai.anthropic_api_key == 'dd-key' + + def test_anthropic_api_key_not_string(self, helpers): + config = RootConfig({'ai': {'anthropic_api_key': 9000}}) + + with pytest.raises( + ConfigurationError, + match=helpers.dedent( + """ + Error parsing config: + ai -> anthropic_api_key + must be a string""" + ), + ): + _ = config.ai.anthropic_api_key + + def test_anthropic_api_key_set_lazy_error(self, helpers): + config = RootConfig({}) + + config.ai.anthropic_api_key = 9000 + assert config.raw_data == {'ai': {'anthropic_api_key': 9000}} + + with pytest.raises( + ConfigurationError, + match=helpers.dedent( + """ + Error parsing config: + ai -> anthropic_api_key + must be a string""" + ), + ): + _ = config.ai.anthropic_api_key + + def test_flow_dirs(self): + config = RootConfig({'ai': {'flow_dirs': ['~/foo', './bar', '../baz']}}) + + assert config.ai.flow_dirs == ['~/foo', './bar', '../baz'] + assert config.raw_data == {'ai': {'flow_dirs': ['~/foo', './bar', '../baz']}} + + def test_flow_dirs_not_list(self, helpers): + config = RootConfig({'ai': {'flow_dirs': 9000}}) + + with pytest.raises( + ConfigurationError, + match=helpers.dedent( + """ + Error parsing config: + ai -> flow_dirs + must be an array""" + ), + ): + _ = config.ai.flow_dirs + + def test_flow_dirs_entry_not_string(self, helpers): + config = RootConfig({'ai': {'flow_dirs': [9000]}}) + + with pytest.raises( + ConfigurationError, + match=helpers.dedent( + """ 
+ Error parsing config: + ai -> flow_dirs -> 0 + must be a string""" + ), + ): + _ = config.ai.flow_dirs + + def test_flow_dirs_set_lazy_error(self, helpers): + config = RootConfig({}) + + config.ai.flow_dirs = 9000 + assert config.raw_data == {'ai': {'flow_dirs': 9000}} + + with pytest.raises( + ConfigurationError, + match=helpers.dedent( + """ + Error parsing config: + ai -> flow_dirs + must be an array""" + ), + ): + _ = config.ai.flow_dirs diff --git a/ddev/tests/config/test_utils.py b/ddev/tests/config/test_utils.py new file mode 100644 index 0000000000000..6ac45173deb5a --- /dev/null +++ b/ddev/tests/config/test_utils.py @@ -0,0 +1,82 @@ +# (C) Datadog, Inc. 2022-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import pytest + +from ddev.config.utils import SCRUBBED_VALUE, scrub_config + + +@pytest.fixture +def full_config(): + return { + 'orgs': { + 'a': {'api_key': 'A', 'app_key': 'B'}, + 'b': {'api_key': 'C'}, + }, + 'github': {'token': 'gh-secret'}, + 'pypi': {'auth': 'pypi-secret'}, + 'trello': {'token': 'trello-secret'}, + 'ai': {'anthropic_api_key': 'sk-test'}, + } + + +def test_scrub_wildcard_siblings(full_config): + scrub_config(full_config) + + assert full_config['orgs']['a']['api_key'] == SCRUBBED_VALUE + assert full_config['orgs']['a']['app_key'] == SCRUBBED_VALUE + assert full_config['orgs']['b']['api_key'] == SCRUBBED_VALUE + + +def test_scrub_top_level_secrets(full_config): + scrub_config(full_config) + + assert full_config['github']['token'] == SCRUBBED_VALUE + assert full_config['pypi']['auth'] == SCRUBBED_VALUE + assert full_config['trello']['token'] == SCRUBBED_VALUE + assert full_config['ai']['anthropic_api_key'] == SCRUBBED_VALUE + + +def test_scrub_non_secret_fields_untouched(full_config): + full_config['orgs']['a']['site'] = 'datadoghq.com' + scrub_config(full_config) + + assert full_config['orgs']['a']['site'] == 'datadoghq.com' + + +def test_scrub_missing_org_keys_does_not_crash(): + config 
= {'orgs': {'a': {'api_key': 'A'}}} + scrub_config(config) + + assert config['orgs']['a']['api_key'] == SCRUBBED_VALUE + + +def test_scrub_non_dict_org_value_does_not_crash(): + config = {'orgs': {'bad': 'text'}} + scrub_config(config) + + assert config['orgs']['bad'] == 'text' + + +def test_scrub_mixed_dict_and_non_dict_org_values(): + config = {'orgs': {'a': {'api_key': 'A'}, 'b': 'text'}} + scrub_config(config) + + assert config['orgs']['a']['api_key'] == SCRUBBED_VALUE + assert config['orgs']['b'] == 'text' + + +def test_scrub_missing_top_level_section_does_not_crash(): + scrub_config({}) + + +def test_scrub_missing_nested_section_does_not_crash(): + scrub_config({'github': {}}) + scrub_config({'orgs': {}}) + + +def test_scrub_already_empty_string(): + config = {'github': {'token': ''}} + scrub_config(config) + + assert config['github']['token'] == SCRUBBED_VALUE