From 12c11b54b2cfc0ad281605616b5a2c980ae3ad1e Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Wed, 27 May 2026 09:34:58 +0100 Subject: [PATCH 1/4] Show exact run URL and add lifecycle comment to wheel promotion (#23828) * Show exact run URL and add lifecycle comment to wheel promotion - Extend dispatch_workflow with return_run_details so callers can get back the new run's html_url instead of a generic recent-runs link. - ddev dep promote now prints the exact workflow run URL and suppresses noisy httpx request logs around the API calls. - Replace the single success comment in dependency-wheel-promotion.yaml with a lifecycle comment that updates on start, success, and failure, scoped per (PR, head SHA) via a hidden marker so re-dispatches edit the same comment. * Harden lifecycle comment chaining and github-script inputs - Started-comment step now references find_comment.outputs.comment-id (the previous version pointed at its own step output, so re-dispatches for the same SHA would not have updated the existing comment). - Pass inputs.head_sha into actions/github-script via env: HEAD_SHA and read process.env.HEAD_SHA in the script body, so a hostile workflow_dispatch input cannot break out of the JS string literal and execute arbitrary code. * Type-narrow dispatch_workflow and bail out cleanly on missing run details - Add Literal[True]/Literal[False] overloads to GitHubManager.dispatch_workflow so callers asking for run details get a non-nullable dict back at the type level. - Replace the bare assert in ddev dep promote with an explicit app.abort, run the validity check before printing the success message, and keep the success output inside the httpx-suppression scope. - Add ddev/changelog.d/23828.added so the PR-changelog check passes for the ddev source changes. 
- Lift the github credentials setup into ddev/tests/cli/dep/conftest.py as an autouse fixture, hoist the test-side logging import, and add coverage for the no-run-details abort path and the failure-path httpx level restoration. - Match the cleaner api_post.call_args.kwargs form already used in the companion test in tests/utils/test_github.py. * Trim runtime imports and share httpx-debug fixture across promote tests - Move Any and Literal under TYPE_CHECKING in github.py; they are only used inside annotations that PEP 563 keeps as strings, so they have no runtime cost. The overload decorator stays at module scope because it runs at class definition time. - Add an httpx_at_debug fixture in tests/cli/dep/conftest.py and use it from both httpx-suppression tests so the get-logger/set-DEBUG/restore boilerplate lives in one place. * Type-annotate the new ddev/tests/cli/dep fixtures --- .../workflows/dependency-wheel-promotion.yaml | 59 +++++++++++--- ddev/changelog.d/23828.added | 1 + ddev/src/ddev/cli/dep/promote.py | 35 ++++---- ddev/src/ddev/utils/github.py | 48 +++++++++-- ddev/tests/cli/dep/conftest.py | 32 ++++++++ ddev/tests/cli/dep/test_promote.py | 79 +++++++++++++++++++ ddev/tests/utils/test_github.py | 45 +++++++++++ 7 files changed, 268 insertions(+), 31 deletions(-) create mode 100644 ddev/changelog.d/23828.added create mode 100644 ddev/tests/cli/dep/conftest.py create mode 100644 ddev/tests/cli/dep/test_promote.py diff --git a/.github/workflows/dependency-wheel-promotion.yaml b/.github/workflows/dependency-wheel-promotion.yaml index 5e26cb45bef8a..e5b522f43586e 100644 --- a/.github/workflows/dependency-wheel-promotion.yaml +++ b/.github/workflows/dependency-wheel-promotion.yaml @@ -27,6 +27,25 @@ jobs: - name: Checkout trusted code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Find existing lifecycle comment + id: find_comment + uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4.0.0 + with: + 
issue-number: ${{ inputs.pr_number }} + body-includes: "" + + - name: Post lifecycle comment (started) + id: started_comment + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5.0.0 + with: + issue-number: ${{ inputs.pr_number }} + comment-id: ${{ steps.find_comment.outputs.comment-id }} + edit-mode: replace + body: | + + Wheel promotion started for commit `${{ inputs.head_sha }}` by @${{ github.actor }}. + Workflow run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + - name: Checkout PR lockfiles only uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: @@ -62,41 +81,57 @@ jobs: - name: Set dependency-wheel-promotion status to success uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 + env: + HEAD_SHA: ${{ inputs.head_sha }} with: script: | await github.rest.repos.createCommitStatus({ owner: context.repo.owner, repo: context.repo.repo, - sha: '${{ inputs.head_sha }}', + sha: process.env.HEAD_SHA, state: 'success', context: 'dependency-wheel-promotion', description: 'Wheels promoted to stable storage.', target_url: `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`, }); - - name: Post success comment - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 + - name: Update lifecycle comment (success) + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5.0.0 with: - script: | - const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: ${{ inputs.pr_number }}, - body: `Wheels promoted to stable storage for commit ${{ inputs.head_sha }} by @${context.actor}. 
[Workflow run](${runUrl}).`, - }); + issue-number: ${{ inputs.pr_number }} + comment-id: ${{ steps.started_comment.outputs.comment-id }} + edit-mode: replace + body: | + + Wheels promoted to stable storage for commit `${{ inputs.head_sha }}` by @${{ github.actor }}. + Workflow run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - name: Set dependency-wheel-promotion status to error if: failure() uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 + env: + HEAD_SHA: ${{ inputs.head_sha }} with: script: | await github.rest.repos.createCommitStatus({ owner: context.repo.owner, repo: context.repo.repo, - sha: '${{ inputs.head_sha }}', + sha: process.env.HEAD_SHA, state: 'error', context: 'dependency-wheel-promotion', description: 'Wheel promotion failed. Check the Actions tab for details.', target_url: `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`, }); + + - name: Update lifecycle comment (failure) + if: failure() + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5.0.0 + with: + issue-number: ${{ inputs.pr_number }} + comment-id: ${{ steps.started_comment.outputs.comment-id }} + edit-mode: replace + body: | + + Wheel promotion failed for commit `${{ inputs.head_sha }}` by @${{ github.actor }}. + Workflow run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + Check the workflow logs before retrying. diff --git a/ddev/changelog.d/23828.added b/ddev/changelog.d/23828.added new file mode 100644 index 0000000000000..1a808e1d73e89 --- /dev/null +++ b/ddev/changelog.d/23828.added @@ -0,0 +1 @@ +Print the exact workflow run URL when dispatching `ddev dep promote`, via a new `return_run_details` option on `GitHubManager.dispatch_workflow`. 
diff --git a/ddev/src/ddev/cli/dep/promote.py b/ddev/src/ddev/cli/dep/promote.py index d298e7bbe1c23..c7f3210de3671 100644 --- a/ddev/src/ddev/cli/dep/promote.py +++ b/ddev/src/ddev/cli/dep/promote.py @@ -3,6 +3,7 @@ # Licensed under a 3-clause BSD style license (see LICENSE) from __future__ import annotations +import logging import re from typing import TYPE_CHECKING @@ -39,20 +40,26 @@ def promote(app: Application, pr_url: str): pr_number = int(match.group(1)) - with app.status(f'Fetching PR #{pr_number} head...'): - head_sha, head_ref = app.github.get_pr_head(pr_number) + httpx_logger = logging.getLogger('httpx') + previous_level = httpx_logger.level + httpx_logger.setLevel(logging.WARNING) + try: + with app.status(f'Fetching PR #{pr_number} head...'): + head_sha, head_ref = app.github.get_pr_head(pr_number) - app.display_info(f'PR #{pr_number} — branch: {head_ref}, SHA: {head_sha}') + app.display_info(f'PR #{pr_number}: branch {head_ref}, SHA {head_sha}') - with app.status('Dispatching promote workflow...'): - app.github.dispatch_workflow( - workflow_id=PROMOTE_WORKFLOW, - ref=PROMOTE_WORKFLOW_REF, - inputs={'pr_number': str(pr_number), 'head_sha': head_sha}, - ) + with app.status('Dispatching promote workflow...'): + run_details = app.github.dispatch_workflow( + workflow_id=PROMOTE_WORKFLOW, + ref=PROMOTE_WORKFLOW_REF, + inputs={'pr_number': str(pr_number), 'head_sha': head_sha}, + return_run_details=True, + ) - runs_url = ( - f'https://github.com/{app.github.repo_id}/actions/workflows/{PROMOTE_WORKFLOW}?query=event%3Aworkflow_dispatch' - ) - app.display_success(f'Promote workflow dispatched for PR #{pr_number}.') - app.display_info(f'Recent runs: {runs_url}') + if not run_details: + app.abort('Workflow dispatched but no run details were returned.') + app.display_success(f'Promote workflow dispatched for PR #{pr_number}.') + app.display_info(f'Workflow run: {run_details["html_url"]}') + finally: + httpx_logger.setLevel(previous_level) diff --git 
a/ddev/src/ddev/utils/github.py b/ddev/src/ddev/utils/github.py index ef314fb7d9fe7..bae40dc9ff23c 100644 --- a/ddev/src/ddev/utils/github.py +++ b/ddev/src/ddev/utils/github.py @@ -6,9 +6,11 @@ import json from functools import cached_property from time import time -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, overload if TYPE_CHECKING: + from typing import Any, Literal + from httpx import Client from ddev.cli.terminal import BorrowedStatus @@ -217,12 +219,48 @@ def get_pull_request_labels(self, pr_number: int) -> list[str] | None: return None return [label['name'] for label in response.json().get('labels', [])] - def dispatch_workflow(self, workflow_id: str, ref: str, inputs: dict[str, Any]) -> None: - """Trigger a workflow_dispatch event.""" - self.__api_post( + @overload + def dispatch_workflow( + self, + workflow_id: str, + ref: str, + inputs: dict[str, Any], + return_run_details: Literal[False] = False, + ) -> None: ... + + @overload + def dispatch_workflow( + self, + workflow_id: str, + ref: str, + inputs: dict[str, Any], + return_run_details: Literal[True], + ) -> dict[str, Any]: ... + + def dispatch_workflow( + self, + workflow_id: str, + ref: str, + inputs: dict[str, Any], + return_run_details: bool = False, + ) -> dict[str, Any] | None: + """Trigger a workflow_dispatch event. + + When ``return_run_details`` is true, request the new run's details from + the API and return the parsed JSON response (``workflow_run_id``, + ``run_url``, ``html_url``). The default keeps the prior fire-and-forget + behavior and returns ``None``. 
+ """ + payload: dict[str, Any] = {'ref': ref, 'inputs': inputs} + if return_run_details: + payload['return_run_details'] = True + response = self.__api_post( self.WORKFLOW_DISPATCH_API.format(repo_id=self.repo_id, workflow_id=workflow_id), - content=json.dumps({'ref': ref, 'inputs': inputs}), + content=json.dumps(payload), ) + if not return_run_details: + return None + return response.json() def get_pull_request_comments(self, pr_number: int) -> list[dict]: response = self.__api_get( diff --git a/ddev/tests/cli/dep/conftest.py b/ddev/tests/cli/dep/conftest.py new file mode 100644 index 0000000000000..8758a8cbc804c --- /dev/null +++ b/ddev/tests/cli/dep/conftest.py @@ -0,0 +1,32 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from __future__ import annotations + +import logging +from collections.abc import Generator +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + from ddev.config.file import ConfigFileWithOverrides + + +@pytest.fixture(autouse=True) +def configure_github_credentials(config_file: ConfigFileWithOverrides) -> None: + """Provide github credentials so commands that touch app.github do not abort.""" + config_file.model.github = {'user': 'test-user', 'token': 'test-token'} + config_file.save() + + +@pytest.fixture +def httpx_at_debug() -> Generator[logging.Logger, None, None]: + """Force the httpx logger to DEBUG and restore its previous level on teardown.""" + logger = logging.getLogger('httpx') + previous_level = logger.level + logger.setLevel(logging.DEBUG) + try: + yield logger + finally: + logger.setLevel(previous_level) diff --git a/ddev/tests/cli/dep/test_promote.py b/ddev/tests/cli/dep/test_promote.py new file mode 100644 index 0000000000000..2d55db0660c63 --- /dev/null +++ b/ddev/tests/cli/dep/test_promote.py @@ -0,0 +1,79 @@ +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import logging + +import pytest + +RUN_DETAILS = { + 'workflow_run_id': 999, + 'run_url': 'https://api.github.com/repos/DataDog/integrations-core/actions/runs/999', + 'html_url': 'https://github.com/DataDog/integrations-core/actions/runs/999', +} + + +def test_promote_dispatches_workflow_and_prints_run_url(ddev, mocker): + mocker.patch('ddev.utils.github.GitHubManager.get_pr_head', return_value=('deadbeef', 'feature-branch')) + dispatch = mocker.patch('ddev.utils.github.GitHubManager.dispatch_workflow', return_value=RUN_DETAILS) + + result = ddev('dep', 'promote', 'https://github.com/DataDog/integrations-core/pull/12345') + + assert result.exit_code == 0, result.output + dispatch.assert_called_once_with( + workflow_id='dependency-wheel-promotion.yaml', + ref='master', + inputs={'pr_number': '12345', 'head_sha': 'deadbeef'}, + return_run_details=True, + ) + assert 'PR #12345' in result.output + assert 'feature-branch' in result.output + assert 'deadbeef' in result.output + assert RUN_DETAILS['html_url'] in result.output + assert 'Recent runs' not in result.output + assert 'query=event%3Aworkflow_dispatch' not in result.output + + +def test_promote_invalid_pr_url_aborts(ddev): + result = ddev('dep', 'promote', 'https://example.invalid/not-a-pr') + + assert result.exit_code != 0 + assert 'Could not extract a PR number' in result.output + + +def test_promote_aborts_when_no_run_details_returned(ddev, mocker): + mocker.patch('ddev.utils.github.GitHubManager.get_pr_head', return_value=('deadbeef', 'feature-branch')) + mocker.patch('ddev.utils.github.GitHubManager.dispatch_workflow', return_value=None) + + result = ddev('dep', 'promote', 'https://github.com/DataDog/integrations-core/pull/12345') + + assert result.exit_code != 0 + assert 'no run details were returned' in result.output + assert 'Promote workflow dispatched' not in result.output + + +def 
test_promote_suppresses_httpx_logs_and_restores_level(ddev, mocker, httpx_at_debug): + captured_levels = [] + + def capture_level(*_args, **_kwargs): + captured_levels.append(httpx_at_debug.level) + return ('deadbeef', 'feature-branch') + + mocker.patch('ddev.utils.github.GitHubManager.get_pr_head', side_effect=capture_level) + mocker.patch('ddev.utils.github.GitHubManager.dispatch_workflow', return_value=RUN_DETAILS) + + result = ddev('dep', 'promote', 'https://github.com/DataDog/integrations-core/pull/12345') + + assert result.exit_code == 0, result.output + assert captured_levels == [logging.WARNING] + assert httpx_at_debug.level == logging.DEBUG + + +def test_promote_restores_httpx_log_level_on_failure(ddev, mocker, httpx_at_debug): + """Ensure the finally branch restores the previous httpx logger level even when an API call raises.""" + mocker.patch('ddev.utils.github.GitHubManager.get_pr_head', side_effect=RuntimeError('boom')) + mocker.patch('ddev.utils.github.GitHubManager.dispatch_workflow') + + with pytest.raises(RuntimeError, match='boom'): + ddev('dep', 'promote', 'https://github.com/DataDog/integrations-core/pull/12345') + + assert httpx_at_debug.level == logging.DEBUG diff --git a/ddev/tests/utils/test_github.py b/ddev/tests/utils/test_github.py index 59b67d6a97fa5..e1f103dadcb08 100644 --- a/ddev/tests/utils/test_github.py +++ b/ddev/tests/utils/test_github.py @@ -1,6 +1,8 @@ # (C) Datadog, Inc. 
2023-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +import json + import pytest from ddev.utils.github import PullRequest @@ -83,3 +85,46 @@ def test_create_label(self, network_replay, github_manager): assert label.json()['name'] == 'my_custom_label' assert label.json()['color'] == 'ff0000' + + +def test_dispatch_workflow_default_returns_none(github_manager, mocker): + """Default dispatch_workflow keeps the prior fire-and-forget behavior.""" + response = mocker.MagicMock() + api_post = mocker.patch('ddev.utils.github.GitHubManager._GitHubManager__api_post', return_value=response) + + result = github_manager.dispatch_workflow( + workflow_id='example.yaml', + ref='master', + inputs={'pr_number': '123', 'head_sha': 'deadbeef'}, + ) + + assert result is None + api_post.assert_called_once() + payload = json.loads(api_post.call_args.kwargs['content']) + assert payload == {'ref': 'master', 'inputs': {'pr_number': '123', 'head_sha': 'deadbeef'}} + assert 'return_run_details' not in payload + + +def test_dispatch_workflow_return_run_details_sends_flag_and_returns_json(github_manager, mocker): + """When return_run_details is true, the payload includes the flag and the parsed JSON is returned.""" + run_details = { + 'workflow_run_id': 42, + 'run_url': 'https://api.github.com/repos/o/r/actions/runs/42', + 'html_url': 'https://github.com/o/r/actions/runs/42', + } + response = mocker.MagicMock() + response.json.return_value = run_details + api_post = mocker.patch('ddev.utils.github.GitHubManager._GitHubManager__api_post', return_value=response) + + result = github_manager.dispatch_workflow( + workflow_id='example.yaml', + ref='master', + inputs={'pr_number': '123', 'head_sha': 'deadbeef'}, + return_run_details=True, + ) + + assert result == run_details + payload = json.loads(api_post.call_args.kwargs['content']) + assert payload['return_run_details'] is True + assert payload['ref'] == 'master' + assert payload['inputs'] == 
{'pr_number': '123', 'head_sha': 'deadbeef'} From 3345d4d211fa6f28c5be57d5325dfc52130502ff Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Wed, 27 May 2026 13:10:17 +0100 Subject: [PATCH 2/4] Pin coverage datadog action to the latest one (#23845) --- .github/workflows/master-windows.yml | 2 +- .github/workflows/master.yml | 88 ++++++------ .github/workflows/pr-all-windows.yml | 72 +++++----- .github/workflows/pr-all.yml | 78 +++++------ .github/workflows/pr-test.yml | 74 +++++----- .github/workflows/test-fips-e2e.yml | 201 +++++++++++++-------------- 6 files changed, 257 insertions(+), 258 deletions(-) diff --git a/.github/workflows/master-windows.yml b/.github/workflows/master-windows.yml index 7f858efd91012..e1cdd3c037d71 100644 --- a/.github/workflows/master-windows.yml +++ b/.github/workflows/master-windows.yml @@ -102,7 +102,7 @@ jobs: - name: Upload coverage to Datadog if: always() continue-on-error: true - uses: DataDog/coverage-upload-github-action@9bbbf86d16f7db1b14c5b885e61cf0d96053686a # v1.0.0 + uses: DataDog/coverage-upload-github-action@6c4bd935248daa6f0ef94e3e6ba71ad5ad079998 # v1.0.3 with: api_key: ${{ secrets.DD_API_KEY }} files: coverage-reports diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 7fcc2f903aaf0..45786b66173ae 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -3,27 +3,27 @@ name: Master on: push: branches: - - master + - master paths: # List of files/paths that should trigger the run. 
The intention is to avoid running all tests if the commit only includes changes on assets or README - - '*/datadog_checks/**' - - '*/tests/**' - - 'ddev/**' - - 'datadog_checks_base/**' - - 'datadog_checks_dev/**' - # Contains overrides for testing - - '.ddev/**' - # Want to ensure any change in workflows is validated - - '.github/workflows/**' - # Test matrices and dependencies - - '*/hatch.toml' - - '*/pyproject.toml' - # Some integrations might use this file to validate metrics emission - - '*/metadata.csv' - # In case some linting formatting config has changed - - 'pyproject.toml' + - "*/datadog_checks/**" + - "*/tests/**" + - "ddev/**" + - "datadog_checks_base/**" + - "datadog_checks_dev/**" + # Contains overrides for testing + - ".ddev/**" + # Want to ensure any change in workflows is validated + - ".github/workflows/**" + # Test matrices and dependencies + - "*/hatch.toml" + - "*/pyproject.toml" + # Some integrations might use this file to validate metrics emission + - "*/metadata.csv" + # In case some linting formatting config has changed + - "pyproject.toml" schedule: - - cron: '0 2 * * *' + - cron: "0 2 * * *" jobs: cache: @@ -31,7 +31,7 @@ jobs: test: needs: - - cache + - cache uses: ./.github/workflows/test-all.yml with: @@ -48,12 +48,12 @@ jobs: secrets: inherit permissions: - # needed for compute-matrix in test-target.yml - contents: read + # needed for compute-matrix in test-target.yml + contents: read publish-test-results: needs: - - test + - test if: success() || failure() concurrency: @@ -69,7 +69,7 @@ jobs: upload-coverage: needs: - - test + - test if: > !github.event.repository.private && (success() || failure()) @@ -80,27 +80,27 @@ jobs: contents: read steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Download all coverage artifacts - uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 - with: - 
pattern: coverage-* - path: coverage-reports - merge-multiple: false + - name: Download all coverage artifacts + uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + with: + pattern: coverage-* + path: coverage-reports + merge-multiple: false - - name: Upload coverage to Codecov - uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de - with: - use_oidc: true - directory: coverage-reports - fail_ci_if_error: false + - name: Upload coverage to Codecov + uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de + with: + use_oidc: true + directory: coverage-reports + fail_ci_if_error: false - - name: Upload coverage to Datadog - if: always() - continue-on-error: true - uses: DataDog/coverage-upload-github-action@9bbbf86d16f7db1b14c5b885e61cf0d96053686a # v1.0.0 - with: - api_key: ${{ secrets.DD_API_KEY }} - files: coverage-reports - format: cobertura + - name: Upload coverage to Datadog + if: always() + continue-on-error: true + uses: DataDog/coverage-upload-github-action@6c4bd935248daa6f0ef94e3e6ba71ad5ad079998 # v1.0.3 + with: + api_key: ${{ secrets.DD_API_KEY }} + files: coverage-reports + format: cobertura diff --git a/.github/workflows/pr-all-windows.yml b/.github/workflows/pr-all-windows.yml index 8f1d9c0e34268..a3a5c824a1d7c 100644 --- a/.github/workflows/pr-all-windows.yml +++ b/.github/workflows/pr-all-windows.yml @@ -5,17 +5,17 @@ name: PR All Windows on: pull_request: paths: - - datadog_checks_base/datadog_checks/** - - datadog_checks_dev/datadog_checks/dev/*.py - - ddev/src/** - - "!agent_requirements.in" - # Also run if we modify the workflow files - - '.github/workflows/pr-all-windows.yml' - - '.github/workflows/test-target.yml' - - '.github/workflows/test-all-windows.yml' - # Also run in the action to install test-target scripts changes - - '.github/actions/setup-test-target-scripts/**' - - '.github/actions/setup-ddev/**' + - datadog_checks_base/datadog_checks/** + - 
datadog_checks_dev/datadog_checks/dev/*.py + - ddev/src/** + - "!agent_requirements.in" + # Also run if we modify the workflow files + - ".github/workflows/pr-all-windows.yml" + - ".github/workflows/test-target.yml" + - ".github/workflows/test-all-windows.yml" + # Also run in the action to install test-target scripts changes + - ".github/actions/setup-test-target-scripts/**" + - ".github/actions/setup-ddev/**" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref }} @@ -26,8 +26,8 @@ jobs: uses: ./.github/workflows/test-all-windows.yml permissions: - # needed for compute-matrix in test-target.yml - contents: read + # needed for compute-matrix in test-target.yml + contents: read with: repo: core @@ -39,14 +39,14 @@ jobs: save-event: needs: - - test + - test if: success() || failure() uses: ./.github/workflows/save-event.yml upload-coverage: needs: - - test + - test if: > !github.event.repository.private && (success() || failure()) @@ -57,27 +57,27 @@ jobs: contents: read steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Download all coverage artifacts - uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 - with: - pattern: coverage-* - path: coverage-reports - merge-multiple: false + - name: Download all coverage artifacts + uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + with: + pattern: coverage-* + path: coverage-reports + merge-multiple: false - - name: Upload coverage to Codecov - uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de - with: - use_oidc: true - directory: coverage-reports - fail_ci_if_error: false + - name: Upload coverage to Codecov + uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de + with: + use_oidc: true + directory: coverage-reports + fail_ci_if_error: false - - name: Upload 
coverage to Datadog - if: always() - continue-on-error: true - uses: DataDog/coverage-upload-github-action@9bbbf86d16f7db1b14c5b885e61cf0d96053686a # v1.0.0 - with: - api_key: ${{ secrets.DD_API_KEY }} - files: coverage-reports - format: cobertura + - name: Upload coverage to Datadog + if: always() + continue-on-error: true + uses: DataDog/coverage-upload-github-action@6c4bd935248daa6f0ef94e3e6ba71ad5ad079998 # v1.0.3 + with: + api_key: ${{ secrets.DD_API_KEY }} + files: coverage-reports + format: cobertura diff --git a/.github/workflows/pr-all.yml b/.github/workflows/pr-all.yml index 9ea6dce99667e..fb9fe8ca4cb30 100644 --- a/.github/workflows/pr-all.yml +++ b/.github/workflows/pr-all.yml @@ -3,20 +3,20 @@ name: PR All on: pull_request: paths: - - datadog_checks_base/datadog_checks/** - - datadog_checks_base/pyproject.toml - - datadog_checks_dev/datadog_checks/dev/*.py - - datadog_checks_dev/pyproject.toml - - ddev/src/** - - ddev/pyproject.toml - - "!agent_requirements.in" - # Also run if we modify the workflow files - - '.github/workflows/pr-all.yml' - - '.github/workflows/test-target.yml' - - '.github/workflows/test-all.yml' - # Also run if the action to install test-target scripts changes - - '.github/actions/setup-test-target-scripts/**' - - '.github/actions/setup-ddev/**' + - datadog_checks_base/datadog_checks/** + - datadog_checks_base/pyproject.toml + - datadog_checks_dev/datadog_checks/dev/*.py + - datadog_checks_dev/pyproject.toml + - ddev/src/** + - ddev/pyproject.toml + - "!agent_requirements.in" + # Also run if we modify the workflow files + - ".github/workflows/pr-all.yml" + - ".github/workflows/test-target.yml" + - ".github/workflows/test-all.yml" + # Also run if the action to install test-target scripts changes + - ".github/actions/setup-test-target-scripts/**" + - ".github/actions/setup-ddev/**" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref }} @@ -27,8 +27,8 @@ jobs: uses: 
./.github/workflows/test-all.yml permissions: - # needed for compute-matrix in test-target.yml - contents: read + # needed for compute-matrix in test-target.yml + contents: read with: repo: core @@ -42,14 +42,14 @@ jobs: save-event: needs: - - test + - test if: success() || failure() uses: ./.github/workflows/save-event.yml upload-coverage: needs: - - test + - test if: > !github.event.repository.private && (success() || failure()) @@ -60,27 +60,27 @@ jobs: contents: read steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Download all coverage artifacts - uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 - with: - pattern: coverage-* - path: coverage-reports - merge-multiple: false + - name: Download all coverage artifacts + uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + with: + pattern: coverage-* + path: coverage-reports + merge-multiple: false - - name: Upload coverage to Codecov - uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de - with: - use_oidc: true - directory: coverage-reports - fail_ci_if_error: false + - name: Upload coverage to Codecov + uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de + with: + use_oidc: true + directory: coverage-reports + fail_ci_if_error: false - - name: Upload coverage to Datadog - if: always() - continue-on-error: true - uses: DataDog/coverage-upload-github-action@9bbbf86d16f7db1b14c5b885e61cf0d96053686a # v1.0.0 - with: - api_key: ${{ secrets.DD_API_KEY }} - files: coverage-reports - format: cobertura + - name: Upload coverage to Datadog + if: always() + continue-on-error: true + uses: DataDog/coverage-upload-github-action@6c4bd935248daa6f0ef94e3e6ba71ad5ad079998 # v1.0.3 + with: + api_key: ${{ secrets.DD_API_KEY }} + files: coverage-reports + format: cobertura diff --git a/.github/workflows/pr-test.yml 
b/.github/workflows/pr-test.yml index 8f3a14ece990e..824471469d4dc 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -33,7 +33,7 @@ jobs: test: needs: - - compute-matrix + - compute-matrix if: needs.compute-matrix.outputs.matrix != '[]' && github.event_name != 'merge_group' strategy: fail-fast: false @@ -64,7 +64,7 @@ jobs: test-minimum-base-package: needs: - - compute-matrix + - compute-matrix if: needs.compute-matrix.outputs.matrix != '[]' && github.event_name != 'merge_group' strategy: fail-fast: false @@ -96,16 +96,16 @@ jobs: save-event: needs: - - test - - test-minimum-base-package + - test + - test-minimum-base-package if: success() || failure() uses: ./.github/workflows/save-event.yml upload-coverage: needs: - - test - - test-minimum-base-package + - test + - test-minimum-base-package if: > !github.event.repository.private && (success() || failure()) && @@ -117,35 +117,35 @@ jobs: contents: read steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - - name: Download all coverage artifacts - uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 - with: - pattern: coverage-* - path: coverage-reports - merge-multiple: false - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de - with: - use_oidc: true - directory: coverage-reports - fail_ci_if_error: false - - - name: Upload coverage to Datadog - if: always() - continue-on-error: true - uses: DataDog/coverage-upload-github-action@9bbbf86d16f7db1b14c5b885e61cf0d96053686a # v1.0.0 - with: - api_key: ${{ secrets.DD_API_KEY }} - files: coverage-reports - format: cobertura + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Download all coverage artifacts + uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + with: + pattern: coverage-* + path: coverage-reports + merge-multiple: false + + - name: 
Upload coverage to Codecov + uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de + with: + use_oidc: true + directory: coverage-reports + fail_ci_if_error: false + + - name: Upload coverage to Datadog + if: always() + continue-on-error: true + uses: DataDog/coverage-upload-github-action@6c4bd935248daa6f0ef94e3e6ba71ad5ad079998 # v1.0.3 + with: + api_key: ${{ secrets.DD_API_KEY }} + files: coverage-reports + format: cobertura check: needs: - - test - - test-minimum-base-package + - test + - test-minimum-base-package # In integrations-core and integrations-extras repos the tests are flaky enough that # it would be a pain to merge PRs with the Merge Queue enabled. # While we work on the tests, we skip the job if it's triggered by Merge Queue. @@ -154,8 +154,8 @@ jobs: runs-on: ubuntu-latest steps: - - name: Check status of required jobs - uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # v1.2.2 - with: - jobs: ${{ toJSON(needs) }} - allowed-skips: test, test-minimum-base-package + - name: Check status of required jobs + uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # v1.2.2 + with: + jobs: ${{ toJSON(needs) }} + allowed-skips: test, test-minimum-base-package diff --git a/.github/workflows/test-fips-e2e.yml b/.github/workflows/test-fips-e2e.yml index 1035573ea9424..23ae8619da71f 100644 --- a/.github/workflows/test-fips-e2e.yml +++ b/.github/workflows/test-fips-e2e.yml @@ -17,10 +17,10 @@ on: type: string pull_request: paths: - - datadog_checks_base/datadog_checks/** - - datadog_checks_base/pyproject.toml + - datadog_checks_base/datadog_checks/** + - datadog_checks_base/pyproject.toml schedule: - - cron: '0 0,8,16 * * *' + - cron: "0 0,8,16 * * *" defaults: run: @@ -43,103 +43,102 @@ jobs: DD_TRACE_ANALYTICS_ENABLED: "true" permissions: - # needed for dd-sts and codecov in test-target.yml, allows the action to get a JWT signed by Github - id-token: write - # needed for compute-matrix in test-target.yml - 
contents: read + # needed for dd-sts and codecov in test-target.yml, allows the action to get a JWT signed by Github + id-token: write + # needed for compute-matrix in test-target.yml + contents: read steps: - - - name: Set environment variables with sanitized paths - run: | - JOB_NAME="test-fips-e2e" - - echo "TEST_RESULTS_DIR=$TEST_RESULTS_BASE_DIR/$JOB_NAME" >> $GITHUB_ENV - echo "TRACE_CAPTURE_FILE=$TRACE_CAPTURE_BASE_DIR/$JOB_NAME" >> $GITHUB_ENV - - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - - name: Set up Python ${{ env.PYTHON_VERSION }} - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: "${{ env.PYTHON_VERSION }}" - - - name: Get Datadog credentials - id: dd-sts - uses: DataDog/dd-sts-action@2e8187910199bd93129520183c093e19aa585c75 # v1.0.0 - with: - policy: integrations-core-api-key - - - name: Install ddev from local folder - uses: ./.github/actions/setup-ddev - with: - install-mode: local - cache-profile: local-ddev-base - - - name: Configure ddev - run: |- - ddev config set upgrade_check false - ddev config set repos.core . 
- ddev config set repo core - - - name: Prepare for testing - env: - PYTHONUNBUFFERED: "1" - DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} - DOCKER_ACCESS_TOKEN: ${{ secrets.DOCKER_ACCESS_TOKEN }} - ORACLE_DOCKER_USERNAME: ${{ secrets.ORACLE_DOCKER_USERNAME }} - ORACLE_DOCKER_PASSWORD: ${{ secrets.ORACLE_DOCKER_PASSWORD }} - DD_GITHUB_USER: ${{ github.actor }} - DD_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: ddev ci setup ${{ inputs.target || 'tls' }} - - - name: Run E2E tests with FIPS disabled - env: - DDEV_E2E_AGENT: "${{ inputs.agent-image || 'registry.datadoghq.com/agent-dev:master-py3' }}" - DD_API_KEY: "${{ steps.dd-sts.outputs.api_key }}" - run: | - ddev env test --base --new-env --junit ${{ inputs.target || 'tls' }} -- all -m "fips_off" - - - name: Run E2E tests with FIPS enabled - env: - DDEV_E2E_AGENT: "${{ inputs.agent-image-fips || 'registry.datadoghq.com/agent-dev:master-fips' }}" - DD_API_KEY: "${{ steps.dd-sts.outputs.api_key }}" - run: | - ddev env test --base --new-env --junit ${{ inputs.target || 'tls' }} -- all -k "fips_on" - - - name: Finalize test results - if: always() - run: |- - mkdir -p "${{ env.TEST_RESULTS_DIR }}" - if [[ -d ${{ inputs.target || 'tls' }}/junit ]]; then - mv ${{ inputs.target || 'tls' }}/junit/*.xml "${{ env.TEST_RESULTS_DIR }}" - fi - - - name: Upload test results - if: always() - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 - with: - name: "test-results-${{ inputs.target || 'tls' }}" - path: "${{ env.TEST_RESULTS_BASE_DIR }}" - - - name: Upload coverage data - if: > - !github.event.repository.private && - always() - uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de - with: - use_oidc: true - files: "${{ inputs.target || 'tls' }}/coverage.xml" - flags: "${{ inputs.target || 'tls' }}" - - - name: Upload coverage to Datadog - if: > - !github.event.repository.private && - always() - continue-on-error: true - uses: 
DataDog/coverage-upload-github-action@9bbbf86d16f7db1b14c5b885e61cf0d96053686a # v1.0.0 - with: - api_key: ${{ secrets.DD_API_KEY }} - files: "${{ inputs.target || 'tls' }}/coverage.xml" - format: cobertura - flags: "${{ inputs.target || 'tls' }}" + - name: Set environment variables with sanitized paths + run: | + JOB_NAME="test-fips-e2e" + + echo "TEST_RESULTS_DIR=$TEST_RESULTS_BASE_DIR/$JOB_NAME" >> $GITHUB_ENV + echo "TRACE_CAPTURE_FILE=$TRACE_CAPTURE_BASE_DIR/$JOB_NAME" >> $GITHUB_ENV + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "${{ env.PYTHON_VERSION }}" + + - name: Get Datadog credentials + id: dd-sts + uses: DataDog/dd-sts-action@2e8187910199bd93129520183c093e19aa585c75 # v1.0.0 + with: + policy: integrations-core-api-key + + - name: Install ddev from local folder + uses: ./.github/actions/setup-ddev + with: + install-mode: local + cache-profile: local-ddev-base + + - name: Configure ddev + run: |- + ddev config set upgrade_check false + ddev config set repos.core . 
+ ddev config set repo core + + - name: Prepare for testing + env: + PYTHONUNBUFFERED: "1" + DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} + DOCKER_ACCESS_TOKEN: ${{ secrets.DOCKER_ACCESS_TOKEN }} + ORACLE_DOCKER_USERNAME: ${{ secrets.ORACLE_DOCKER_USERNAME }} + ORACLE_DOCKER_PASSWORD: ${{ secrets.ORACLE_DOCKER_PASSWORD }} + DD_GITHUB_USER: ${{ github.actor }} + DD_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: ddev ci setup ${{ inputs.target || 'tls' }} + + - name: Run E2E tests with FIPS disabled + env: + DDEV_E2E_AGENT: "${{ inputs.agent-image || 'registry.datadoghq.com/agent-dev:master-py3' }}" + DD_API_KEY: "${{ steps.dd-sts.outputs.api_key }}" + run: | + ddev env test --base --new-env --junit ${{ inputs.target || 'tls' }} -- all -m "fips_off" + + - name: Run E2E tests with FIPS enabled + env: + DDEV_E2E_AGENT: "${{ inputs.agent-image-fips || 'registry.datadoghq.com/agent-dev:master-fips' }}" + DD_API_KEY: "${{ steps.dd-sts.outputs.api_key }}" + run: | + ddev env test --base --new-env --junit ${{ inputs.target || 'tls' }} -- all -k "fips_on" + + - name: Finalize test results + if: always() + run: |- + mkdir -p "${{ env.TEST_RESULTS_DIR }}" + if [[ -d ${{ inputs.target || 'tls' }}/junit ]]; then + mv ${{ inputs.target || 'tls' }}/junit/*.xml "${{ env.TEST_RESULTS_DIR }}" + fi + + - name: Upload test results + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: "test-results-${{ inputs.target || 'tls' }}" + path: "${{ env.TEST_RESULTS_BASE_DIR }}" + + - name: Upload coverage data + if: > + !github.event.repository.private && + always() + uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de + with: + use_oidc: true + files: "${{ inputs.target || 'tls' }}/coverage.xml" + flags: "${{ inputs.target || 'tls' }}" + + - name: Upload coverage to Datadog + if: > + !github.event.repository.private && + always() + continue-on-error: true + uses: 
DataDog/coverage-upload-github-action@6c4bd935248daa6f0ef94e3e6ba71ad5ad079998 # v1.0.3 + with: + api_key: ${{ secrets.DD_API_KEY }} + files: "${{ inputs.target || 'tls' }}/coverage.xml" + format: cobertura + flags: "${{ inputs.target || 'tls' }}" From b5362c71c254c4a65f7acfb036522dd7e5f893b0 Mon Sep 17 00:00:00 2001 From: dkirov-dd <166512750+dkirov-dd@users.noreply.github.com> Date: Wed, 27 May 2026 14:21:12 +0200 Subject: [PATCH 3/4] feat(downloader): add TUFPointerDownloader for v2 pointer-file format (#23144) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(downloader): add TUFPointerDownloader for v2 pointer-file format The new agent-integrations-tuf pipeline produces TUF targets as JSON pointer files (targets/{project}/{version}.json) rather than the old HTML simple index + in-toto approach. This commit adds: - TUFPointerDownloader in download_v2.py: TUF-verifies the pointer file, then fetches and sha256-verifies the wheel from S3. - DigestMismatch exception for sha256/length failures. - --format v2 CLI flag: routes through TUFPointerDownloader. --unsafe-disable-verification carries forward; --type and --ignore-python-version are no-ops in v2 with a warning. - 8 offline unit tests covering happy path, missing target, digest mismatch, length mismatch, and disable_verification mode. Co-Authored-By: Claude Sonnet 4.6 * fix(downloader): use --repository URL for wheel fetch, not pointer's baked value The pointer file always contains the prod S3 repository URL. When validating staging, the caller passes --repository to point at the staging bucket; that URL should be used for both the TUF metadata fetch AND the wheel download, not just the metadata. Adds a test that asserts the wheel is fetched from the caller-supplied URL even when the pointer contains a different (prod) repository value. 
Co-Authored-By: Claude Sonnet 4.6 * refactor(downloader): resolve latest via S3 listing, drop latest.json reliance Replace the ``latest.json`` rolling pointer fetch with an S3 ``ListObjectsV2`` walk over ``targets/{project}/``: filter keys to PEP 440 stable versions and pick the maximum. The chosen version is then fetched through TUF as before, so the pointer file the client trusts is still cryptographically verified. Why list S3 instead of parsing the signed targets metadata: once ``path_hash_prefixes`` delegations are in use, a client cannot tell from metadata alone which delegation signs the latest version of a given project. Listing the bucket sidesteps that — TUF still authoritatively verifies the chosen version's pointer. The publisher counterpart in agent-integrations-tuf drops ``latest.json`` entirely; see DataDog/agent-integrations-tuf PR #9. - ``_resolve_latest_version`` lists ``targets/{project}/`` via the S3 REST API (no boto3 dep), parses the XML response, follows the continuation-token pagination, and applies a PEP 440 stable filter - ``get_pointer(project, version=None)`` resolves ``version`` itself before delegating to the TUF Updater - 6 new offline tests cover max-version selection, pre-release/dev filtering, post-release support, the no-stable error, paginated listings, and non-pointer key skipping Co-Authored-By: Claude Opus 4.7 (1M context) * Revert "refactor(downloader): resolve latest via S3 listing, drop latest.json reliance" This reverts commit 70688d8d5971d7d0b41f284e4d7ffea75b8c231e. 
* feat(downloader): bundle 1.root.json; rename --format to --index; drop --root-json - Bundle metadata/root_history/1.root.json from agent-integrations-tuf as a package resource; TUFPointerDownloader loads it via importlib.resources — no TOFU, no --root-json flag needed - Rename --format v2 to --index (boolean flag); v1 remains the default when --index is absent - Remove trust_anchor parameter from TUFPointerDownloader.__init__ - Drop --format and --root-json from instantiate_downloader (v1 path) - Register 1.root.json as a wheel artifact in pyproject.toml - Update tests to match new interface Co-Authored-By: Claude Sonnet 4.6 (1M context) * fix(downloader): rename --index to --v2 Co-Authored-By: Claude Sonnet 4.6 (1M context) * feat(downloader): default to v2 with v1 fallback; add prod URL constant Without any flag the downloader now attempts v2 (against the prod S3 bucket) and falls back to v1 on any failure, so callers get the new format automatically without code changes. Passing --v2 explicitly keeps the strict v2 path with no fallback (used by the pipeline's validate-staging step). V2_REPOSITORY_URL is the prod bucket constant used for the default repository value in _download_v2(); callers can still override it with --repository. Co-Authored-By: Claude Sonnet 4.6 (1M context) * feat(downloader): resolve hash-prefixed targets via N.targets.json The v2 TUF repository uses consistent-snapshot format: pointer files are stored as {sha256}.{version}.json on S3. Two changes to support this: 1. _make_updater now sets UpdaterConfig(prefix_targets_with_hash=True) so the TUF Updater resolves hash-prefixed paths automatically when calling download_target(). 2. get_pointer() now parses N.targets.json (after Updater.refresh()) to enumerate available versions for the project. This replaces the removed latest.json: when version=None, _resolve_version() scans all {project}/{version}.json entries in targets metadata and returns the highest stable PEP 440 version. 
The disable_verification path fetches the metadata chain (timestamp → snapshot → targets) without verifying signatures to find the hash-prefixed URL, then fetches the pointer directly. Co-Authored-By: Claude Sonnet 4.6 (1M context) * feat(downloader): resolve latest via latest pointer target * Move v2 TUF root metadata * Simplify v2 downloader implementation * feat(downloader): add MissingVersion and MalformedPointerError exceptions Dedicated types replace the prior reuse of TargetNotFoundError for argument validation (which mislabeled the failure category) and the unchecked KeyError raised on a malformed pointer JSON. Co-Authored-By: Claude Opus 4.7 * fix(downloader): harden v2 wheel fetch and pointer handling - Add explicit 60s timeout to urllib.request.urlopen so a stalled wheel fetch does not hang the Agent installer indefinitely. - Validate required pointer JSON keys (digest, length, wheel_path) and raise the new MalformedPointerError instead of an opaque KeyError. - Raise MissingVersion (a CLIError subclass) when --unsafe-disable-verification is set without --version, so the v1 fallback log reports the actual cause instead of "target not found". - Extract _verify_content to drop the pointer-is-None sentinel and make the verified and direct-download branches structurally parallel. - Add `from __future__ import annotations` so the PEP 604 unions stay compatible with the declared requires-python = ">=3.8". - Move logging.basicConfig out of the constructor and into the CLI entry point (separate commit); the class no longer mutates the root logger. Co-Authored-By: Claude Opus 4.7 * fix(downloader): make v2/v1 fallback handle validation errors and --force - Split _download_v2() into instantiate_v2_downloader() and run_v2_downloader() to mirror the v1 instantiate/run split and let the warning/validation branches be tested without patching sys.argv. 
- Re-raise user-input errors (CLIError, MissingVersion) before the broad except so they propagate as-is instead of triggering a spurious v1 retry and a misleading "v2 download failed" log line. - Add --force as a no-op compat stub on the v2 parser so v1-only callers do not trip parse_args -> SystemExit and silently skip the fallback. - Hoist `import logging` to module top (was lazy-imported in the except block) and own the verbose-to-level + logging.basicConfig setup that used to live inside TUFPointerDownloader.__init__. - Drop the meaningless `--v2 default=True` re-declaration; rename underscore-prefixed argparse dests to plain names. - Note in the fallback block that v1 offline tests now traverse v2 first on every invocation. Co-Authored-By: Claude Opus 4.7 * test(downloader): broaden v2 coverage and parametrize failure categories - Parametrize _v2_failure_category across all five (exc, category) cases and add DownloadError / TimeoutError coverage that the categorizer already handles but previous tests never asserted. - Replace direct calls to TUFPointerDownloader._target_path with a parametrized test that drives get_pointer and asserts on Updater.get_targetinfo so the behavior, not the private helper, is what's pinned. - Add failure-mode tests for malformed pointer JSON (one per required key), urllib HTTPError/URLError mid-download, and wheel_path without a leading slash so the URL-composition contract is visible. - Update test_direct_download_requires_explicit_version to expect MissingVersion now that argument-validation no longer reuses TargetNotFoundError. - Move @pytest.mark.offline from each class to a module-level pytestmark; drop the leading-underscore prefix on module constants to match AGENTS.md style. Co-Authored-By: Claude Opus 4.7 * style(downloader): sort test imports per project ruff config Ruff in CI uses the root ../pyproject.toml which treats datadog_checks as first-party. Reorder the test imports to match. 
Co-Authored-By: Claude Opus 4.7 * refactor(downloader): address PR #23144 review feedback - exceptions.py: type-hint MalformedPointerError/DigestMismatch __init__; add LengthMismatch (split from the overloaded DigestMismatch). - download_v2.py: drop underscore from WHEEL_FETCH_TIMEOUT_SECONDS and REQUIRED_POINTER_KEYS per AGENTS.md; validate wheel_path leading slash via MalformedPointerError; verify length first (cheap early-out) before the sha256 digest check. - cli.py: add type hints on download(), _v2_parser(), instantiate_v2_downloader(), run_v2_downloader(); drop the unused _args parameter from run_v2_downloader; collapse the redundant (CLIError, MissingVersion) except clause to just CLIError. - test_v2_downloader.py: assert MalformedPointerError when wheel_path lacks a leading slash; split TestLengthMismatch from TestDigestMismatch; cover instantiate_v2_downloader validation/warning branches and the cli.download() v2-then-v1 fallback orchestration; drop the inline Updater patch in TestDisableVerification in favour of the fixture. * Fix v2 downloader blockers: narrow fallback, future import Narrow the v1 fallback in download() to a tuple of network/lookup errors. Previously every non-CLIError exception triggered v1 retry, including DigestMismatch / LengthMismatch / MalformedPointerError — i.e. integrity failures the v2 path is meant to surface were silently masked. Now those propagate; only TargetNotFoundError, DownloadError, TimeoutError, and urllib.error.URLError fall back. Add `from __future__ import annotations` to cli.py: the new module uses PEP 604 unions and PEP 585 subscripted generics at definition time, which crash on Python 3.8/3.9 (pyproject.toml declares requires-python = ">=3.8"). download_v2.py already had the import. Add parametrized test pinning the new behavior — DigestMismatch, LengthMismatch, and MalformedPointerError propagate without invoking the v1 downloader. 
Other review feedback (refactor download(), gate compat warnings on --v2, validate pointer field types, split download() into verified / direct, etc.) is deferred to a follow-up to keep this PR focused. * Preserve v1 downloader fallback behavior * Format v2 downloader tests * Add v2 downloader reviewer test coverage * Reuse v2 downloader test wheel name * Restore unsafe v1 fallback regression test --------- Co-authored-by: Claude Sonnet 4.6 --- .../changelog.d/23144.added | 1 + .../datadog_checks/downloader/cli.py | 112 +++++- .../downloader/data/v2/metadata/root.json | 191 +++++++++++ .../datadog_checks/downloader/download_v2.py | 140 ++++++++ .../datadog_checks/downloader/exceptions.py | 39 +++ datadog_checks_downloader/pyproject.toml | 3 + datadog_checks_downloader/tests/test_unit.py | 21 ++ .../tests/test_v2_downloader.py | 319 ++++++++++++++++++ 8 files changed, 822 insertions(+), 4 deletions(-) create mode 100644 datadog_checks_downloader/changelog.d/23144.added create mode 100644 datadog_checks_downloader/datadog_checks/downloader/data/v2/metadata/root.json create mode 100644 datadog_checks_downloader/datadog_checks/downloader/download_v2.py create mode 100644 datadog_checks_downloader/tests/test_v2_downloader.py diff --git a/datadog_checks_downloader/changelog.d/23144.added b/datadog_checks_downloader/changelog.d/23144.added new file mode 100644 index 0000000000000..2b3e21333eff5 --- /dev/null +++ b/datadog_checks_downloader/changelog.d/23144.added @@ -0,0 +1 @@ +Add v2 TUF pointer downloader support. diff --git a/datadog_checks_downloader/datadog_checks/downloader/cli.py b/datadog_checks_downloader/datadog_checks/downloader/cli.py index be3776c3d3682..8cdd29f44af20 100644 --- a/datadog_checks_downloader/datadog_checks/downloader/cli.py +++ b/datadog_checks_downloader/datadog_checks/downloader/cli.py @@ -2,16 +2,30 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +from __future__ import annotations # 1st party. 
import argparse +import logging import os import re import sys +import urllib.error # 2nd party. +from tuf.api.exceptions import DownloadError + from .download import DEFAULT_ROOT_LAYOUT_TYPE, REPOSITORY_URL_PREFIX, ROOT_LAYOUTS, TUFDownloader -from .exceptions import NonCanonicalVersion, NonDatadogPackage +from .download_v2 import V2_REPOSITORY_URL, TUFPointerDownloader +from .exceptions import CLIError, MissingVersion, NonCanonicalVersion, NonDatadogPackage, TargetNotFoundError + +V2_FALLBACK_ERRORS: tuple[type[BaseException], ...] = ( + MissingVersion, + TargetNotFoundError, + DownloadError, + TimeoutError, + urllib.error.URLError, +) # Private module functions. @@ -25,6 +39,14 @@ def __is_canonical(version): return re.match(P, version) is not None +def _v2_failure_category(exc: Exception) -> str: + if isinstance(exc, TargetNotFoundError): + return 'target version not found' + if isinstance(exc, (DownloadError, TimeoutError, urllib.error.URLError)): + return 'network error' + return 'other' + + def __find_shipped_integrations(): # Recurse up from site-packages until we find the Agent root directory. # The relative path differs between operating systems. @@ -142,6 +164,88 @@ def run_downloader(tuf_downloader, standard_distribution_name, version, ignore_p # Public module functions. -def download(): - tuf_downloader, standard_distribution_name, version, ignore_python_version = instantiate_downloader() - run_downloader(tuf_downloader, standard_distribution_name, version, ignore_python_version) +def download() -> None: + downloader, name, version, args = instantiate_v2_downloader() + + if args.v2: + warn_v2_ignored_args(args) + run_v2_downloader(downloader, name, version) + return + + try: + run_v2_downloader(downloader, name, version) + except V2_FALLBACK_ERRORS as exc: + # Integrity failures (DigestMismatch / LengthMismatch / MalformedPointerError) are + # intentionally not in V2_FALLBACK_ERRORS — they must propagate, not be masked by v1. 
+ logging.getLogger(__name__).info( + 'v2 download failed (%s, %s: %s), falling back to v1', + _v2_failure_category(exc), + type(exc).__name__, + exc, + ) + run_downloader(*instantiate_downloader()) + except CLIError: + # NonDatadogPackage and NonCanonicalVersion: v1 would raise the same. + raise + + +def _v2_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + parser.add_argument( + 'standard_distribution_name', + type=str, + help='Standard distribution name of the desired Datadog check, e.g. datadog-postgres.', + ) + parser.add_argument( + '--repository', type=str, default=V2_REPOSITORY_URL, help='HTTPS base URL of the v2 TUF repository.' + ) + parser.add_argument('--version', type=str, default=None, help='Version to download (default: latest stable).') + parser.add_argument( + '--unsafe-disable-verification', + action='store_true', + help='Disable TUF verification and wheel digest checks; requires --version and downloads /wheels directly.', + ) + parser.add_argument('-v', '--verbose', action='count', default=0) + parser.add_argument('--v2', action='store_true', default=False) + + # v1 compat flags accepted as no-ops so callers upgrading from v1 get a warning, not an error. 
+ parser.add_argument('--type', type=str, default=None, dest='ignored_type') + parser.add_argument('--ignore-python-version', action='store_true', dest='ignored_ignore_python_version') + parser.add_argument('--force', action='store_true', dest='ignored_force') + + return parser + + +def warn_v2_ignored_args(args: argparse.Namespace) -> None: + if args.ignored_type is not None: + sys.stderr.write('WARNING: --type is not applicable with --v2 and will be ignored.\n') + if args.ignored_ignore_python_version: + sys.stderr.write( + 'NOTE: --ignore-python-version is not applicable with --v2 (wheel selection happens at publish time).\n' + ) + + +def instantiate_v2_downloader() -> tuple[TUFPointerDownloader, str, str | None, argparse.Namespace]: + args = _v2_parser().parse_args() + + if not args.standard_distribution_name.startswith('datadog-'): + raise NonDatadogPackage(args.standard_distribution_name) + + if args.version and not __is_canonical(args.version): + raise NonCanonicalVersion(args.version) + + remainder = min(args.verbose, 5) % 6 + level = (6 - remainder) * 10 + logging.basicConfig(format='%(levelname)-8s: %(message)s', level=level) + + downloader = TUFPointerDownloader( + repository_url=args.repository, + disable_verification=args.unsafe_disable_verification, + ) + return downloader, args.standard_distribution_name, args.version, args + + +def run_v2_downloader(downloader: TUFPointerDownloader, name: str, version: str | None) -> None: + wheel_path = downloader.download(name, version=version) + print(wheel_path) # pylint: disable=print-statement diff --git a/datadog_checks_downloader/datadog_checks/downloader/data/v2/metadata/root.json b/datadog_checks_downloader/datadog_checks/downloader/data/v2/metadata/root.json new file mode 100644 index 0000000000000..e053044657c99 --- /dev/null +++ b/datadog_checks_downloader/datadog_checks/downloader/data/v2/metadata/root.json @@ -0,0 +1,191 @@ +{ + "signatures": [ + { + "keyid": 
"ac5d650bc9aa17fdad54753fbf64e083f6f613286d0feef991ff61ec26874f2b", + "sig": "3066023100beb16cde4c9e17c725713c6020cb5b11a65dd60a7b46f6842068815593e8f39cfa547ea89b6169474a8ee6a98c22f934023100f215434d25181f6f0d6a75a1bae4f09814678d3409cc0aaf0f8e1cb2b0a7bcb29acd5394b4df1751f7e73c641a17256b" + }, + { + "keyid": "6f0f52eb4cb14d590aafd5f7eb8d9a79477ac89794be2d3caade9fc39b3735e6", + "sig": "306502306130792e890c5257cc8fc951e7c9a4009a5552affbd6bdf5cef3bac647de18f82918adbd4edfaa6d1624e2c378ee0ca4023100db6cff59e145b0113560116eac4466e52fc24e3ec3f02ee39fe7213718c2184d58e5473a5f29429b1a4c63e7ff87b83b" + }, + { + "keyid": "2d019dcc7a3e8da4d22bf364f0e0cb87937b2ced68339f3c53d305d1a9aadcce", + "sig": "3066023100ed93223b4ab9784c00b73937cd431fe9d6af8906548124a10215ad93432523476a3265e712fe99b7555ea30ce5aeff30023100e55ae85f68da6ac322f912cae2a28facb6b3f014770d348a7c9daee100e5eb8ae78cc86321b20711f3b357d900a075a4" + }, + { + "keyid": "e942404daa3e8cb1143ab5f275df2f8c741ae002194147806bd6f05b8e2e816f", + "sig": "3066023100a1a75a85dbe43db459e3d8c1bd935f2717bae0b1cba79ea5b9a5e785b7eb08cc30e08f96ba5fc0ccb9b9c97b9af456450231008dcc197a5de9a93649dfbd27e3f112321441913138c3377487ae85353a982cbffdc88029d681e432e86cfc14c51196ab" + }, + { + "keyid": "1286a08794005a5f1d679e56322f45fd3b55aa198f87bdc699f8213048602000", + "sig": "3064023075188913725a1c2e9af59f8663b6a178156b64d87da126a5970a3b6a3399bdfe7f5c357099f2f1a4e83d52294551c41e0230080ef2ff77b7d558879cbe0eda409c3ba2fe080860506d4f2ede314374e39dc0b2bc466af51fdd258eb76171344d42af" + }, + { + "keyid": "65ccb05ff16285a3b65ea2db2581ed083bb19acfcbd130d5484c151baf28541f", + "sig": "306602310086b9d6f39f795ad188223318f02e1d78b5798d34e333c1933e55891cc1b11cd6771b2d9ab1f5c1fc707d4815e3cc200d023100eb63f35f7cc2d0166357f2c209ecca63b82fc6bc9c310b9a0fa345957b1a0df102036ea6a6d3825d787eb7d3b3131e70" + }, + { + "keyid": "b59ade3245077bd622dc7bf41163a877e05272590cb4830632dc0d034717d735", + "sig": 
"3066023100fa26ef91f1bb3cdf779cb6bbc43d70bab67a7c66103e61b8998698f469fad0d44002d4a9399ceac304b8ee1a8823fd99023100ecd415e58696ab4778f4bdf5187be3743ac372b29cd139111b3461a0da42f8f44e5bb3ec83c2bd0ce4b6281e585b8889" + }, + { + "keyid": "a07e905cad57b71374ef5e408d61936c31957b35026de0b8db3938878ccad637", + "sig": "3066023100f4802957c21a0916677154494c4360260f5994c35c435d2bf2df39bc7cccca7fb437563d21ae128bcaa7909ead7d6e7802310097513f90e5e7dbe4bcb3f9b20308e966ec38960e8cad4869a4b32be8bd98726ded4a68c671d5f22858dd10ba3b56b04a" + }, + { + "keyid": "a442c20904f96e3a367e16037665bfb2e002bb2e9586cec4c96d83697a49fa2a", + "sig": "30660231008d58d822ee1accd6bff07e79f171d61d122d35c1d51c86b2f2ada76cff695090fdf859127889f9a8d90e539277b5ab5a023100f0ad8d7ba6a25e27316a91bbc61c9b4d31f42c5c93662ea53d660af6b8ab9ee111b135b6844901ccd281fd9246d5f786" + }, + { + "keyid": "8969905969a712d54c9b327939aead62784587b54d1d03cbaa835f79205069bf", + "sig": "3066023100894ecac9291e64ea6b84d168b886ef5829f4ad5b57c83b0ec745b644e4d19983e29c4681b5f744070a679e71fb4325af023100b6365d50a44e59932ea24d17a9fbc975cc7e7b44539d36dc7208470b1ddc9572b06266e149d1793a12a98cd62b803a95" + } + ], + "signed": { + "_type": "root", + "consistent_snapshot": true, + "expires": "2027-04-21T15:31:05Z", + "keys": { + "1286a08794005a5f1d679e56322f45fd3b55aa198f87bdc699f8213048602000": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEruzzCikai9w8LqTLE4cxf0qRIFU6AQve\nnMmudDdNo22MCiOwbuYjJJ1dvRlMiSVrAGyv1+37h8aXGa5Qbx5nb4TEIRfaDth8\nhMbKJcQ7OOK/6SaltjNZh3VaZ396/WIC\n-----END PUBLIC KEY-----\n" + }, + "scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-keyowner": "@nouemankhal" + }, + "2d019dcc7a3e8da4d22bf364f0e0cb87937b2ced68339f3c53d305d1a9aadcce": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC 
KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAErfozI3wqaB8k6o6Mc7SPFiw8s1dLTaxk\nMmhMsdkk7QIl3t+gFzWNdXANEjN027g4S6Ty2CvdzovU37yD24td9pQBh8LGmfPa\nmU5cxtzRaXkCibibJrrvLxyyZTWZXW6C\n-----END PUBLIC KEY-----\n" + }, + "scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-keyowner": "@alexeypilyugin" + }, + "4542ee95093cb434e0d80a4bb9dd9d96e6b67cda12759fa2648a7786f822e97d": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEUV4g/gyxCdXKHK07QWO5z6S9lRhL88DO\nOb22g0dCOtxBB2sKojAUw3wXXz+SaUZRFgqfVvezbtsC4LSkkIlwA5MrJDA83kP2\nJRo4BQPtW8wZmtSvkkRQPSfAdXv975pg\n-----END PUBLIC KEY-----\n" + }, + "scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-online-uri": "awskms:arn:aws:kms:us-east-1:510233252802:key/9efe9e34-88f3-4ad3-8828-5340561e7c42" + }, + "65ccb05ff16285a3b65ea2db2581ed083bb19acfcbd130d5484c151baf28541f": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEi44mg+tnJn41Cy4Lr42lQNRuZaHDY4d+\nB/oYkBRTiHl6n6hc6alGLS/1rWijAfSL7x7wgVeOrA5fp1ornW27vPOkRVWJO5Lv\nZcZXwJYi7svVFBkFjBAtAOF6DGuAEWc9\n-----END PUBLIC KEY-----\n" + }, + "scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-keyowner": "@lucia-sb" + }, + "6f0f52eb4cb14d590aafd5f7eb8d9a79477ac89794be2d3caade9fc39b3735e6": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEAbWHH4rfNiJFz9gXLPV/QJK0tky4/nW1\nyMPnUe1GRac6UfGcjZvGA7mpmns4FYG1KuHbPhWlEDOQnLjiIiJkY2+Z96tywq6y\n+/e+0Gc2KSsVr0IAWALkTzQE+Q6ru+lj\n-----END PUBLIC KEY-----\n" + }, + "scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-keyowner": "@nubtron" + }, + "8969905969a712d54c9b327939aead62784587b54d1d03cbaa835f79205069bf": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEkztjp5ixZrKt94qnSn4bisyEdgs0Wre/\nheazr1zx7MJUCLiHim0lEDWCB64m/YLru+W3/PLwTiQSavO62lB6y3ggjcq/ygwA\n5yxi0bP/MAJBZ0Hl+y+Q8BfKTZSrTb6j\n-----END PUBLIC KEY-----\n" + }, + 
"scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-keyowner": "@hadhemidd" + }, + "a07e905cad57b71374ef5e408d61936c31957b35026de0b8db3938878ccad637": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEEilQwnno5GxJpoyxulKzkkHa0x0/ERDa\nf3m1ZCpF9SoT2B98T+BwT6noD+qlOwX7VKLFSQwl4/od53tu6Wt3s3P70zFviq+Y\n+chUOSCbA5y/TCvfwx4mLBruXI1QbVOh\n-----END PUBLIC KEY-----\n" + }, + "scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-keyowner": "@aarakke" + }, + "a442c20904f96e3a367e16037665bfb2e002bb2e9586cec4c96d83697a49fa2a": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEUJ7k4tiIZrWNLhrNrcBBMh4we3GiMlpo\ntwVy72lNw7aMxisK6ttP0mV30Yh1rX37DO6UUdeiWImrYBVfXFkP7z2QD9qKetny\nCeVHycA7uNby7yb7pljv2l2SpTgXACZk\n-----END PUBLIC KEY-----\n" + }, + "scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-keyowner": "@iliakur" + }, + "ac5d650bc9aa17fdad54753fbf64e083f6f613286d0feef991ff61ec26874f2b": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEAWOvhm6nk7iY+EYK8ZnrxS49yqLf/ZTR\nJ74WY9Kz3ikjyXASkD4IgqJyyrmbMoqS9k6/RM/Zk6CAfPeZneDh1puVAlxy9nJD\nZp/OW78dVOqrlw1uQ0d+gfe7b4TcUNG4\n-----END PUBLIC KEY-----\n" + }, + "scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-keyowner": "@dkirov-dd" + }, + "b59ade3245077bd622dc7bf41163a877e05272590cb4830632dc0d034717d735": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEd/9wooA4OKbC7hUO1OTZN3pnFbc85PDs\n+izKkDDSqj3yk8Pa39OJstT2BHvrn/B0BKMHhE6T/PN/rhorKVIVZ3UZErn1QCgG\nkkcFfA5MQm92SjIr9zAJea9bVUJhZ+PA\n-----END PUBLIC KEY-----\n" + }, + "scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-keyowner": "@sarah-witt" + }, + "e942404daa3e8cb1143ab5f275df2f8c741ae002194147806bd6f05b8e2e816f": { + "keytype": "ecdsa", + "keyval": { + "public": "-----BEGIN PUBLIC 
KEY-----\nMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAE3VG/DJn/wmXh3bQ/LLjGMyKubQ1f5/1P\nJTVDYgTh5AC5zWxDSD26PoNpS29MecItPoM+pMy5YC99mwkEkxjNdwIke1Aons92\n8SVtL3BYH311oC6jLtFt+oqEunL5EdgJ\n-----END PUBLIC KEY-----\n" + }, + "scheme": "ecdsa-sha2-nistp384", + "x-tuf-on-ci-keyowner": "@kyle-neale" + } + }, + "roles": { + "root": { + "keyids": [ + "ac5d650bc9aa17fdad54753fbf64e083f6f613286d0feef991ff61ec26874f2b", + "6f0f52eb4cb14d590aafd5f7eb8d9a79477ac89794be2d3caade9fc39b3735e6", + "2d019dcc7a3e8da4d22bf364f0e0cb87937b2ced68339f3c53d305d1a9aadcce", + "e942404daa3e8cb1143ab5f275df2f8c741ae002194147806bd6f05b8e2e816f", + "1286a08794005a5f1d679e56322f45fd3b55aa198f87bdc699f8213048602000", + "65ccb05ff16285a3b65ea2db2581ed083bb19acfcbd130d5484c151baf28541f", + "b59ade3245077bd622dc7bf41163a877e05272590cb4830632dc0d034717d735", + "a07e905cad57b71374ef5e408d61936c31957b35026de0b8db3938878ccad637", + "a442c20904f96e3a367e16037665bfb2e002bb2e9586cec4c96d83697a49fa2a", + "8969905969a712d54c9b327939aead62784587b54d1d03cbaa835f79205069bf" + ], + "threshold": 2 + }, + "snapshot": { + "keyids": [ + "4542ee95093cb434e0d80a4bb9dd9d96e6b67cda12759fa2648a7786f822e97d" + ], + "threshold": 1, + "x-tuf-on-ci-expiry-period": 365, + "x-tuf-on-ci-signing-period": 60 + }, + "targets": { + "keyids": [ + "ac5d650bc9aa17fdad54753fbf64e083f6f613286d0feef991ff61ec26874f2b", + "6f0f52eb4cb14d590aafd5f7eb8d9a79477ac89794be2d3caade9fc39b3735e6", + "2d019dcc7a3e8da4d22bf364f0e0cb87937b2ced68339f3c53d305d1a9aadcce", + "e942404daa3e8cb1143ab5f275df2f8c741ae002194147806bd6f05b8e2e816f", + "1286a08794005a5f1d679e56322f45fd3b55aa198f87bdc699f8213048602000", + "65ccb05ff16285a3b65ea2db2581ed083bb19acfcbd130d5484c151baf28541f", + "b59ade3245077bd622dc7bf41163a877e05272590cb4830632dc0d034717d735", + "a07e905cad57b71374ef5e408d61936c31957b35026de0b8db3938878ccad637", + "a442c20904f96e3a367e16037665bfb2e002bb2e9586cec4c96d83697a49fa2a", + "8969905969a712d54c9b327939aead62784587b54d1d03cbaa835f79205069bf" + ], + 
"threshold": 1 + }, + "timestamp": { + "keyids": [ + "4542ee95093cb434e0d80a4bb9dd9d96e6b67cda12759fa2648a7786f822e97d" + ], + "threshold": 1, + "x-tuf-on-ci-expiry-period-hours": 48, + "x-tuf-on-ci-signing-period-hours": 24 + } + }, + "spec_version": "1.0.31", + "version": 1, + "x-tuf-on-ci-expiry-period": 365, + "x-tuf-on-ci-signing-period": 60 + } +} \ No newline at end of file diff --git a/datadog_checks_downloader/datadog_checks/downloader/download_v2.py b/datadog_checks_downloader/datadog_checks/downloader/download_v2.py new file mode 100644 index 0000000000000..82557de31f4c9 --- /dev/null +++ b/datadog_checks_downloader/datadog_checks/downloader/download_v2.py @@ -0,0 +1,140 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +"""TUF pointer-file downloader for the v2 repository format.""" + +from __future__ import annotations + +import hashlib +import importlib.resources +import json +import logging +import tempfile +import urllib.request +from pathlib import Path + +from tuf.ngclient import Updater +from tuf.ngclient.config import UpdaterConfig + +from .exceptions import ( + DigestMismatch, + LengthMismatch, + MalformedPointerError, + MissingVersion, + TargetNotFoundError, +) + +logger = logging.getLogger(__name__) + +V2_REPOSITORY_URL = "https://agent-integration-wheels-prod.s3.amazonaws.com" + +# tuf.ngclient sets its own fetcher timeout; this applies only to the raw wheel urlopen(). +WHEEL_FETCH_TIMEOUT_SECONDS = 60 + +REQUIRED_POINTER_KEYS = ('digest', 'length', 'wheel_path') + + +class TUFPointerDownloader: + """Downloads Datadog integration wheels from a v2 TUF repository.""" + + def __init__(self, repository_url: str, disable_verification: bool = False): + self._repository_url = repository_url.rstrip('/') + self._disable_verification = disable_verification + + if disable_verification: + logger.warning('Running with TUF verification disabled. 
Integrity is protected only by TLS (HTTPS).') + + def _bootstrap_metadata_dir(self, metadata_dir: Path) -> None: + dest = metadata_dir / 'root.json' + metadata = importlib.resources.files('datadog_checks.downloader') / 'data' / 'v2' / 'metadata' + dest.write_bytes((metadata / 'root.json').read_bytes()) + + def _make_updater(self, metadata_dir: Path, target_dir: Path) -> Updater: + return Updater( + metadata_dir=str(metadata_dir), + metadata_base_url=f'{self._repository_url}/metadata/', + target_base_url=f'{self._repository_url}/targets/', + target_dir=str(target_dir), + config=UpdaterConfig(prefix_targets_with_hash=True), + ) + + @staticmethod + def _target_path(project: str, version: str | None) -> str: + name = version if version is not None else 'latest' + return f'{project}/{name}.json' + + @staticmethod + def _wheel_filename(project: str, version: str) -> str: + distribution = project.replace('-', '_') + return f'{distribution}-{version}-py3-none-any.whl' + + def _direct_wheel_url(self, project: str, version: str) -> str: + return f'{self._repository_url}/wheels/{project}/{self._wheel_filename(project, version)}' + + @staticmethod + def _validate_pointer(project: str, pointer: dict) -> None: + for key in REQUIRED_POINTER_KEYS: + if key not in pointer: + raise MalformedPointerError(project, key) + if not pointer['wheel_path'].startswith('/'): + raise MalformedPointerError(project, 'wheel_path') + + @staticmethod + def _verify_content(project: str, content: bytes, pointer: dict) -> None: + if len(content) != pointer['length']: + raise LengthMismatch(project, pointer['length'], len(content)) + actual_digest = hashlib.sha256(content).hexdigest() + if actual_digest != pointer['digest']: + raise DigestMismatch(project, pointer['digest'], actual_digest) + + def get_pointer(self, project: str, version: str | None = None) -> dict: + """Return the pointer JSON for *project* at *version* (or 'latest' when None).""" + with tempfile.TemporaryDirectory() as tmp: + 
metadata_dir = Path(tmp) / 'metadata' + target_dir = Path(tmp) / 'targets' + metadata_dir.mkdir() + target_dir.mkdir() + + target_path = self._target_path(project, version) + self._bootstrap_metadata_dir(metadata_dir) + updater = self._make_updater(metadata_dir, target_dir) + updater.refresh() + + target_info = updater.get_targetinfo(target_path) + if target_info is None: + label = version if version is not None else 'latest stable' + raise TargetNotFoundError(f'No TUF target for {project!r} version {label!r}') + + pointer_path = target_dir / target_path + pointer_path.parent.mkdir(parents=True, exist_ok=True) + updater.download_target(target_info, pointer_path) + + return json.loads(pointer_path.read_text(encoding='utf-8')) + + def download(self, project: str, version: str | None = None, dest_dir: Path | None = None) -> Path: + """Download and verify the wheel for *project* at *version*; return its path.""" + if self._disable_verification: + if version is None: + raise MissingVersion('unsafe-disable-verification requires an explicit --version') + wheel_url = self._direct_wheel_url(project, version) + wheel_filename = self._wheel_filename(project, version) + pointer: dict | None = None + else: + pointer = self.get_pointer(project, version) + self._validate_pointer(project, pointer) + wheel_url = self._repository_url + pointer['wheel_path'] + wheel_filename = Path(pointer['wheel_path']).name + + dest = (dest_dir or Path(tempfile.mkdtemp())) / wheel_filename + + logger.info('Downloading wheel from %s', wheel_url) + with urllib.request.urlopen(wheel_url, timeout=WHEEL_FETCH_TIMEOUT_SECONDS) as resp: + content = resp.read() + + if pointer is not None: + self._verify_content(project, content, pointer) + + dest.write_bytes(content) + logger.info('Wrote %s to %s', wheel_filename, dest) + return dest diff --git a/datadog_checks_downloader/datadog_checks/downloader/exceptions.py b/datadog_checks_downloader/datadog_checks/downloader/exceptions.py index 
bb6b75e05a156..db8764040a700 100644 --- a/datadog_checks_downloader/datadog_checks/downloader/exceptions.py +++ b/datadog_checks_downloader/datadog_checks/downloader/exceptions.py @@ -30,6 +30,10 @@ def __str__(self): return '{}'.format(self.standard_distribution_name) +class MissingVersion(CLIError): + """Raised when --version is required but absent (e.g. with --unsafe-disable-verification).""" + + # Exceptions for the download module. @@ -37,6 +41,41 @@ class TargetNotFoundError(ChecksDownloaderException): """An exception raised when a target is not found.""" +class MalformedPointerError(ChecksDownloaderException): + """Raised when a TUF-signed pointer JSON is invalid or missing fields.""" + + def __init__(self, project: str, field: str): + self.project = project + self.field = field + + def __str__(self) -> str: + return f'{self.project}: pointer field {self.field!r} is missing or malformed' + + +class DigestMismatch(ChecksDownloaderException): + """Raised when the downloaded wheel's sha256 does not match the pointer.""" + + def __init__(self, project: str, expected: str, actual: str): + self.project = project + self.expected = expected + self.actual = actual + + def __str__(self) -> str: + return f'{self.project}: expected digest {self.expected}, got {self.actual}' + + +class LengthMismatch(ChecksDownloaderException): + """Raised when the downloaded wheel's byte length does not match the pointer.""" + + def __init__(self, project: str, expected: int, actual: int): + self.project = project + self.expected = expected + self.actual = actual + + def __str__(self) -> str: + return f'{self.project}: expected length {self.expected}, got {self.actual}' + + class IncorrectRootLayoutType(ChecksDownloaderException): def __init__(self, found, expected): self.found = found diff --git a/datadog_checks_downloader/pyproject.toml b/datadog_checks_downloader/pyproject.toml index 56ecc4d80baee..b40dcb4e75d39 100644 --- a/datadog_checks_downloader/pyproject.toml +++ 
b/datadog_checks_downloader/pyproject.toml @@ -55,6 +55,9 @@ include = [ include = [ "/datadog_checks/downloader", ] +artifacts = [ + "/datadog_checks/downloader/data/v2/metadata/root.json", +] dev-mode-dirs = [ ".", ] diff --git a/datadog_checks_downloader/tests/test_unit.py b/datadog_checks_downloader/tests/test_unit.py index 9170abf3ee09a..160b7c584aeef 100644 --- a/datadog_checks_downloader/tests/test_unit.py +++ b/datadog_checks_downloader/tests/test_unit.py @@ -1,7 +1,28 @@ # (C) Datadog, Inc. 2023-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +import urllib.error + +import pytest +from tuf.api.exceptions import DownloadError + +from datadog_checks.downloader.cli import _v2_failure_category from datadog_checks.downloader.download import TUFDownloader +from datadog_checks.downloader.exceptions import TargetNotFoundError + + +@pytest.mark.parametrize( + 'exc,expected', + [ + pytest.param(TargetNotFoundError('missing'), 'target version not found', id='target-not-found'), + pytest.param(urllib.error.URLError('timeout'), 'network error', id='network-urlerror'), + pytest.param(DownloadError('boom'), 'network error', id='network-downloaderror'), + pytest.param(TimeoutError('slow'), 'network error', id='network-timeout'), + pytest.param(ValueError('bad pointer'), 'other', id='other'), + ], +) +def test_v2_failure_category(exc, expected): + assert _v2_failure_category(exc) == expected def test_non_official_wheel_filter(mocker): diff --git a/datadog_checks_downloader/tests/test_v2_downloader.py b/datadog_checks_downloader/tests/test_v2_downloader.py new file mode 100644 index 0000000000000..db1977db2be0d --- /dev/null +++ b/datadog_checks_downloader/tests/test_v2_downloader.py @@ -0,0 +1,319 @@ +# (C) Datadog, Inc. 
2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +"""Unit tests for TUFPointerDownloader (v2 repository format) and the v2 CLI surface.""" + +import hashlib +import json +import urllib.error +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from tuf.api.exceptions import DownloadError + +from datadog_checks.downloader import cli +from datadog_checks.downloader.download_v2 import TUFPointerDownloader +from datadog_checks.downloader.exceptions import ( + DigestMismatch, + LengthMismatch, + MalformedPointerError, + MissingVersion, + NonCanonicalVersion, + NonDatadogPackage, + TargetNotFoundError, +) + +pytestmark = pytest.mark.offline + +PROJECT = 'datadog-postgres' +VERSION = '14.0.0' +WHEEL_NAME = f'datadog_postgres-{VERSION}-py3-none-any.whl' +WHEEL_CONTENT = b'fake wheel bytes for testing' +WHEEL_DIGEST = hashlib.sha256(WHEEL_CONTENT).hexdigest() +WHEEL_LENGTH = len(WHEEL_CONTENT) +REPO_URL = 'https://agent-integration-wheels-staging.s3.amazonaws.com' + +POINTER = { + 'digest': WHEEL_DIGEST, + 'length': WHEEL_LENGTH, + 'version': VERSION, + 'repository': REPO_URL, + 'wheel_path': f'/wheels/{PROJECT}/{WHEEL_NAME}', + 'attestation_path': f'/attestations/{PROJECT}/{VERSION}.sigstore.json', +} + + +def _mock_tuf_updater(pointer: dict) -> MagicMock: + pointer_bytes = json.dumps(pointer).encode() + mock_updater = MagicMock() + mock_updater.get_targetinfo.return_value = MagicMock() + + def fake_download_target(_target_info, dest_path): + Path(dest_path).parent.mkdir(parents=True, exist_ok=True) + Path(dest_path).write_bytes(pointer_bytes) + + mock_updater.download_target.side_effect = fake_download_target + return mock_updater + + +def _mock_response(content: bytes) -> MagicMock: + response = MagicMock() + response.__enter__ = lambda s: s + response.__exit__ = MagicMock(return_value=False) + response.read.return_value = content + return response + + +@pytest.fixture +def 
mock_urlopen(): + with patch('datadog_checks.downloader.download_v2.urllib.request.urlopen') as mock: + mock.return_value = _mock_response(WHEEL_CONTENT) + yield mock + + +@pytest.fixture +def mock_updater_cls(): + with patch('datadog_checks.downloader.download_v2.Updater') as mock: + mock.return_value = _mock_tuf_updater(POINTER) + yield mock + + +class TestTargetResolution: + @pytest.mark.parametrize( + 'version,expected_target', + [ + pytest.param(VERSION, f'{PROJECT}/{VERSION}.json', id='explicit-version'), + pytest.param(None, f'{PROJECT}/latest.json', id='missing-version'), + ], + ) + def test_get_pointer_requests_expected_target(self, mock_urlopen, mock_updater_cls, version, expected_target): + downloader = TUFPointerDownloader(repository_url=REPO_URL) + downloader.get_pointer(PROJECT, version=version) + + mock_updater = mock_updater_cls.return_value + assert mock_updater.get_targetinfo.call_args[0][0] == expected_target + + +class TestHappyPath: + def test_download_returns_wheel_path(self, mock_urlopen, mock_updater_cls, tmp_path): + downloader = TUFPointerDownloader(repository_url=REPO_URL) + wheel_path = downloader.download(PROJECT, version=VERSION, dest_dir=tmp_path) + + assert wheel_path.exists() + assert wheel_path.read_bytes() == WHEEL_CONTENT + assert wheel_path.name == WHEEL_NAME + + def test_repository_flag_overrides_pointer_repository(self, mock_urlopen, mock_updater_cls, tmp_path): + prod_pointer = {**POINTER, 'repository': 'https://agent-integration-wheels-prod.s3.amazonaws.com'} + mock_updater_cls.return_value = _mock_tuf_updater(prod_pointer) + + downloader = TUFPointerDownloader(repository_url=REPO_URL) + downloader.download(PROJECT, version=VERSION, dest_dir=tmp_path) + + mock_urlopen.assert_called_once_with( + f'{REPO_URL}/wheels/{PROJECT}/{WHEEL_NAME}', + timeout=60, + ) + + +class TestTargetNotFound: + def test_raises_when_tuf_target_absent(self, mock_urlopen, mock_updater_cls): + mock_updater = MagicMock() + 
mock_updater.get_targetinfo.return_value = None + mock_updater_cls.return_value = mock_updater + + downloader = TUFPointerDownloader(repository_url=REPO_URL) + with pytest.raises(TargetNotFoundError, match=PROJECT): + downloader.get_pointer(PROJECT, version='99.99.99') + + +class TestDigestMismatch: + def test_raises_on_corrupted_wheel(self, mock_urlopen, mock_updater_cls, tmp_path): + tampered = b'tampered bytes that match the pointer length'[:WHEEL_LENGTH] + mock_urlopen.return_value = _mock_response(tampered) + + downloader = TUFPointerDownloader(repository_url=REPO_URL) + with pytest.raises(DigestMismatch, match=PROJECT): + downloader.download(PROJECT, version=VERSION, dest_dir=tmp_path) + assert not (tmp_path / WHEEL_NAME).exists() + + +class TestLengthMismatch: + def test_raises_when_pointer_length_does_not_match_wheel(self, mock_urlopen, mock_updater_cls, tmp_path): + bad_pointer = {**POINTER, 'length': WHEEL_LENGTH + 1} + mock_updater_cls.return_value = _mock_tuf_updater(bad_pointer) + + downloader = TUFPointerDownloader(repository_url=REPO_URL) + with pytest.raises(LengthMismatch) as exc_info: + downloader.download(PROJECT, version=VERSION, dest_dir=tmp_path) + assert exc_info.value.expected == WHEEL_LENGTH + 1 + assert exc_info.value.actual == WHEEL_LENGTH + assert not (tmp_path / WHEEL_NAME).exists() + + +class TestMalformedPointer: + @pytest.mark.parametrize('missing_key', ['digest', 'length', 'wheel_path']) + def test_raises_when_required_key_missing(self, mock_urlopen, mock_updater_cls, tmp_path, missing_key): + broken_pointer = {k: v for k, v in POINTER.items() if k != missing_key} + mock_updater_cls.return_value = _mock_tuf_updater(broken_pointer) + + downloader = TUFPointerDownloader(repository_url=REPO_URL) + with pytest.raises(MalformedPointerError, match=missing_key): + downloader.download(PROJECT, version=VERSION, dest_dir=tmp_path) + + def test_raises_when_wheel_path_missing_leading_slash(self, mock_urlopen, mock_updater_cls, tmp_path): + 
no_slash_pointer = {**POINTER, 'wheel_path': f'wheels/{PROJECT}/{WHEEL_NAME}'} + mock_updater_cls.return_value = _mock_tuf_updater(no_slash_pointer) + + downloader = TUFPointerDownloader(repository_url=REPO_URL) + with pytest.raises(MalformedPointerError, match='wheel_path'): + downloader.download(PROJECT, version=VERSION, dest_dir=tmp_path) + mock_urlopen.assert_not_called() + + +class TestNetworkErrorMidDownload: + def test_http_error_propagates(self, mock_urlopen, mock_updater_cls, tmp_path): + mock_urlopen.side_effect = urllib.error.HTTPError( + url='http://example/x.whl', code=500, msg='boom', hdrs=None, fp=None + ) + + downloader = TUFPointerDownloader(repository_url=REPO_URL) + with pytest.raises(urllib.error.HTTPError): + downloader.download(PROJECT, version=VERSION, dest_dir=tmp_path) + + def test_url_error_propagates(self, mock_urlopen, mock_updater_cls, tmp_path): + mock_urlopen.side_effect = urllib.error.URLError('unreachable') + + downloader = TUFPointerDownloader(repository_url=REPO_URL) + with pytest.raises(urllib.error.URLError): + downloader.download(PROJECT, version=VERSION, dest_dir=tmp_path) + + +class TestDisableVerification: + def test_directly_downloads_wheel_without_tuf_or_digest_checks(self, mock_urlopen, mock_updater_cls, tmp_path): + content = b'bytes not matching any signed pointer' + mock_urlopen.return_value = _mock_response(content) + + downloader = TUFPointerDownloader(repository_url=REPO_URL, disable_verification=True) + wheel_path = downloader.download(PROJECT, version=VERSION, dest_dir=tmp_path) + + mock_urlopen.assert_called_once_with( + f'{REPO_URL}/wheels/{PROJECT}/{WHEEL_NAME}', + timeout=60, + ) + assert wheel_path.name == WHEEL_NAME + assert wheel_path.read_bytes() == content + mock_updater_cls.assert_not_called() + + def test_direct_download_requires_explicit_version(self, tmp_path): + downloader = TUFPointerDownloader(repository_url=REPO_URL, disable_verification=True) + with pytest.raises(MissingVersion, match='requires 
an explicit --version'): + downloader.download(PROJECT, dest_dir=tmp_path) + + +class TestInstantiateV2Downloader: + def test_rejects_non_datadog_package(self, monkeypatch): + monkeypatch.setattr('sys.argv', ['downloader', 'requests']) + with pytest.raises(NonDatadogPackage, match='requests'): + cli.instantiate_v2_downloader() + + def test_rejects_non_canonical_version(self, monkeypatch): + monkeypatch.setattr('sys.argv', ['downloader', 'datadog-postgres', '--version', 'banana']) + with pytest.raises(NonCanonicalVersion, match='banana'): + cli.instantiate_v2_downloader() + + def test_does_not_warn_when_v1_compat_flags_are_parsed(self, monkeypatch, capsys): + monkeypatch.setattr('sys.argv', ['downloader', 'datadog-postgres', '--type', 'core', '--ignore-python-version']) + cli.instantiate_v2_downloader() + assert capsys.readouterr().err == '' + + def test_warns_for_v1_compat_flags_in_strict_v2_mode(self, monkeypatch, capsys): + monkeypatch.setattr( + 'sys.argv', ['downloader', 'datadog-postgres', '--v2', '--type', 'core', '--ignore-python-version'] + ) + _, _, _, args = cli.instantiate_v2_downloader() + cli.warn_v2_ignored_args(args) + stderr = capsys.readouterr().err + assert 'WARNING: --type' in stderr + assert 'NOTE: --ignore-python-version' in stderr + + def test_force_flag_is_silently_ignored(self, monkeypatch, capsys): + monkeypatch.setattr('sys.argv', ['downloader', 'datadog-postgres', '--force']) + cli.instantiate_v2_downloader() + assert capsys.readouterr().err == '' + + +class TestCliDownloadFallback: + """Covers the cli.download() v2-attempt-then-v1-fallback orchestration.""" + + def test_strict_v2_raises_on_v2_failure(self, monkeypatch): + monkeypatch.setattr('sys.argv', ['downloader', 'datadog-postgres', '--v2']) + monkeypatch.setattr(cli, 'run_v2_downloader', MagicMock(side_effect=TargetNotFoundError('missing'))) + v1 = MagicMock() + monkeypatch.setattr(cli, 'run_downloader', v1) + monkeypatch.setattr(cli, 'instantiate_downloader', 
MagicMock(return_value=(None, None, None, None))) + + with pytest.raises(TargetNotFoundError): + cli.download() + v1.assert_not_called() + + @pytest.mark.parametrize( + 'fallback_exc', + [ + pytest.param(MissingVersion('missing'), id='missing-version'), + pytest.param(TargetNotFoundError('missing'), id='target-not-found'), + pytest.param(DownloadError('unreachable'), id='download-error'), + pytest.param(TimeoutError('slow'), id='timeout-error'), + pytest.param(urllib.error.URLError('unreachable'), id='url-error'), + ], + ) + def test_default_falls_back_to_v1_on_expected_v2_failures(self, monkeypatch, fallback_exc): + monkeypatch.setattr('sys.argv', ['downloader', 'datadog-postgres']) + monkeypatch.setattr(cli, 'run_v2_downloader', MagicMock(side_effect=fallback_exc)) + v1 = MagicMock() + monkeypatch.setattr(cli, 'run_downloader', v1) + monkeypatch.setattr(cli, 'instantiate_downloader', MagicMock(return_value=('d', 'n', 'v', False))) + + cli.download() + v1.assert_called_once_with('d', 'n', 'v', False) + + def test_default_unsafe_disable_verification_without_version_falls_back_to_v1(self, monkeypatch): + monkeypatch.setattr('sys.argv', ['downloader', 'datadog-postgres', '--unsafe-disable-verification']) + monkeypatch.setattr(cli, 'run_v2_downloader', MagicMock(side_effect=MissingVersion('missing'))) + v1 = MagicMock() + monkeypatch.setattr(cli, 'run_downloader', v1) + monkeypatch.setattr(cli, 'instantiate_downloader', MagicMock(return_value=('d', 'n', None, False))) + + cli.download() + v1.assert_called_once_with('d', 'n', None, False) + + def test_non_datadog_package_does_not_fall_back_to_v1(self, monkeypatch): + monkeypatch.setattr('sys.argv', ['downloader', 'requests']) + v1 = MagicMock() + monkeypatch.setattr(cli, 'run_downloader', v1) + monkeypatch.setattr(cli, 'instantiate_downloader', MagicMock()) + + with pytest.raises(NonDatadogPackage): + cli.download() + v1.assert_not_called() + + @pytest.mark.parametrize( + 'integrity_exc', + [ + 
pytest.param(DigestMismatch(PROJECT, 'a', 'b'), id='digest-mismatch'), + pytest.param(LengthMismatch(PROJECT, 1, 2), id='length-mismatch'), + pytest.param(MalformedPointerError(PROJECT, 'digest'), id='malformed-pointer'), + ], + ) + def test_integrity_errors_do_not_fall_back_to_v1(self, monkeypatch, integrity_exc): + monkeypatch.setattr('sys.argv', ['downloader', 'datadog-postgres']) + monkeypatch.setattr(cli, 'run_v2_downloader', MagicMock(side_effect=integrity_exc)) + v1 = MagicMock() + monkeypatch.setattr(cli, 'run_downloader', v1) + monkeypatch.setattr(cli, 'instantiate_downloader', MagicMock()) + + with pytest.raises(type(integrity_exc)): + cli.download() + v1.assert_not_called() From 01c70a3c0409cef7832ea4fb8908211f7eed41a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vahe=20Karamyan=20=28=D5=8E=D5=A1=D5=B0=D5=A5=20=D5=94?= =?UTF-8?q?=D5=A1=D6=80=D5=A1=D5=B4=D5=B5=D5=A1=D5=B6=29?= Date: Wed, 27 May 2026 17:33:34 +0400 Subject: [PATCH 4/4] [MOPU-312] Add related links to monitor templates (#23245) * [MOPU-288] Add related links to kubernetes monitor templates Co-Authored-By: Claude Sonnet 4.6 * [MOPU-288] Add related links to nginx monitor templates Co-Authored-By: Claude Sonnet 4.6 * [MOPU-288] Add related links to postgres and redis monitor templates Co-Authored-By: Claude Sonnet 4.6 * Fix broken Infrastructure links in monitor templates The /infrastructure?filters=... links pointed to a non-existent path with an unsupported query param, and used template variables not in each monitor's group-by. 
- nginx (4xx, 5xx, upstream_peer_fails): remove (upstream is not a host/pod/container resource) - k8s deployments_replicas, statefulset_replicas, pods_failed_state: remove (no host/pod template var in group-by) - k8s node_unavailable: replace with Hosts page scoped to kube_cluster_name - k8s pod_crashloopbackoff, pod_imagepullbackoff, pod_oomkilled, pods_restarting: replace with Pod Explorer scoped to pod_name Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Sonnet 4.6 --- kubernetes/assets/monitors/monitor_deployments_replicas.json | 4 ++-- kubernetes/assets/monitors/monitor_node_unavailable.json | 4 ++-- kubernetes/assets/monitors/monitor_pod_crashloopbackoff.json | 4 ++-- kubernetes/assets/monitors/monitor_pod_imagepullbackoff.json | 4 ++-- kubernetes/assets/monitors/monitor_pod_oomkilled.json | 4 ++-- kubernetes/assets/monitors/monitor_pods_failed_state.json | 4 ++-- kubernetes/assets/monitors/monitor_pods_restarting.json | 4 ++-- kubernetes/assets/monitors/monitor_statefulset_replicas.json | 4 ++-- nginx/assets/monitors/4xx.json | 4 ++-- nginx/assets/monitors/5xx.json | 4 ++-- nginx/assets/monitors/upstream_peer_fails.json | 4 ++-- postgres/assets/monitors/percent_usage_connections.json | 4 ++-- postgres/assets/monitors/replication_delay.json | 4 ++-- redisdb/assets/monitors/high_mem.json | 4 ++-- 14 files changed, 28 insertions(+), 28 deletions(-) diff --git a/kubernetes/assets/monitors/monitor_deployments_replicas.json b/kubernetes/assets/monitors/monitor_deployments_replicas.json index 39b6fb9816c5f..374a02aae8603 100644 --- a/kubernetes/assets/monitors/monitor_deployments_replicas.json +++ b/kubernetes/assets/monitors/monitor_deployments_replicas.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2020-07-28", - "last_updated_at": "2025-06-12", + "last_updated_at": "2026-04-09", "title": "Kubernetes Deployment Replicas are failing", "tags": [ "integration:kubernetes" ], "description": "Kubernetes replicas are clones that 
facilitate self-healing for pods. Each pod has a desired number of replica Pods that should be running at any given time. This monitor tracks the number of replicas that are failing per deployment.", "definition": { - "message": "{{#is_alert}}\n\n## What's happening?\nThere are at least 2 or more missing replicas for Deployment {{kube_namespace.name}}/{{kube_deployment.name}} over the last 15 minutes.\n\n{{/is_alert}}", + "message": "{{#is_alert}}\n\n## What's happening?\nThere are at least 2 or more missing replicas for Deployment {{kube_namespace.name}}/{{kube_deployment.name}} over the last 15 minutes.\n\n## Related Links\n\n- [Logs](/logs?query=kube_cluster_name:{{kube_cluster_name.name}}+kube_deployment:{{kube_deployment.name}}+kube_namespace:{{kube_namespace.name}})\n- [Metrics Explorer (kubernetes_state.deployment.replicas_desired)](/metric/explorer?exp_metric=kubernetes_state.deployment.replicas_desired&exp_scope=kube_cluster_name:{{kube_cluster_name.name}},kube_deployment:{{kube_deployment.name}},kube_namespace:{{kube_namespace.name}}&exp_agg=avg&exp_type=line)\n- [Metrics Explorer (kubernetes_state.deployment.replicas_available)](/metric/explorer?exp_metric=kubernetes_state.deployment.replicas_available&exp_scope=kube_cluster_name:{{kube_cluster_name.name}},kube_deployment:{{kube_deployment.name}},kube_namespace:{{kube_namespace.name}}&exp_agg=avg&exp_type=line)\n\n{{/is_alert}}", "name": "[Kubernetes] Monitor Kubernetes Deployments Replica Pods", "options": { "escalation_message": "", diff --git a/kubernetes/assets/monitors/monitor_node_unavailable.json b/kubernetes/assets/monitors/monitor_node_unavailable.json index 37ff9c574dcec..cc57835121156 100644 --- a/kubernetes/assets/monitors/monitor_node_unavailable.json +++ b/kubernetes/assets/monitors/monitor_node_unavailable.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2020-07-28", - "last_updated_at": "2025-06-12", + "last_updated_at": "2026-04-09", "title": "Nodes are unavailable", "tags": [ 
"integration:kubernetes" ], "description": "Kubernetes nodes can either be schedulable or unschedulable. When unschedulable, the node prevents the scheduler from placing new pods onto that node. This monitor tracks the percentage of schedulable nodes.", "definition": { - "message": "{{#is_alert}}\n\n## What's happening?\nThe percentage of schedulable nodes is below 80% for status:schedulable on ({{kube_cluster_name.name}} cluster over the last 15 minutes.\n\n{{/is_alert}}\n\n Keep in mind that this might be expected based on your infrastructure.", + "message": "{{#is_alert}}\n\n## What's happening?\nThe percentage of schedulable nodes is below 80% for status:schedulable on ({{kube_cluster_name.name}} cluster over the last 15 minutes.\n\n## Related Links\n\n- [Logs](/logs?query=kube_cluster_name:{{kube_cluster_name.name}}+status:schedulable)\n- [Hosts](/infrastructure/hosts?scope=kube_cluster_name:{{kube_cluster_name.name}})\n- [Metrics Explorer (kubernetes_state.node.status)](/metric/explorer?exp_metric=kubernetes_state.node.status&exp_scope=kube_cluster_name:{{kube_cluster_name.name}},status:schedulable&exp_agg=avg&exp_type=line)\n\n{{/is_alert}}\n\n Keep in mind that this might be expected based on your infrastructure.", "name": "[Kubernetes] Monitor Unschedulable Kubernetes Nodes", "options": { "escalation_message": "", diff --git a/kubernetes/assets/monitors/monitor_pod_crashloopbackoff.json b/kubernetes/assets/monitors/monitor_pod_crashloopbackoff.json index 1b14f874c716a..317eec3fd0032 100644 --- a/kubernetes/assets/monitors/monitor_pod_crashloopbackoff.json +++ b/kubernetes/assets/monitors/monitor_pod_crashloopbackoff.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2020-07-28", - "last_updated_at": "2025-06-12", + "last_updated_at": "2026-04-09", "title": "Pod is in a CrashloopBackOff state", "tags": [ "integration:kubernetes" ], "description": "The status CrashloopBackOff means that a container in the Pod is started, crashes, and is restarted, over 
and over again. This monitor tracks when a pod is in a CrashloopBackOff state for your Kubernetes integration.", "definition": { - "message": "{{#is_alert}}\n\n## What's happening?\nAt least one container in pod {{pod_name.name}} on {{kube_namespace.name}} is in a waiting state due to reason crashloopbackoff in the last 10 minutes.\n\n{{/is_alert}}\n\n This alert could generate several alerts for a bad deployment. Adjust the thresholds of the query to suit your infrastructure.", + "message": "{{#is_alert}}\n\n## What's happening?\nAt least one container in pod {{pod_name.name}} on {{kube_namespace.name}} is in a waiting state due to reason crashloopbackoff in the last 10 minutes.\n\n## Related Links\n\n- [Logs](/logs?query=kube_cluster_name:{{kube_cluster_name.name}}+kube_namespace:{{kube_namespace.name}}+pod_name:{{pod_name.name}}+reason:crashloopbackoff)\n- [Pod Explorer](/orchestration/explorer/pod?query={{pod_name.name}})\n- [Metrics Explorer (kubernetes_state.container.status_report.count.waiting)](/metric/explorer?exp_metric=kubernetes_state.container.status_report.count.waiting&exp_scope=kube_cluster_name:{{kube_cluster_name.name}},kube_namespace:{{kube_namespace.name}},pod_name:{{pod_name.name}},reason:crashloopbackoff&exp_agg=avg&exp_type=line)\n\n{{/is_alert}}\n\n This alert could generate several alerts for a bad deployment. 
Adjust the thresholds of the query to suit your infrastructure.", "name": "[Kubernetes] Pod {{pod_name.name}} is CrashloopBackOff on namespace {{kube_namespace.name}}", "options": { "escalation_message": "", diff --git a/kubernetes/assets/monitors/monitor_pod_imagepullbackoff.json b/kubernetes/assets/monitors/monitor_pod_imagepullbackoff.json index 07f30a6eb7b44..a42c9be57e11d 100644 --- a/kubernetes/assets/monitors/monitor_pod_imagepullbackoff.json +++ b/kubernetes/assets/monitors/monitor_pod_imagepullbackoff.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2020-09-15", - "last_updated_at": "2025-06-12", + "last_updated_at": "2026-04-09", "title": "Pod is in an ImagePullBackOff state", "tags": [ "integration:kubernetes" ], "description": "The status ImagePullBackOff means that a container could not start because Kubernetes could not pull a container image. This monitor tracks when a pod is in an ImagePullBackOff state for your Kubernetes integration.", "definition": { - "message": "{{#is_alert}}\n\n## What's happening?\nAt least one container in pod {{pod_name.name}} on namespace {{kube_namespace.name}} is in a waiting state due to an ImagePullBackOff error in the last 10 minutes.\n\n{{/is_alert}}\n\n This could happen for several reasons, for example a bad image path or tag or if the credentials for pulling images are not configured properly.", + "message": "{{#is_alert}}\n\n## What's happening?\nAt least one container in pod {{pod_name.name}} on namespace {{kube_namespace.name}} is in a waiting state due to an ImagePullBackOff error in the last 10 minutes.\n\n## Related Links\n\n- [Logs](/logs?query=kube_cluster_name:{{kube_cluster_name.name}}+kube_namespace:{{kube_namespace.name}}+pod_name:{{pod_name.name}}+reason:imagepullbackoff)\n- [Pod Explorer](/orchestration/explorer/pod?query={{pod_name.name}})\n- [Metrics Explorer 
(kubernetes_state.container.status_report.count.waiting)](/metric/explorer?exp_metric=kubernetes_state.container.status_report.count.waiting&exp_scope=kube_cluster_name:{{kube_cluster_name.name}},kube_namespace:{{kube_namespace.name}},pod_name:{{pod_name.name}},reason:imagepullbackoff&exp_agg=avg&exp_type=line)\n\n{{/is_alert}}\n\n This could happen for several reasons, for example a bad image path or tag or if the credentials for pulling images are not configured properly.", "name": "[Kubernetes] Pod {{pod_name.name}} is ImagePullBackOff on namespace {{kube_namespace.name}}", "options": { "escalation_message": "", diff --git a/kubernetes/assets/monitors/monitor_pod_oomkilled.json b/kubernetes/assets/monitors/monitor_pod_oomkilled.json index 3eece4a5d9e41..e4f7ad7aa755c 100644 --- a/kubernetes/assets/monitors/monitor_pod_oomkilled.json +++ b/kubernetes/assets/monitors/monitor_pod_oomkilled.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2025-09-15", - "last_updated_at": "2025-09-15", + "last_updated_at": "2026-04-09", "title": "Pod is in an OOMKilled state", "tags": [ "integration:kubernetes" ], "description": "The status OOMKilled means that a container was killed because it exceeded memory limits or the node ran out of available memory. 
This monitor tracks when a pod is in an OOMKilled state for your Kubernetes integration.", "definition": { - "message": "{{#is_alert}}\n\n## What's happening?\nThere has been at least one container terminated in pod {{pod_name.name}} on namespace {{kube_namespace.name}} with reason oomkilled in the last 10 minutes.\n\n{{/is_alert}}\n\n This could happen for several reasons, for example insufficient memory limits, memory leaks in the application, or the node running out of available memory.", + "message": "{{#is_alert}}\n\n## What's happening?\nThere has been at least one container terminated in pod {{pod_name.name}} on namespace {{kube_namespace.name}} with reason oomkilled in the last 10 minutes.\n\n## Related Links\n\n- [Logs](/logs?query=kube_cluster_name:{{kube_cluster_name.name}}+kube_namespace:{{kube_namespace.name}}+pod_name:{{pod_name.name}}+reason:oomkilled)\n- [Pod Explorer](/orchestration/explorer/pod?query={{pod_name.name}})\n- [Metrics Explorer (kubernetes.containers.state.terminated)](/metric/explorer?exp_metric=kubernetes.containers.state.terminated&exp_scope=kube_cluster_name:{{kube_cluster_name.name}},kube_namespace:{{kube_namespace.name}},pod_name:{{pod_name.name}},reason:oomkilled&exp_agg=avg&exp_type=line)\n\n{{/is_alert}}\n\n This could happen for several reasons, for example insufficient memory limits, memory leaks in the application, or the node running out of available memory.", "name": "[Kubernetes] Pod {{pod_name.name}} is OOMKilled on namespace {{kube_namespace.name}}", "options": { "escalation_message": "", diff --git a/kubernetes/assets/monitors/monitor_pods_failed_state.json b/kubernetes/assets/monitors/monitor_pods_failed_state.json index 708a41da74ee4..33ee1b348348e 100644 --- a/kubernetes/assets/monitors/monitor_pods_failed_state.json +++ b/kubernetes/assets/monitors/monitor_pods_failed_state.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2020-07-28", - "last_updated_at": "2025-06-12", + "last_updated_at": "2026-04-09", 
"title": "Pods are failing", "tags": [ "integration:kubernetes" ], "description": "When a pod is failing it means the container either exited with non-zero status or was terminated by the system. This monitor tracks when more than 10 pods are failing for a given Kubernetes cluster.", "definition": { - "message": "{{#is_alert}}\n\n## What's happening?\nThe number of failed pods has increased by more than 10 in ({{kube_cluster_name.name}} cluster in the last 5 minutes.\n\n{{/is_alert}}\n\n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.", + "message": "{{#is_alert}}\n\n## What's happening?\nThe number of failed pods has increased by more than 10 in ({{kube_cluster_name.name}} cluster in the last 5 minutes.\n\n## Related Links\n\n- [Logs](/logs?query=kube_cluster_name:{{kube_cluster_name.name}}+kube_namespace:{{kube_namespace.name}}+pod_phase:failed)\n- [Metrics Explorer (kubernetes_state.pod.status_phase)](/metric/explorer?exp_metric=kubernetes_state.pod.status_phase&exp_scope=kube_cluster_name:{{kube_cluster_name.name}},kube_namespace:{{kube_namespace.name}},pod_phase:failed&exp_agg=avg&exp_type=line)\n\n{{/is_alert}}\n\n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.", "name": "[Kubernetes] Monitor Kubernetes Failed Pods in Namespaces", "options": { "escalation_message": "", diff --git a/kubernetes/assets/monitors/monitor_pods_restarting.json b/kubernetes/assets/monitors/monitor_pods_restarting.json index f35d90c629c09..c7cccced75755 100644 --- a/kubernetes/assets/monitors/monitor_pods_restarting.json +++ b/kubernetes/assets/monitors/monitor_pods_restarting.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2020-07-28", - "last_updated_at": "2025-06-12", + "last_updated_at": "2026-04-09", "title": "Pods are restarting", "tags": [ "integration:kubernetes" ], "description": "Kubernetes pods restart according to the restart policy. 
A restarting container can indicate problems with memory, CPU usage, or an application exiting prematurely. This monitor tracks when pods are restarting multiple times.", "definition": { - "message": "{{#is_alert}}\n\n## What's happening?\nThere has been an increase of more than 5 container restarts in the pod {{pod_name.name}} in the last 5 minutes.\n\n{{/is_alert}}", + "message": "{{#is_alert}}\n\n## What's happening?\nThere has been an increase of more than 5 container restarts in the pod {{pod_name.name}} in the last 5 minutes.\n\n## Related Links\n\n- [Logs](/logs?query=kube_cluster_name:{{kube_cluster_name.name}}+pod_name:{{pod_name.name}})\n- [Pod Explorer](/orchestration/explorer/pod?query={{pod_name.name}})\n- [Metrics Explorer (kubernetes.containers.restarts)](/metric/explorer?exp_metric=kubernetes.containers.restarts&exp_scope=kube_cluster_name:{{kube_cluster_name.name}},pod_name:{{pod_name.name}}&exp_agg=avg&exp_type=line)\n\n{{/is_alert}}", "name": "[Kubernetes] Monitor Kubernetes Pods Restarting", "options": { "escalation_message": "", diff --git a/kubernetes/assets/monitors/monitor_statefulset_replicas.json b/kubernetes/assets/monitors/monitor_statefulset_replicas.json index b0954fe2785bb..ef5fbc979d832 100644 --- a/kubernetes/assets/monitors/monitor_statefulset_replicas.json +++ b/kubernetes/assets/monitors/monitor_statefulset_replicas.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2020-07-28", - "last_updated_at": "2025-06-12", + "last_updated_at": "2026-04-09", "title": "Kubernetes Statefulset Replicas are failing", "tags": [ "integration:kubernetes" ], "description": "Kubernetes replicas are clones that facilitate self-healing for pods. Each pod has a desired number of replica Pods that should be running at any given time. 
This monitor tracks when the number of replicas per statefulset is falling.", "definition": { - "message": "{{#is_alert}}\n\n## What's happening?\nThere are at least 2 desired replicas that are not ready for {{kube_namespace.name}}/{{kube_stateful_set.name}} StatefulSet over the last 15 minutes.\n\n{{/is_alert}}\n\n This might present an unsafe situation for any further manual operations, such as killing other pods.", + "message": "{{#is_alert}}\n\n## What's happening?\nThere are at least 2 desired replicas that are not ready for {{kube_namespace.name}}/{{kube_stateful_set.name}} StatefulSet over the last 15 minutes.\n\n## Related Links\n\n- [Logs](/logs?query=kube_cluster_name:{{kube_cluster_name.name}}+kube_namespace:{{kube_namespace.name}}+kube_stateful_set:{{kube_stateful_set.name}})\n- [Metrics Explorer (kubernetes_state.statefulset.replicas_desired)](/metric/explorer?exp_metric=kubernetes_state.statefulset.replicas_desired&exp_scope=kube_cluster_name:{{kube_cluster_name.name}},kube_namespace:{{kube_namespace.name}},kube_stateful_set:{{kube_stateful_set.name}}&exp_agg=avg&exp_type=line)\n- [Metrics Explorer (kubernetes_state.statefulset.replicas_ready)](/metric/explorer?exp_metric=kubernetes_state.statefulset.replicas_ready&exp_scope=kube_cluster_name:{{kube_cluster_name.name}},kube_namespace:{{kube_namespace.name}},kube_stateful_set:{{kube_stateful_set.name}}&exp_agg=avg&exp_type=line)\n\n{{/is_alert}}\n\n This might present an unsafe situation for any further manual operations, such as killing other pods.", "name": "[Kubernetes] Monitor Kubernetes Statefulset Replicas", "options": { "escalation_message": "", diff --git a/nginx/assets/monitors/4xx.json b/nginx/assets/monitors/4xx.json index 17fbd04888321..e38ee3436c7e3 100644 --- a/nginx/assets/monitors/4xx.json +++ b/nginx/assets/monitors/4xx.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2020-09-16", - "last_updated_at": "2026-03-09", + "last_updated_at": "2026-04-09", "title": "Upstream 4xx errors 
are high", "tags": [ "integration:nginx" ], "description": "NGINX sends requests to upstream peers that can fail eventually. This monitor tracks the count of 4xx HTTP responses to identify issues in the communication between NGINX and the backend servers.", "definition": { - "message": "{{#is_alert}}\n## 🚨 What's happening\n\nAn anomaly has been detected in the number of 4xx HTTP responses from NGINX upstream **{{upstream.name}}** (anomaly score: `{{value}}`, threshold: `{{threshold}}`). The 4xx response rate is significantly higher than normal, indicating that a notable portion of incoming requests are being rejected with client-side error codes.\n\nFirst triggered at **{{first_triggered_at}}**, active for **{{triggered_duration_sec}}** seconds.\n{{/is_alert}}{{#is_recovery}}\n## ✅ Recovered\n\nThe 4xx anomaly for upstream **{{upstream.name}}** has resolved. Current value: `{{value}}`.\n{{/is_recovery}}\n{{^is_recovery}}\n***\n\n## 📈 Impact\n\nElevated 4xx error rates can result in failed requests for end users and may expose misconfigurations or broken routes. Services and clients relying on this NGINX upstream may experience partial or complete degradation of functionality.\n\n***\n\n## Runbook\n\n### Initial Troubleshooting Steps\n\n1. **Identify the affected upstream** from the alert (`{{upstream.name}}`).\n2. Open [**Metrics Explorer**](/metric/explorer) and inspect `nginx.upstream.peers.responses.4xx` broken down by `upstream`.\n3. Review NGINX access logs for specific endpoints and status codes:\n ```bash\n tail -f /var/log/nginx/access.log | grep \" 4[0-9][0-9] \"\n ```\n4. Correlate the spike with recent configuration changes, upstream deployments, or traffic shifts.\n\n### Cause and Resolution\n\n| Cause | Resolution |\n| ----- | ---------- |\n| Invalid or removed request paths (404) | Verify routes in NGINX configuration; update upstream routing rules to reflect the current backend state. 
|\n| Authentication or authorization failures (401/403) | Review auth configuration; check if credentials or access tokens have expired or been revoked. |\n| Malformed client requests (400) | Inspect incoming request headers and payloads; check client-side request construction. |\n| Rate limiting triggered (429) | Review rate limit thresholds; consider scaling upstream services or relaxing limits. |\n| Upstream endpoints renamed or removed | Update NGINX upstream configuration to reflect the current backend service endpoints. |\n\n### Related links\n\n* [Documentation](https://docs.datadoghq.com/integrations/nginx/)\n* [Metrics Explorer](/metric/explorer)\n* [Log Explorer](/logs?query=source%3Anginx)\n\n### Who should be notified?\n\nAssign the appropriate notification handle for this alert (e.g., `@slack-infra`, `@pagerduty-nginx`):\n`@your-team-handle`\n{{/is_recovery}}", + "message": "{{#is_alert}}\n## 🚨 What's happening\n\nAn anomaly has been detected in the number of 4xx HTTP responses from NGINX upstream **{{upstream.name}}** (anomaly score: `{{value}}`, threshold: `{{threshold}}`). The 4xx response rate is significantly higher than normal, indicating that a notable portion of incoming requests are being rejected with client-side error codes.\n\nFirst triggered at **{{first_triggered_at}}**, active for **{{triggered_duration_sec}}** seconds.\n{{/is_alert}}{{#is_recovery}}\n## ✅ Recovered\n\nThe 4xx anomaly for upstream **{{upstream.name}}** has resolved. Current value: `{{value}}`.\n{{/is_recovery}}\n{{^is_recovery}}\n***\n\n## 📈 Impact\n\nElevated 4xx error rates can result in failed requests for end users and may expose misconfigurations or broken routes. Services and clients relying on this NGINX upstream may experience partial or complete degradation of functionality.\n\n***\n\n## Runbook\n\n### Initial Troubleshooting Steps\n\n1. **Identify the affected upstream** from the alert (`{{upstream.name}}`).\n2. 
Open [**Metrics Explorer**](/metric/explorer) and inspect `nginx.upstream.peers.responses.4xx` broken down by `upstream`.\n3. Review NGINX access logs for specific endpoints and status codes:\n ```bash\n tail -f /var/log/nginx/access.log | grep \" 4[0-9][0-9] \"\n ```\n4. Correlate the spike with recent configuration changes, upstream deployments, or traffic shifts.\n\n### Cause and Resolution\n\n| Cause | Resolution |\n| ----- | ---------- |\n| Invalid or removed request paths (404) | Verify routes in NGINX configuration; update upstream routing rules to reflect the current backend state. |\n| Authentication or authorization failures (401/403) | Review auth configuration; check if credentials or access tokens have expired or been revoked. |\n| Malformed client requests (400) | Inspect incoming request headers and payloads; check client-side request construction. |\n| Rate limiting triggered (429) | Review rate limit thresholds; consider scaling upstream services or relaxing limits. |\n| Upstream endpoints renamed or removed | Update NGINX upstream configuration to reflect the current backend service endpoints. 
|\n\n### Related links\n\n* [Documentation](https://docs.datadoghq.com/integrations/nginx/)\n* [Logs](/logs?query=upstream:{{upstream.name}})\n* [Metrics Explorer (nginx.upstream.peers.responses.4xx)](/metric/explorer?exp_metric=nginx.upstream.peers.responses.4xx&exp_scope=upstream:{{upstream.name}}&exp_agg=avg&exp_type=line)\n\n### Who should be notified?\n\nAssign the appropriate notification handle for this alert (e.g., `@slack-infra`, `@pagerduty-nginx`):\n`@your-team-handle`\n{{/is_recovery}}", "name": "[NGINX] 4xx Errors higher than usual", "options": { "escalation_message": "", diff --git a/nginx/assets/monitors/5xx.json b/nginx/assets/monitors/5xx.json index c7b9ef7201dbc..b98d0bf985336 100644 --- a/nginx/assets/monitors/5xx.json +++ b/nginx/assets/monitors/5xx.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2020-09-16", - "last_updated_at": "2026-03-09", + "last_updated_at": "2026-04-09", "title": "Upstream 5xx errors are high", "tags": [ "integration:nginx" ], "description": "“5xx upstream request errors” are indicating server issues from backend servers. This monitor tracks the count of 5xx responses from NGINX's upstream peers to identify server-related issues in your web or application infrastructure.", "definition": { - "message": "{{#is_alert}}\n## 🚨 What's happening\n\nAn anomaly has been detected in the number of 5xx HTTP responses from NGINX upstream **{{upstream.name}}** (anomaly score: `{{value}}`, threshold: `{{threshold}}`). The 5xx error rate is significantly higher than normal, indicating that backend servers are failing to handle a notable portion of requests.\n\nFirst triggered at **{{first_triggered_at}}**, active for **{{triggered_duration_sec}}** seconds.\n{{/is_alert}}{{#is_recovery}}\n## ✅ Recovered\n\nThe 5xx anomaly for upstream **{{upstream.name}}** has resolved. 
Current value: `{{value}}`.\n{{/is_recovery}}\n{{^is_recovery}}\n***\n\n## 📈 Impact\n\n5xx errors indicate server-side failures that cause direct service disruptions for users. Dependent services that rely on successful responses from this NGINX upstream may experience cascading failures or degraded functionality.\n\n***\n\n## Runbook\n\n### Initial Troubleshooting Steps\n\n1. **Identify the affected upstream** from the alert (`{{upstream.name}}`).\n2. Open [**Metrics Explorer**](/metric/explorer) and inspect `nginx.upstream.peers.responses.5xx` broken down by `upstream`.\n3. Review NGINX error logs for connection failures or backend errors:\n ```bash\n tail -f /var/log/nginx/error.log\n ```\n4. Check upstream backend service health and application logs.\n5. Correlate the spike with recent deployments or infrastructure changes.\n\n### Cause and Resolution\n\n| Cause | Resolution |\n| ----- | ---------- |\n| Backend server is down or crashed (502) | Verify the upstream service is running; restart the service if needed and check its logs. |\n| Gateway timeout due to slow upstream (504) | Check upstream response times; increase `proxy_read_timeout` if the upstream is legitimately slow. |\n| Application-level errors (500) | Inspect upstream application logs for unhandled exceptions or crashes; roll back recent deployments if correlated. |\n| Service unavailable due to overload (503) | Check upstream server resource utilization; scale out or enable load balancing across more peers. |\n| Resource exhaustion on upstream servers | Review CPU, memory, and connection pool usage on the backend; tune resource limits and autoscaling. 
|\n\n### Related links\n\n* [Documentation](https://docs.datadoghq.com/integrations/nginx/)\n* [Metrics Explorer](/metric/explorer)\n* [Log Explorer](/logs?query=source%3Anginx)\n\n### Who should be notified?\n\nAssign the appropriate notification handle for this alert (e.g., `@slack-infra`, `@pagerduty-nginx`):\n`@your-team-handle`\n{{/is_recovery}}", + "message": "{{#is_alert}}\n## 🚨 What's happening\n\nAn anomaly has been detected in the number of 5xx HTTP responses from NGINX upstream **{{upstream.name}}** (anomaly score: `{{value}}`, threshold: `{{threshold}}`). The 5xx error rate is significantly higher than normal, indicating that backend servers are failing to handle a notable portion of requests.\n\nFirst triggered at **{{first_triggered_at}}**, active for **{{triggered_duration_sec}}** seconds.\n{{/is_alert}}{{#is_recovery}}\n## ✅ Recovered\n\nThe 5xx anomaly for upstream **{{upstream.name}}** has resolved. Current value: `{{value}}`.\n{{/is_recovery}}\n{{^is_recovery}}\n***\n\n## 📈 Impact\n\n5xx errors indicate server-side failures that cause direct service disruptions for users. Dependent services that rely on successful responses from this NGINX upstream may experience cascading failures or degraded functionality.\n\n***\n\n## Runbook\n\n### Initial Troubleshooting Steps\n\n1. **Identify the affected upstream** from the alert (`{{upstream.name}}`).\n2. Open [**Metrics Explorer**](/metric/explorer) and inspect `nginx.upstream.peers.responses.5xx` broken down by `upstream`.\n3. Review NGINX error logs for connection failures or backend errors:\n ```bash\n tail -f /var/log/nginx/error.log\n ```\n4. Check upstream backend service health and application logs.\n5. Correlate the spike with recent deployments or infrastructure changes.\n\n### Cause and Resolution\n\n| Cause | Resolution |\n| ----- | ---------- |\n| Backend server is down or crashed (502) | Verify the upstream service is running; restart the service if needed and check its logs. 
|\n| Gateway timeout due to slow upstream (504) | Check upstream response times; increase `proxy_read_timeout` if the upstream is legitimately slow. |\n| Application-level errors (500) | Inspect upstream application logs for unhandled exceptions or crashes; roll back recent deployments if correlated. |\n| Service unavailable due to overload (503) | Check upstream server resource utilization; scale out or enable load balancing across more peers. |\n| Resource exhaustion on upstream servers | Review CPU, memory, and connection pool usage on the backend; tune resource limits and autoscaling. |\n\n### Related links\n\n* [Documentation](https://docs.datadoghq.com/integrations/nginx/)\n* [Logs](/logs?query=upstream:{{upstream.name}})\n* [Metrics Explorer (nginx.upstream.peers.responses.5xx)](/metric/explorer?exp_metric=nginx.upstream.peers.responses.5xx&exp_scope=upstream:{{upstream.name}}&exp_agg=avg&exp_type=line)\n\n### Who should be notified?\n\nAssign the appropriate notification handle for this alert (e.g., `@slack-infra`, `@pagerduty-nginx`):\n`@your-team-handle`\n{{/is_recovery}}", "name": "[NGINX] 5xx Errors higher than usual", "options": { "escalation_message": "", diff --git a/nginx/assets/monitors/upstream_peer_fails.json b/nginx/assets/monitors/upstream_peer_fails.json index 08cef0b5647dd..d8f4b0fb003d1 100644 --- a/nginx/assets/monitors/upstream_peer_fails.json +++ b/nginx/assets/monitors/upstream_peer_fails.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2020-09-16", - "last_updated_at": "2026-03-09", + "last_updated_at": "2026-04-09", "title": "Upstream peers are failing", "tags": [ "integration:nginx" ], "description": "NGINX can be configured to distribute incoming client requests to multiple upstream peers (individual web servers, application servers, or other backend services). 
This monitor tracks anomalies in the number of failed upstream peers to identify issues.", "definition": { - "message": "{{#is_alert}}\n## 🚨 What's happening\n\nAn anomaly has been detected in the number of upstream peer communication failures for **{{upstream.name}}** (anomaly score: `{{value}}`, threshold: `{{threshold}}`). NGINX is experiencing an unusual number of unsuccessful attempts to connect to or communicate with one or more backend servers.\n\nFirst triggered at **{{first_triggered_at}}**, active for **{{triggered_duration_sec}}** seconds.\n{{/is_alert}}{{#is_recovery}}\n## ✅ Recovered\n\nUpstream peer failures for **{{upstream.name}}** have resolved. Current value: `{{value}}`.\n{{/is_recovery}}\n{{^is_recovery}}\n***\n\n## 📈 Impact\n\nUpstream peer failures reduce the pool of available backend servers, increasing load on healthy peers. Users may experience intermittent errors or increased response times as NGINX retries or routes traffic around failed peers.\n\n***\n\n## Runbook\n\n### Initial Troubleshooting Steps\n\n1. **Identify the affected upstream** from the alert (`{{upstream.name}}`).\n2. Open [**Metrics Explorer**](/metric/explorer) and inspect `nginx.stream.upstream.peers.fails` broken down by `upstream` to identify which specific peers are failing.\n3. Review NGINX error logs for connection-level failures:\n ```bash\n tail -f /var/log/nginx/error.log | grep \"upstream\"\n ```\n4. Test connectivity from the NGINX host to the failing upstream servers:\n ```bash\n curl -v http://:/health\n ```\n5. Correlate with recent configuration changes or upstream service deployments.\n\n### Cause and Resolution\n\n| Cause | Resolution |\n| ----- | ---------- |\n| Upstream server is down or crashed | Verify the upstream service is running and listening on the expected port; restart if needed. |\n| Network connectivity issues | Test connectivity from the NGINX host to the upstream; check firewall rules and network routing. 
|\n| Upstream not responding within timeout | Review `proxy_connect_timeout` and `proxy_read_timeout` in NGINX config; increase if the upstream is legitimately slow. |\n| Misconfigured upstream address or port | Verify the upstream block in NGINX configuration has the correct server addresses and ports. |\n| Firewall or security group blocking traffic | Check security group rules and host-based firewall (iptables/nftables) on the upstream servers. |\n\n### Related links\n\n* [Documentation](https://docs.datadoghq.com/integrations/nginx/)\n* [Metrics Explorer](/metric/explorer)\n* [Log Explorer](/logs?query=source%3Anginx)\n\n### Who should be notified?\n\nAssign the appropriate notification handle for this alert (e.g., `@slack-infra`, `@pagerduty-nginx`):\n`@your-team-handle`\n{{/is_recovery}}", + "message": "{{#is_alert}}\n## 🚨 What's happening\n\nAn anomaly has been detected in the number of upstream peer communication failures for **{{upstream.name}}** (anomaly score: `{{value}}`, threshold: `{{threshold}}`). NGINX is experiencing an unusual number of unsuccessful attempts to connect to or communicate with one or more backend servers.\n\nFirst triggered at **{{first_triggered_at}}**, active for **{{triggered_duration_sec}}** seconds.\n{{/is_alert}}{{#is_recovery}}\n## ✅ Recovered\n\nUpstream peer failures for **{{upstream.name}}** have resolved. Current value: `{{value}}`.\n{{/is_recovery}}\n{{^is_recovery}}\n***\n\n## 📈 Impact\n\nUpstream peer failures reduce the pool of available backend servers, increasing load on healthy peers. Users may experience intermittent errors or increased response times as NGINX retries or routes traffic around failed peers.\n\n***\n\n## Runbook\n\n### Initial Troubleshooting Steps\n\n1. **Identify the affected upstream** from the alert (`{{upstream.name}}`).\n2. Open [**Metrics Explorer**](/metric/explorer) and inspect `nginx.stream.upstream.peers.fails` broken down by `upstream` to identify which specific peers are failing.\n3. 
Review NGINX error logs for connection-level failures:\n ```bash\n tail -f /var/log/nginx/error.log | grep \"upstream\"\n ```\n4. Test connectivity from the NGINX host to the failing upstream servers:\n ```bash\n curl -v http://<upstream_host>:<port>/health\n ```\n5. Correlate with recent configuration changes or upstream service deployments.\n\n### Cause and Resolution\n\n| Cause | Resolution |\n| ----- | ---------- |\n| Upstream server is down or crashed | Verify the upstream service is running and listening on the expected port; restart if needed. |\n| Network connectivity issues | Test connectivity from the NGINX host to the upstream; check firewall rules and network routing. |\n| Upstream not responding within timeout | Review `proxy_connect_timeout` and `proxy_read_timeout` in NGINX config; increase if the upstream is legitimately slow. |\n| Misconfigured upstream address or port | Verify the upstream block in NGINX configuration has the correct server addresses and ports. |\n| Firewall or security group blocking traffic | Check security group rules and host-based firewall (iptables/nftables) on the upstream servers.
|\n\n### Related links\n\n* [Documentation](https://docs.datadoghq.com/integrations/nginx/)\n* [Logs](/logs?query=upstream:{{upstream.name}})\n* [Metrics Explorer (nginx.stream.upstream.peers.fails)](/metric/explorer?exp_metric=nginx.stream.upstream.peers.fails&exp_scope=upstream:{{upstream.name}}&exp_agg=avg&exp_type=line)\n\n### Who should be notified?\n\nAssign the appropriate notification handle for this alert (e.g., `@slack-infra`, `@pagerduty-nginx`):\n`@your-team-handle`\n{{/is_recovery}}", "name": "[NGINX] Upstream peers fails", "options": { "escalation_message": "", diff --git a/postgres/assets/monitors/percent_usage_connections.json b/postgres/assets/monitors/percent_usage_connections.json index bfa477bbe4e8a..01608f00c12bb 100644 --- a/postgres/assets/monitors/percent_usage_connections.json +++ b/postgres/assets/monitors/percent_usage_connections.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2021-03-17", - "last_updated_at": "2023-07-24", + "last_updated_at": "2026-04-09", "title": "Connection pool is reaching saturation point", "tags": [ "integration:postgres" ], "description": "In PostgreSQL, there is a limit of concurrent connections that can be increased. When this limit is exceeded, new users cannot establish a connection with the database. 
This monitor tracks the total number of connections.", "definition": { - "message": "{{#is_alert}}\n\n## What's happening?\nPostgreSQL connection usage on host {{host.name}} has exceeded 90% of the maximum allowed connections over the last 15 minutes.\n\n{{/is_alert}}", + "message": "{{#is_alert}}\n\n## What's happening?\nPostgreSQL connection usage on host {{host.name}} has exceeded 90% of the maximum allowed connections over the last 15 minutes.\n\n## Related Links\n\n- [Metrics Explorer (postgresql.percent_usage_connections)](/metric/explorer?exp_metric=postgresql.percent_usage_connections&exp_agg=avg&exp_type=line)\n\n{{/is_alert}}", "name": "[Postgres] Number of connections is approaching connection limit on {{host.name}}", "options": { "escalation_message": "", diff --git a/postgres/assets/monitors/replication_delay.json b/postgres/assets/monitors/replication_delay.json index 889700af13e39..6ec45e4efe1c5 100644 --- a/postgres/assets/monitors/replication_delay.json +++ b/postgres/assets/monitors/replication_delay.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2021-02-16", - "last_updated_at": "2021-03-17", + "last_updated_at": "2026-04-09", "title": "Replication delay is high", "tags": [ "integration:postgres" ], "description": "Replication lag is the delay between the time when data is written to the primary database and the time when it is replicated to the standby databases. 
This monitor tracks the replication lag of the postgres database.", "definition": { - "message": "{{#is_alert}}\n\n## What's happening?\nAnomalies in replication delay on host {{host.name}} for PostgreSQL have been detected above the expected range within the past 15 minutes, over the last hour.\n\n{{/is_alert}}", + "message": "{{#is_alert}}\n\n## What's happening?\nAnomalies in replication delay on host {{host.name}} for PostgreSQL have been detected above the expected range within the past 15 minutes, over the last hour.\n\n## Related Links\n\n- [Metrics Explorer (postgresql.replication_delay)](/metric/explorer?exp_metric=postgresql.replication_delay&exp_agg=avg&exp_type=line)\n\n{{/is_alert}}", "name": "[Postgres] Replication delay is abnormally high on {{host.name}}", "options": { "escalation_message": "", diff --git a/redisdb/assets/monitors/high_mem.json b/redisdb/assets/monitors/high_mem.json index b230aa497a55d..a360246d126a6 100644 --- a/redisdb/assets/monitors/high_mem.json +++ b/redisdb/assets/monitors/high_mem.json @@ -1,14 +1,14 @@ { "version": 2, "created_at": "2021-02-08", - "last_updated_at": "2021-02-08", + "last_updated_at": "2026-04-09", "title": "Memory consumption is high", "tags": [ "integration:redis" ], "description": "Redis servers use RAM to store data and memory is a critical resource for its performance. 
This monitor tracks the percentage of used memory to avoid the risk of running out of memory, which can lead to performance issues.", "definition": { - "message": "## What's happening?\n{{#is_alert}}\nRedis memory usage has exceeded 90% of its allocated limit in the last 5 minutes with current value of {{value}}.\n{{/is_alert}} \n\n{{#is_warning}}\nRedis memory usage has exceeded 70% of its allocated limit in the last 5 minutes with current value of {{value}}.\n{{/is_warning}}", + "message": "## What's happening?\n{{#is_alert}}\nRedis memory usage has exceeded 90% of its allocated limit in the last 5 minutes with current value of {{value}}.\n{{/is_alert}} \n\n{{#is_warning}}\nRedis memory usage has exceeded 70% of its allocated limit in the last 5 minutes with current value of {{value}}.\n{{/is_warning}}\n\n## Related Links\n\n- [Metrics Explorer (redis.mem.used)](/metric/explorer?exp_metric=redis.mem.used&exp_agg=avg&exp_type=line)\n- [Metrics Explorer (redis.mem.maxmemory)](/metric/explorer?exp_metric=redis.mem.maxmemory&exp_agg=avg&exp_type=line)", "name": "[Redis] High memory consumption", "options": { "escalation_message": "",