Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
bfd424e
feat(observability): scaffold infrahub-observability chart
FragmentedPacket May 12, 2026
e841554
feat(observability): ship Alloy config.alloy as ConfigMap
FragmentedPacket May 12, 2026
3c4f877
feat(observability): provision Grafana datasources via sidecar ConfigMap
FragmentedPacket May 12, 2026
f8e12ac
feat(observability): provision vendored Grafana dashboards via sideca…
FragmentedPacket May 12, 2026
e122fcd
feat(observability): add Prefect prometheus exporter Deployment
FragmentedPacket May 14, 2026
3680c9c
feat(observability): expose Prefect exporter via Service with scrape …
FragmentedPacket May 14, 2026
8b26984
feat(observability): add post-install NOTES with port-forward and wir…
FragmentedPacket May 14, 2026
d515bd0
feat(infrahub): add global.tracing for OTLP collector wiring
FragmentedPacket May 14, 2026
16a33c0
ci: lint infrahub-observability chart in CI
FragmentedPacket May 14, 2026
7c8b682
docs(observability): render README from gotmpl
FragmentedPacket May 14, 2026
051d00d
docs: add local install/test guide for infrahub + observability
FragmentedPacket May 14, 2026
0d24c99
fix(observability): correct service names, ports, and broken subchart…
FragmentedPacket May 14, 2026
c7f2520
fix(infrahub): emit OTEL_EXPORTER_OTLP_INSECURE alongside INFRAHUB_TR…
FragmentedPacket May 14, 2026
71e66c8
docs: update local-testing guide with verified pod names and discover…
FragmentedPacket May 14, 2026
a1fc473
docs(plans): design dashboard k8s adaptation + cAdvisor scrape
FragmentedPacket May 14, 2026
a331b8a
feat(observability): scrape kubelet cAdvisor for container metrics
FragmentedPacket May 14, 2026
7ef3a6c
feat(observability): add dashboard transform pipeline for k8s label a…
FragmentedPacket May 14, 2026
328335e
chore(observability): re-sync dashboards through k8s transform
FragmentedPacket May 14, 2026
fec33ee
feat(observability): static-validate dashboard queries against allowlist
FragmentedPacket May 14, 2026
e455729
ci: path-filter chart-specific lint jobs and add dashboard validator
FragmentedPacket May 14, 2026
9db5d1b
ci: scheduled upstream-dashboard drift check with auto-PR
FragmentedPacket May 14, 2026
082a2f9
feat(observability): wire infrahub-observability as optional subchart
FragmentedPacket May 18, 2026
9654361
refactor(observability): use infrahub-observability.enabled toggle
FragmentedPacket May 18, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 64 additions & 6 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,77 @@ concurrency:
cancel-in-progress: true

jobs:
helm-lint:
changes:
  # Path-filter gate: computes one boolean output per chart so the lint jobs
  # below only run when files relevant to that chart were touched.
  timeout-minutes: 2
  runs-on: "ubuntu-latest"
  outputs:
    infrahub: "${{ steps.filter.outputs.infrahub }}"
    enterprise: "${{ steps.filter.outputs.enterprise }}"
    observability: "${{ steps.filter.outputs.observability }}"
  steps:
    - uses: "actions/checkout@v4"
    - uses: "dorny/paths-filter@v3"
      id: filter
      with:
        # The enterprise filter also lists charts/infrahub/**, so a change to
        # the base chart re-lints the enterprise chart as well.
        filters: |
          infrahub:
            - 'charts/infrahub/**'
          enterprise:
            - 'charts/infrahub-enterprise/**'
            - 'charts/infrahub/**'
          observability:
            - 'charts/infrahub-observability/**'
            - 'scripts/sync-dashboards.sh'
            - 'scripts/transform_dashboard.py'
            - 'scripts/validate_dashboards.py'
            - 'scripts/known-metrics.yaml'

helm-lint-infrahub:
  # Lints charts/infrahub. Runs only when the `changes` job reports that a
  # path in the `infrahub` filter was modified.
  needs: changes
  if: ${{ needs.changes.outputs.infrahub == 'true' }}
  runs-on: "ubuntu-latest"
  timeout-minutes: 5
  steps:
    # Single checkout (with submodules) and single Helm install; the older
    # named duplicates of these steps and the step that had a name but no
    # `run`/`uses` were left over from the previous combined helm-lint job.
    - uses: "actions/checkout@v4"
      with:
        submodules: true
    - uses: "azure/setup-helm@v4.3.0"
    - name: "Updating dependencies"
      run: "helm dependency update charts/infrahub"
    - name: "Linting: helm lint infrahub"
      run: "helm lint charts/infrahub"

helm-lint-enterprise:
  # Lints charts/infrahub-enterprise when the `enterprise` path filter of the
  # `changes` job matched.
  runs-on: "ubuntu-latest"
  timeout-minutes: 5
  needs: ["changes"]
  if: needs.changes.outputs.enterprise == 'true'
  steps:
    - uses: "actions/checkout@v4"
      with:
        submodules: true
    - uses: "azure/setup-helm@v4.3.0"
    - name: "Linting: helm lint infrahub-enterprise"
      run: "helm lint charts/infrahub-enterprise"

helm-lint-observability:
  # Lints charts/infrahub-observability and statically validates the vendored
  # dashboards when the `observability` path filter of the `changes` job
  # matched.
  runs-on: "ubuntu-latest"
  timeout-minutes: 10
  needs: ["changes"]
  if: needs.changes.outputs.observability == 'true'
  steps:
    - uses: "actions/checkout@v4"
      with:
        submodules: true
    - uses: "azure/setup-helm@v4.3.0"
    - uses: "actions/setup-python@v5"
      with:
        python-version: "3.12"
    # pyyaml is installed ahead of the dashboard validator step below.
    - name: "Install Python deps"
      run: "pip install pyyaml"
    - name: "Updating dependencies: infrahub-observability"
      run: "helm dependency update charts/infrahub-observability"
    - name: "Linting: helm lint infrahub-observability"
      run: "helm lint charts/infrahub-observability"
    - name: "Static-validate dashboard queries against known-metrics allowlist"
      run: "python3 scripts/validate_dashboards.py"
80 changes: 80 additions & 0 deletions .github/workflows/dashboard-drift-check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
---
# yamllint disable rule:line-length
#
# Scheduled re-sync of upstream Grafana dashboards. If upstream changed the
# JSON at the recorded ref (or the transform script's output changes), this
# workflow opens (or updates) a draft PR with the re-synced dashboards so a
# human can review the diff. If there's no drift, the workflow is a no-op.
#
# Trigger options:
#   - Scheduled: runs Mondays at 09:00 UTC
#   - Manual: workflow_dispatch with an optional REF input
name: "Dashboard drift check"
on:
  schedule:
    - cron: "0 9 * * 1"
  workflow_dispatch:
    inputs:
      ref:
        description: "Upstream ref to sync against (default: ref recorded in .dashboards-source)"
        required: false
        type: string

jobs:
  drift-check:
    runs-on: "ubuntu-latest"
    timeout-minutes: 10
    # Write access is needed so the last step can push the automation branch
    # and open or update the pull request.
    permissions:
      contents: write
      pull-requests: write
    steps:
      - uses: "actions/checkout@v4"
      # NOTE(review): this downloads the unpinned `latest` yq release without
      # checksum verification; consider pinning a specific release.
      - name: "Install yq"
        run: |
          sudo wget -q -O /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64
          sudo chmod +x /usr/local/bin/yq
      - name: "Re-sync dashboards"
        # The user-supplied input is passed through an environment variable
        # instead of being expanded into the script text, so quotes or shell
        # metacharacters in the ref cannot alter the command that runs.
        env:
          UPSTREAM_REF: "${{ inputs.ref }}"
        run: |
          if [ -n "${UPSTREAM_REF}" ]; then
            ./scripts/sync-dashboards.sh "${UPSTREAM_REF}"
          else
            ./scripts/sync-dashboards.sh
          fi
      - name: "Validate the synced dashboards"
        run: |
          python3 -m pip install pyyaml
          python3 scripts/validate_dashboards.py
      - name: "Open or update drift PR"
        uses: "peter-evans/create-pull-request@v7"
        with:
          # The branch name is fixed so re-runs update the same PR rather
          # than spawning a new one each week.
          branch: "automation/dashboard-drift"
          base: "${{ github.event.repository.default_branch }}"
          title: "chore(observability): sync upstream dashboards"
          commit-message: |
            chore(observability): sync upstream dashboards

            Auto-generated by .github/workflows/dashboard-drift-check.yml.
            Re-ran scripts/sync-dashboards.sh and committed any drift.
          body: |
            Automated drift detection re-ran `scripts/sync-dashboards.sh`
            against the ref recorded in `.dashboards-source`
            (`${{ inputs.ref || 'default' }}`) and found changes.

            **Review checklist**

            - [ ] Look at the rendered diff — are upstream's edits sensible?
            - [ ] Confirm the transform pipeline still produces clean output
                  (no leftover `container_label_com_docker_compose_*` tokens).
            - [ ] Spot-check at least one panel in Grafana against a live
                  cluster before merging.
            - [ ] If upstream renamed a metric we no longer collect, update
                  `scripts/known-metrics.yaml`.

            Generated by `peter-evans/create-pull-request@v7`.
          draft: true
          labels: |
            automation
            observability
          delete-branch: true
28 changes: 28 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Developer convenience targets for the charts in this repository.
# Run `make help` for a summary.
.PHONY: help sync-dashboards lint lint-observability template-observability deps-observability

# Print the list of available targets.
help:
	@echo "Available targets:"
	@echo " sync-dashboards - sync vendored dashboards from upstream infrahub (REF=<ref> to override)"
	@echo " deps-observability - run 'helm dependency update' for the observability chart"
	@echo " lint-observability - run 'helm lint' for the observability chart"
	@echo " template-observability - render the observability chart with default values"
	@echo " lint - lint every chart in charts/"

# Re-sync the vendored dashboards; REF is optional and passed through as-is.
sync-dashboards:
	./scripts/sync-dashboards.sh $(REF)

# Fetch the observability chart's subchart dependencies.
deps-observability:
	helm dependency update charts/infrahub-observability

lint-observability: deps-observability
	helm lint charts/infrahub-observability

template-observability: deps-observability
	helm template test charts/infrahub-observability

# Lint every chart under charts/. All charts are always processed, but any
# failing `helm dependency update` or `helm lint` is remembered in rc so the
# target exits non-zero; a bare loop would only report the last chart's status.
lint:
	@rc=0; \
	for chart in charts/*/; do \
		echo "==> linting $$chart"; \
		helm dependency update "$$chart" >/dev/null || rc=1; \
		helm lint "$$chart" || rc=1; \
	done; \
	exit $$rc
14 changes: 14 additions & 0 deletions charts/infrahub-observability/.dashboards-source
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Tracks the upstream source for vendored Grafana dashboards.
# Updated by scripts/sync-dashboards.sh and reviewed in PRs.
# The chart's appVersion should match the infrahub release recorded here.
#
# repo:  upstream GitHub repository (owner/name) the dashboards are vendored from.
# ref:   upstream ref the vendored copies were last synced against.
# path:  directory inside the upstream repository that holds the dashboard JSON.
# files: dashboard JSON files to vendor from that directory.
repo: opsmill/infrahub
ref: infrahub-v1.9.3
path: development/grafana/provisioning/dashboards
files:
- infrahub_monitoring.json
- neo4j_monitoring.json
- rabbitmq_instance_monitoring.json
- container_resources.json
- loki_monitoring.json
- prefect_platform_overview.json
- prefect_flow_run_overview.json
23 changes: 23 additions & 0 deletions charts/infrahub-observability/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
50 changes: 50 additions & 0 deletions charts/infrahub-observability/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---
# Chart identity and versioning. appVersion mirrors the infrahub release whose
# dashboards are vendored (see .dashboards-source).
apiVersion: v2
name: infrahub-observability
type: application
version: 0.1.0
appVersion: "1.9.3"
description: "Observability stack (Alloy, Loki, Prometheus, Tempo, Grafana) for Infrahub on Kubernetes"

# Project links and discovery metadata.
home: "https://github.com/opsmill/infrahub-helm"
icon: "https://github.com/opsmill/infrahub/raw/develop/frontend/app/public/favicons/logo512.png"
sources:
  - "https://github.com/opsmill/infrahub"
  - "https://github.com/opsmill/infrahub-helm"
keywords:
  - infrahub
  - observability
  - grafana
  - loki
  - prometheus
  - tempo
  - alloy
  - kubernetes
maintainers:
  - name: OpsMill
    url: "https://github.com/opsmill"

# Each subchart can be switched off through its own `<name>.enabled` value.
dependencies:
  - name: alloy
    repository: "https://grafana.github.io/helm-charts"
    version: "1.0.3"
    condition: alloy.enabled
  - name: loki
    repository: "https://grafana.github.io/helm-charts"
    version: "6.16.0"
    condition: loki.enabled
  - name: tempo
    repository: "https://grafana.github.io/helm-charts"
    version: "1.10.0"
    condition: tempo.enabled
  - name: grafana
    repository: "https://grafana.github.io/helm-charts"
    version: "8.5.0"
    condition: grafana.enabled
  - name: prometheus
    repository: "https://prometheus-community.github.io/helm-charts"
    version: "25.27.0"
    condition: prometheus.enabled
  - name: prometheus-node-exporter
    repository: "https://prometheus-community.github.io/helm-charts"
    version: "4.36.0"
    condition: prometheus-node-exporter.enabled
112 changes: 112 additions & 0 deletions charts/infrahub-observability/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# infrahub-observability

Observability stack (Alloy, Loki, Prometheus, Tempo, Grafana) for Infrahub on Kubernetes

**Homepage:** <https://github.com/opsmill/infrahub-helm>

This chart deploys the same observability stack that Infrahub ships for local
Docker Compose development — Grafana Alloy (logs + metrics), Loki (logs),
Prometheus (metrics + remote-write receiver), Tempo (traces), Grafana
(visualization), and the Prefect prometheus exporter — onto Kubernetes. It is
designed to be installed alongside the [infrahub](../infrahub) or
[infrahub-enterprise](../infrahub-enterprise) chart in the same namespace.

## Prerequisites

- Kubernetes 1.24+
- Helm 3.0+
- PV provisioner support in the underlying infrastructure (Loki, Prometheus,
Tempo and Grafana enable persistence by default)
- The infrahub chart is installed in the same namespace, or its release
name is supplied via `global.infrahubReleaseName`

## Installing the Chart

```sh
helm dependency update charts/infrahub-observability
helm install obs charts/infrahub-observability -n infrahub
```

## Wiring infrahub to send traces to Tempo

The infrahub chart exposes a `global.tracing` block that emits the
`INFRAHUB_TRACE_*` env vars on the server and task-worker deployments. Point
it at the Tempo service this chart creates:

```yaml
# infrahub values
global:
tracing:
enabled: true
endpoint: "obs-tempo:4317" # <obs-release-name>-tempo:4317 (host:port for grpc)
protocol: grpc
insecure: true
```

## Dashboards

Seven Grafana dashboards are vendored from the [opsmill/infrahub
repository](https://github.com/opsmill/infrahub) at the version recorded in
`.dashboards-source`. The chart's `appVersion` tracks this version. The
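dashboards are kept in sync with upstream automatically as part of the
infrahub release flow, and a weekly drift-check workflow
(`.github/workflows/dashboard-drift-check.yml`) opens a draft PR when the
vendored copies differ from upstream; for local development: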

```sh
make sync-dashboards REF=infrahub-v1.9.3
```

## Uninstalling the Chart

```sh
helm delete obs -n infrahub
```

Persistent volumes for Loki, Prometheus, Tempo and Grafana are retained by
default. Delete the PVCs explicitly if you want a clean slate.

## Maintainers

| Name | Email | Url |
| ---- | ------ | --- |
| OpsMill | | <https://github.com/opsmill> |

## Requirements

| Repository | Name | Version |
|------------|------|---------|
| https://grafana.github.io/helm-charts | alloy | 1.0.3 |
| https://grafana.github.io/helm-charts | grafana | 8.5.0 |
| https://grafana.github.io/helm-charts | loki | 6.16.0 |
| https://grafana.github.io/helm-charts | tempo | 1.10.0 |
| https://prometheus-community.github.io/helm-charts | prometheus | 25.27.0 |
| https://prometheus-community.github.io/helm-charts | prometheus-node-exporter | 4.36.0 |

## Values

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| alloy | object | `{"alloy":{"clustering":{"enabled":false},"configMap":{"create":false,"key":"config.alloy","name":""},"mounts":{"dockercontainers":false,"varlog":true}},"cadvisor":{"enabled":true},"controller":{"type":"daemonset"},"enabled":true}` | Values for the Grafana Alloy subchart (log and metric collection). |
| alloy.alloy.configMap.name | string | `""` | Name of the ConfigMap that holds Alloy's config.alloy file. Resolved at render time via the helper. |
| alloy.cadvisor | object | `{"enabled":true}` | Scrape kubelet cAdvisor for per-container CPU/memory/network/fs metrics. Requires the Alloy ServiceAccount to have `get nodes/proxy`, which the subchart's default RBAC already grants. Disable if your cluster's policy forbids that permission; the Container Resources and Neo4j Monitoring dashboards will then show no data. |
| global | object | `{"commonAnnotations":{},"commonLabels":{},"imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"infrahubNamespace":"","infrahubReleaseName":"infrahub","kubernetesClusterDomain":"cluster.local","podLabels":{}}` | Global values shared across all sub-charts and templates in this chart. |
| global.commonAnnotations | object | `{}` | Annotations added to every resource managed by this chart. |
| global.commonLabels | object | `{}` | Labels added to every resource managed by this chart. |
| global.imagePullPolicy | string | `"IfNotPresent"` | Default imagePullPolicy for in-chart workloads (currently only the Prefect exporter). |
| global.imagePullSecrets | list | `[]` | Image pull secrets propagated to in-chart workloads. |
| global.infrahubNamespace | string | `""` | Namespace where the sibling infrahub release lives. Empty string means the same namespace as this release. |
| global.infrahubReleaseName | string | `"infrahub"` | Release name of the sibling infrahub chart. Used by the Prefect exporter to derive the default PREFECT_API_URL and by Alloy when scoping discovery. |
| global.kubernetesClusterDomain | string | `"cluster.local"` | Cluster DNS domain. Used for fully-qualified service names if needed. |
| global.podLabels | object | `{}` | Pod-level labels merged into the standard selector labels. |
| grafana | object | `{"adminPassword":"admin","adminUser":"admin","enabled":true,"env":{"GF_LOG_LEVEL":"warn","GF_USERS_ALLOW_SIGN_UP":"false"},"ingress":{"enabled":false},"persistence":{"enabled":true,"size":"5Gi"},"service":{"type":"ClusterIP"},"sidecar":{"dashboards":{"enabled":true,"label":"grafana_dashboard","labelValue":"1","searchNamespace":"ALL"},"datasources":{"enabled":true,"label":"grafana_datasource","labelValue":"1","searchNamespace":"ALL"}}}` | Values for the Grafana subchart (visualization). |
| grafana.adminPassword | string | `"admin"` | Default password matches docker-compose dev parity. Override via `grafana.admin.existingSecret` in production. |
| loki | object | `{"backend":{"replicas":0},"chunksCache":{"enabled":false},"deploymentMode":"SingleBinary","enabled":true,"gateway":{"enabled":false},"loki":{"auth_enabled":false,"commonConfig":{"replication_factor":1},"compactor":{"compaction_interval":"10m","delete_request_store":"filesystem","retention_delete_delay":"2h","retention_delete_worker_count":100,"retention_enabled":true,"working_directory":"/var/loki/compactor"},"limits_config":{"allow_structured_metadata":true,"cardinality_limit":100000,"ingestion_burst_size_mb":64,"ingestion_rate_mb":32,"max_entries_limit_per_query":10000,"max_global_streams_per_user":15000,"max_query_lookback":"24h","max_streams_per_user":20000,"per_stream_rate_limit":"3MB","per_stream_rate_limit_burst":"5MB","reject_old_samples":true,"reject_old_samples_max_age":"168h","retention_period":"24h"},"schemaConfig":{"configs":[{"from":"2024-04-01","index":{"period":"24h","prefix":"loki_index_"},"object_store":"filesystem","schema":"v13","store":"tsdb"}]},"server":{"log_level":"warn"},"storage":{"type":"filesystem"}},"lokiCanary":{"enabled":false},"read":{"replicas":0},"resultsCache":{"enabled":false},"singleBinary":{"persistence":{"enabled":true,"size":"10Gi"},"replicas":1},"test":{"enabled":false},"write":{"replicas":0}}` | Values for the Loki subchart (log storage); defaults to SingleBinary mode with filesystem storage. |
| prefectExporter | object | `{"affinity":{},"enabled":true,"image":{"pullPolicy":"","repository":"prefecthq/prometheus-prefect-exporter","tag":"3.3.0"},"logLevel":"WARNING","nodeSelector":{},"podAnnotations":{},"prefectApiUrl":"","replicas":1,"resources":{},"securityContext":{"runAsNonRoot":true,"runAsUser":1000},"service":{"port":8000,"type":"ClusterIP"},"tolerations":[]}` | Settings for the in-chart Prefect Prometheus exporter Deployment and its Service. |
| prefectExporter.enabled | bool | `true` | Enable the Prefect prometheus exporter sidecar Deployment. |
| prefectExporter.logLevel | string | `"WARNING"` | Log level passed to the exporter. |
| prefectExporter.prefectApiUrl | string | `""` | PREFECT_API_URL. Empty string defaults to the task-manager service of the sibling infrahub release (see _helpers.tpl). |
| prometheus | object | `{"alertmanager":{"enabled":false},"enabled":true,"kube-state-metrics":{"enabled":false},"prometheus-node-exporter":{"enabled":false},"prometheus-pushgateway":{"enabled":false},"server":{"extraArgs":{"log.level":"warn","web.enable-remote-write-receiver":""},"persistentVolume":{"enabled":true,"size":"20Gi"},"retention":"96h"},"serverFiles":{"prometheus.yml":{"scrape_configs":[]}}}` | Values for the Prometheus subchart (metrics storage); the remote-write receiver is enabled by default. |
| prometheus-node-exporter | object | `{"enabled":true}` | Values for the prometheus-node-exporter subchart. |
| tempo | object | `{"enabled":true,"persistence":{"enabled":true,"size":"10Gi"},"tempo":{"metricsGenerator":{"enabled":false},"receivers":{"otlp":{"protocols":{"grpc":{"endpoint":"0.0.0.0:4317"},"http":{}}}},"retention":"96h"}}` | Values for the Tempo subchart (trace storage); the OTLP gRPC receiver listens on port 4317 by default. |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)
Loading
Loading