Skip to content

Commit 3793af8

Browse files
authored
Aggregate metadata for ownership chain and use annotations for runtime risks (#33)
* add AggregatePodMetadata to track metadata of pod and its owners, add runtime risk tracking * simplify recursion to iteration, don't get metadata for delete events * add metadata client, switch to using PartialObjectMetadata * focus linter only on new changes when run from PRs
1 parent 088b94e commit 3793af8

File tree

7 files changed

+261
-26
lines changed

7 files changed

+261
-26
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
*~
22
/deployment-tracker
3+
.idea/

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ deployment records to GitHub's artifact metadata API.
2323
- **Real-time tracking**: Sends deployment records when pods are
2424
created or deleted
2525
- **Graceful shutdown**: Properly drains work queue before terminating
26+
- **Runtime risks**: Track runtime risks through annotations
2627

2728
## How It Works
2829

@@ -82,14 +83,21 @@ The `DN_TEMPLATE` supports the following placeholders:
8283
- `{{deploymentName}}` - Name of the owning Deployment
8384
- `{{containerName}}` - Container name
8485

86+
## Runtime Risks
87+
88+
You can track runtime risks through annotations. Add the annotation `github.com/runtime-risks`, whose value is a comma-separated list of supported runtime risk values. Annotations are aggregated from the pod and its owner reference objects.
89+
90+
Currently supported runtime risks can be found in the [Create Deployment Record API docs](https://docs.github.com/en/rest/orgs/artifact-metadata?apiVersion=2022-11-28#create-an-artifact-deployment-record). Invalid runtime risk values will be ignored.
91+
92+
8593
## Kubernetes Deployment
8694

8795
A complete deployment manifest is provided in `deploy/manifest.yaml`
8896
which includes:
8997

9098
- **Namespace**: `deployment-tracker`
9199
- **ServiceAccount**: Identity for the controller pod
92-
- **ClusterRole**: Minimal permissions (`get`, `list`, `watch` on pods)
100+
- **ClusterRole**: Minimal permissions (`get`, `list`, `watch` on pods; `get` on other supported objects)
93101
- **ClusterRoleBinding**: Binds the ServiceAccount to the ClusterRole
94102
- **Deployment**: Runs the controller with security hardening
95103

cmd/deployment-tracker/main.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"time"
1414

1515
"github.com/github/deployment-tracker/internal/controller"
16+
"k8s.io/client-go/metadata"
1617

1718
"github.com/prometheus/client_golang/prometheus/promhttp"
1819
"k8s.io/client-go/kubernetes"
@@ -112,6 +113,14 @@ func main() {
112113
os.Exit(1)
113114
}
114115

116+
// Create metadata client
117+
metadataClient, err := metadata.NewForConfig(k8sCfg)
118+
if err != nil {
119+
slog.Error("Error creating Kubernetes metadata client",
120+
"error", err)
121+
os.Exit(1)
122+
}
123+
115124
// Start the metrics server
116125
var promSrv = &http.Server{
117126
Addr: ":" + metricsPort,
@@ -151,7 +160,7 @@ func main() {
151160
cancel()
152161
}()
153162

154-
cntrl, err := controller.New(clientset, namespace, excludeNamespaces, &cntrlCfg)
163+
cntrl, err := controller.New(clientset, metadataClient, namespace, excludeNamespaces, &cntrlCfg)
155164
if err != nil {
156165
slog.Error("Failed to create controller",
157166
"error", err)

deploy/manifest.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ rules:
2020
- apiGroups: ["apps"]
2121
resources: ["deployments"]
2222
verbs: ["get"]
23+
- apiGroups: ["apps"]
24+
resources: ["replicasets"]
25+
verbs: ["get"]
2326
---
2427
apiVersion: rbac.authorization.k8s.io/v1
2528
kind: ClusterRoleBinding
@@ -66,7 +69,7 @@ spec:
6669
- name: PHYSICAL_ENVIRONMENT
6770
value: "iad-moda1"
6871
- name: CLUSTER
69-
value: "kommendorkapten"
72+
value: "test-cluster"
7073
- name: BASE_URL
7174
value: "http://artifact-registry.artifact-registry.svc.cluster.local:9090"
7275
resources:

internal/controller/controller.go

Lines changed: 150 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,17 @@ import (
55
"errors"
66
"fmt"
77
"log/slog"
8+
"slices"
89
"strings"
910
"sync"
1011
"time"
1112

1213
"github.com/github/deployment-tracker/pkg/deploymentrecord"
1314
"github.com/github/deployment-tracker/pkg/image"
1415
"github.com/github/deployment-tracker/pkg/metrics"
16+
"k8s.io/apimachinery/pkg/runtime/schema"
17+
"k8s.io/apimachinery/pkg/types"
18+
"k8s.io/client-go/metadata"
1519

1620
corev1 "k8s.io/api/core/v1"
1721
k8serrors "k8s.io/apimachinery/pkg/api/errors"
@@ -29,6 +33,8 @@ const (
2933
EventCreated = "CREATED"
3034
// EventDeleted indicates that a pod has been deleted.
3135
EventDeleted = "DELETED"
36+
// RuntimeRiskAnnotationKey represents the annotation key for runtime risks.
37+
RuntimeRiskAnnotationKey = "github.com/runtime-risks"
3238
)
3339

3440
// PodEvent represents a pod event to be processed.
@@ -38,21 +44,27 @@ type PodEvent struct {
3844
DeletedPod *corev1.Pod // Only populated for delete events
3945
}
4046

47+
// AggregatePodMetadata represents combined metadata for a pod and its ownership hierarchy.
48+
type AggregatePodMetadata struct {
49+
RuntimeRisks map[deploymentrecord.RuntimeRisk]bool
50+
}
51+
4152
// Controller is the Kubernetes controller for tracking deployments.
4253
type Controller struct {
43-
clientset kubernetes.Interface
44-
podInformer cache.SharedIndexInformer
45-
workqueue workqueue.TypedRateLimitingInterface[PodEvent]
46-
apiClient *deploymentrecord.Client
47-
cfg *Config
54+
clientset kubernetes.Interface
55+
metadataClient metadata.Interface
56+
podInformer cache.SharedIndexInformer
57+
workqueue workqueue.TypedRateLimitingInterface[PodEvent]
58+
apiClient *deploymentrecord.Client
59+
cfg *Config
4860
// best effort cache to avoid redundant posts
4961
// post requests are idempotent, so if this cache fails due to
5062
// restarts or other events, nothing will break.
5163
observedDeployments sync.Map
5264
}
5365

5466
// New creates a new deployment tracker controller.
55-
func New(clientset kubernetes.Interface, namespace string, excludeNamespaces string, cfg *Config) (*Controller, error) {
67+
func New(clientset kubernetes.Interface, metadataClient metadata.Interface, namespace string, excludeNamespaces string, cfg *Config) (*Controller, error) {
5668
// Create informer factory
5769
factory := createInformerFactory(clientset, namespace, excludeNamespaces)
5870

@@ -84,11 +96,12 @@ func New(clientset kubernetes.Interface, namespace string, excludeNamespaces str
8496
}
8597

8698
cntrl := &Controller{
87-
clientset: clientset,
88-
podInformer: podInformer,
89-
workqueue: queue,
90-
apiClient: apiClient,
91-
cfg: cfg,
99+
clientset: clientset,
100+
metadataClient: metadataClient,
101+
podInformer: podInformer,
102+
workqueue: queue,
103+
apiClient: apiClient,
104+
cfg: cfg,
92105
}
93106

94107
// Add event handlers to the informer
@@ -334,16 +347,26 @@ func (c *Controller) processEvent(ctx context.Context, event PodEvent) error {
334347

335348
var lastErr error
336349

350+
// Gather aggregate metadata for adds/updates
351+
var runtimeRisks []deploymentrecord.RuntimeRisk
352+
if status != deploymentrecord.StatusDecommissioned {
353+
aggMetadata := c.aggregateMetadata(ctx, podToPartialMetadata(pod))
354+
for risk := range aggMetadata.RuntimeRisks {
355+
runtimeRisks = append(runtimeRisks, risk)
356+
}
357+
slices.Sort(runtimeRisks)
358+
}
359+
337360
// Record info for each container in the pod
338361
for _, container := range pod.Spec.Containers {
339-
if err := c.recordContainer(ctx, pod, container, status, event.EventType); err != nil {
362+
if err := c.recordContainer(ctx, pod, container, status, event.EventType, runtimeRisks); err != nil {
340363
lastErr = err
341364
}
342365
}
343366

344367
// Also record init containers
345368
for _, container := range pod.Spec.InitContainers {
346-
if err := c.recordContainer(ctx, pod, container, status, event.EventType); err != nil {
369+
if err := c.recordContainer(ctx, pod, container, status, event.EventType, runtimeRisks); err != nil {
347370
lastErr = err
348371
}
349372
}
@@ -371,7 +394,7 @@ func (c *Controller) deploymentExists(ctx context.Context, namespace, name strin
371394
}
372395

373396
// recordContainer records a single container's deployment info.
374-
func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, container corev1.Container, status, eventType string) error {
397+
func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, container corev1.Container, status, eventType string, runtimeRisks []deploymentrecord.RuntimeRisk) error {
375398
dn := getARDeploymentName(pod, container, c.cfg.Template)
376399
digest := getContainerDigest(pod, container.Name)
377400

@@ -424,6 +447,7 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
424447
c.cfg.Cluster,
425448
status,
426449
dn,
450+
runtimeRisks,
427451
)
428452

429453
if err := c.apiClient.PostOne(ctx, record); err != nil {
@@ -457,6 +481,7 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
457481
"name", record.Name,
458482
"deployment_name", record.DeploymentName,
459483
"status", record.Status,
484+
"runtime_risks", record.RuntimeRisks,
460485
"digest", record.Digest,
461486
)
462487

@@ -473,6 +498,94 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
473498
return nil
474499
}
475500

501+
// aggregateMetadata returns aggregated metadata for a pod and its owners.
502+
func (c *Controller) aggregateMetadata(ctx context.Context, obj *metav1.PartialObjectMetadata) AggregatePodMetadata {
503+
aggMetadata := AggregatePodMetadata{
504+
RuntimeRisks: make(map[deploymentrecord.RuntimeRisk]bool),
505+
}
506+
queue := []*metav1.PartialObjectMetadata{obj}
507+
visited := make(map[types.UID]bool)
508+
509+
for len(queue) > 0 {
510+
current := queue[0]
511+
queue = queue[1:]
512+
513+
if visited[current.GetUID()] {
514+
slog.Warn("Already visited object, skipping to avoid cycles",
515+
"UID", current.GetUID(),
516+
"name", current.GetName(),
517+
)
518+
continue
519+
}
520+
visited[current.GetUID()] = true
521+
522+
extractMetadataFromObject(current, &aggMetadata)
523+
c.addOwnersToQueue(ctx, current, &queue)
524+
}
525+
526+
return aggMetadata
527+
}
528+
529+
// addOwnersToQueue takes a current object and looks up its owners, adding them to the queue for processing
530+
// to collect their metadata.
531+
func (c *Controller) addOwnersToQueue(ctx context.Context, current *metav1.PartialObjectMetadata, queue *[]*metav1.PartialObjectMetadata) {
532+
ownerRefs := current.GetOwnerReferences()
533+
534+
for _, owner := range ownerRefs {
535+
ownerObj, err := c.getOwnerMetadata(ctx, current.GetNamespace(), owner)
536+
if err != nil {
537+
slog.Warn("Failed to get owner object for metadata collection",
538+
"namespace", current.GetNamespace(),
539+
"owner_kind", owner.Kind,
540+
"owner_name", owner.Name,
541+
"error", err,
542+
)
543+
continue
544+
}
545+
546+
if ownerObj == nil {
547+
continue
548+
}
549+
550+
*queue = append(*queue, ownerObj)
551+
}
552+
}
553+
554+
// getOwnerMetadata retrieves partial object metadata for an owner ref.
555+
func (c *Controller) getOwnerMetadata(ctx context.Context, namespace string, owner metav1.OwnerReference) (*metav1.PartialObjectMetadata, error) {
556+
gvr := schema.GroupVersionResource{
557+
Group: "apps",
558+
Version: "v1",
559+
}
560+
561+
switch owner.Kind {
562+
case "ReplicaSet":
563+
gvr.Resource = "replicasets"
564+
case "Deployment":
565+
gvr.Resource = "deployments"
566+
default:
567+
slog.Debug("Unsupported owner kind for runtime risk collection",
568+
"kind", owner.Kind,
569+
"name", owner.Name,
570+
)
571+
return nil, nil
572+
}
573+
574+
obj, err := c.metadataClient.Resource(gvr).Namespace(namespace).Get(ctx, owner.Name, metav1.GetOptions{})
575+
if err != nil {
576+
if k8serrors.IsNotFound(err) {
577+
slog.Debug("Owner object not found for metadata collection",
578+
"namespace", namespace,
579+
"owner_kind", owner.Kind,
580+
"owner_name", owner.Name,
581+
)
582+
return nil, nil
583+
}
584+
return nil, err
585+
}
586+
return obj, nil
587+
}
588+
476589
func getCacheKey(dn, digest string) string {
477590
return dn + "||" + digest
478591
}
@@ -580,3 +693,26 @@ func getDeploymentName(pod *corev1.Pod) string {
580693
}
581694
return ""
582695
}
696+
697+
// extractMetadataFromObject extracts metadata from an object.
698+
func extractMetadataFromObject(obj *metav1.PartialObjectMetadata, aggMetadata *AggregatePodMetadata) {
699+
annotations := obj.GetAnnotations()
700+
if risks, exists := annotations[RuntimeRiskAnnotationKey]; exists {
701+
for _, risk := range strings.Split(risks, ",") {
702+
r := deploymentrecord.ValidateRuntimeRisk(risk)
703+
if r != "" {
704+
aggMetadata.RuntimeRisks[r] = true
705+
}
706+
}
707+
}
708+
}
709+
710+
func podToPartialMetadata(pod *corev1.Pod) *metav1.PartialObjectMetadata {
711+
return &metav1.PartialObjectMetadata{
712+
TypeMeta: metav1.TypeMeta{
713+
APIVersion: "v1",
714+
Kind: "Pod",
715+
},
716+
ObjectMeta: pod.ObjectMeta,
717+
}
718+
}

0 commit comments

Comments
 (0)