Skip to content

Commit 3df38d6

Browse files
khewonc, coignetp, levan-m
authored
Refactor experiment signals (#2944)
* refactor
* review fixes
* Review suggestions
* add cluster uid tag
* Fix version check
* Update default versions
* separate goroutine for acks
* Review suggestions
* Simplify refactor
* fix go.mod
* skip checking experiment ID on promote signal
* exclude fleet.datadoghq.com annotation from controller revision

---------

Co-authored-by: Paul Coignet <paul.coignet@datadoghq.com>
Co-authored-by: levan-m <116471169+levan-m@users.noreply.github.com>
Co-authored-by: Levan Machablishvili <levan.machablishvili@datadoghq.com>
1 parent 7f86fa0 commit 3df38d6

20 files changed

Lines changed: 3068 additions & 1103 deletions

api/datadoghq/v2alpha1/const.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,29 @@ const (
1313
// DefaultAPIKeyKey default api-key key (use in secret for instance).
1414
DefaultAPIKeyKey = "api_key"
1515
)
16+
17+
// Experiment signal annotations. The fleet daemon writes these annotations to
18+
// request state transitions; the reconciler clears them after processing.
19+
const (
20+
// AnnotationExperimentID is the annotation key for the experiment signal ID.
21+
AnnotationExperimentID = "experiment.datadoghq.com/id"
22+
// AnnotationExperimentSignal is the annotation key for the experiment signal type.
23+
AnnotationExperimentSignal = "experiment.datadoghq.com/signal"
24+
)
25+
26+
// Fleet pending-operation annotations. The fleet daemon writes these
27+
// annotations to durably track the single in-flight async operation for a DDA
28+
// so it can recover after restarts.
29+
const (
30+
// AnnotationPendingTaskID is the RC updater task ID currently awaiting completion.
31+
AnnotationPendingTaskID = "fleet.datadoghq.com/pending-task-id"
32+
// AnnotationPendingAction is the pending fleet action, such as start/stop/promote.
33+
AnnotationPendingAction = "fleet.datadoghq.com/pending-action"
34+
// AnnotationPendingExperimentID is the stable experiment identity (`params.version`).
35+
AnnotationPendingExperimentID = "fleet.datadoghq.com/pending-experiment-id"
36+
// AnnotationPendingPackage is the RC package whose Task.State/config versions should be updated.
37+
AnnotationPendingPackage = "fleet.datadoghq.com/pending-package"
38+
// AnnotationPendingResultVersion is the RC config version to write on
39+
// success when it differs from the experiment identity, such as promote.
40+
AnnotationPendingResultVersion = "fleet.datadoghq.com/pending-result-version"
41+
)

api/datadoghq/v2alpha1/datadogagent_types.go

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2427,33 +2427,45 @@ type RemoteConfigConfiguration struct {
24272427
}
24282428

24292429
// ExperimentPhase is the lifecycle phase of a Fleet Automation experiment.
2430-
// +kubebuilder:validation:Enum=running;stopped;rollback;timeout;promoted;aborted
2430+
// +kubebuilder:validation:Enum=running;terminated;promoted;aborted
24312431
type ExperimentPhase string
24322432

24332433
const (
2434-
// ExperimentPhaseRunning is set by RC when an experiment starts (startExperiment).
2434+
// ExperimentPhaseRunning is set by the reconciler when it processes a start signal.
24352435
ExperimentPhaseRunning ExperimentPhase = "running"
2436-
// ExperimentPhaseStopped is set by RC to request a rollback (stopExperiment).
2437-
ExperimentPhaseStopped ExperimentPhase = "stopped"
2438-
// ExperimentPhaseRollback is set by the operator after processing a stopped signal and restoring the previous spec.
2439-
ExperimentPhaseRollback ExperimentPhase = "rollback"
2440-
// ExperimentPhaseTimeout is set by the operator when the experiment exceeds the timeout and is auto-rolled back.
2441-
ExperimentPhaseTimeout ExperimentPhase = "timeout"
2442-
// ExperimentPhasePromoted is set by RC when an experiment succeeds (promoteExperiment).
2436+
// ExperimentPhaseTerminated is set by the reconciler after restoring the previous spec,
2437+
// either due to an explicit rollback signal or a timeout. The TerminationReason field
2438+
// distinguishes the cause.
2439+
ExperimentPhaseTerminated ExperimentPhase = "terminated"
2440+
// ExperimentPhasePromoted is set by the reconciler when a promote signal is processed.
24432441
ExperimentPhasePromoted ExperimentPhase = "promoted"
2444-
// ExperimentPhaseAborted is set by the operator when a manual spec change is detected during a running experiment.
2442+
// ExperimentPhaseAborted is set by the reconciler when a manual spec change is detected during a running experiment.
24452443
ExperimentPhaseAborted ExperimentPhase = "aborted"
24462444
)
24472445

2446+
// Experiment signal values written to the AnnotationExperimentSignal annotation.
2447+
const (
2448+
// ExperimentSignalStart requests a new experiment to begin.
2449+
ExperimentSignalStart = "start"
2450+
// ExperimentSignalRollback requests the current experiment to roll back.
2451+
ExperimentSignalRollback = "rollback"
2452+
// ExperimentSignalPromote requests the current experiment to be promoted.
2453+
ExperimentSignalPromote = "promote"
2454+
)
2455+
24482456
// ExperimentStatus defines the state of a Fleet Automation experiment.
24492457
// +k8s:openapi-gen=true
24502458
type ExperimentStatus struct {
24512459
// Phase is the current state of the experiment.
24522460
// +optional
24532461
Phase ExperimentPhase `json:"phase,omitempty"`
2454-
// ID is the unique experiment ID sent by Fleet Automation.
2462+
// ID is the RC task ID that triggered this experiment state.
24552463
// +optional
24562464
ID string `json:"id,omitempty"`
2465+
// TerminationReason distinguishes why the experiment was terminated.
2466+
// Only set when Phase is "terminated".
2467+
// +optional
2468+
TerminationReason string `json:"terminationReason,omitempty"`
24572469
}
24582470

24592471
// DatadogAgentStatus defines the observed state of DatadogAgent.

api/datadoghq/v2alpha1/zz_generated.openapi.go

Lines changed: 8 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,6 @@ func setupAndStartHelmMetadataForwarder(logger logr.Logger, mgr manager.Manager,
651651
}
652652

653653
func setupFleetDaemon(logger logr.Logger, mgr manager.Manager, rcClient remoteconfig.RCClient, revisionsEnabled bool) error {
654-
daemon := fleet.NewDaemon(rcClient, mgr.GetClient(), revisionsEnabled)
654+
daemon := fleet.NewDaemon(rcClient, mgr, revisionsEnabled)
655655
return mgr.Add(daemon)
656656
}

config/crd/bases/v1/datadoghq.com_datadogagents.yaml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8767,18 +8767,21 @@ spec:
87678767
description: Experiment tracks the state of an active or recent Fleet Automation experiment.
87688768
properties:
87698769
id:
8770-
description: ID is the unique experiment ID sent by Fleet Automation.
8770+
description: ID is the RC task ID that triggered this experiment state.
87718771
type: string
87728772
phase:
87738773
description: Phase is the current state of the experiment.
87748774
enum:
87758775
- running
8776-
- stopped
8777-
- rollback
8778-
- timeout
8776+
- terminated
87798777
- promoted
87808778
- aborted
87818779
type: string
8780+
terminationReason:
8781+
description: |-
8782+
TerminationReason distinguishes why the experiment was terminated.
8783+
Only set when Phase is "terminated".
8784+
type: string
87828785
type: object
87838786
otelAgentGateway:
87848787
description: The actual state of the OTel Agent Gateway as a deployment.

config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8456,20 +8456,22 @@
84568456
"description": "Experiment tracks the state of an active or recent Fleet Automation experiment.",
84578457
"properties": {
84588458
"id": {
8459-
"description": "ID is the unique experiment ID sent by Fleet Automation.",
8459+
"description": "ID is the RC task ID that triggered this experiment state.",
84608460
"type": "string"
84618461
},
84628462
"phase": {
84638463
"description": "Phase is the current state of the experiment.",
84648464
"enum": [
84658465
"running",
8466-
"stopped",
8467-
"rollback",
8468-
"timeout",
8466+
"terminated",
84698467
"promoted",
84708468
"aborted"
84718469
],
84728470
"type": "string"
8471+
},
8472+
"terminationReason": {
8473+
"description": "TerminationReason distinguishes why the experiment was terminated.\nOnly set when Phase is \"terminated\".",
8474+
"type": "string"
84738475
}
84748476
},
84758477
"type": "object"

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ require (
6868
github.com/samber/lo v1.52.0
6969
golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8
7070
golang.org/x/text v0.35.0
71+
google.golang.org/protobuf v1.36.11
7172
helm.sh/helm/v3 v3.20.2
7273
k8s.io/kubectl v0.35.3
7374
k8s.io/utils v0.0.0-20251222233032-718f0e51e6d2
@@ -280,7 +281,6 @@ require (
280281
google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect
281282
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect
282283
google.golang.org/grpc v1.79.3 // indirect
283-
google.golang.org/protobuf v1.36.11 // indirect
284284
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
285285
gopkg.in/inf.v0 v0.9.1 // indirect
286286
gotest.tools/v3 v3.5.1 // indirect

0 commit comments

Comments
 (0)