Skip to content

Commit b23ce6c

Browse files
authored
Merge pull request #45 from KestrelAI/datadog-workflow-queries
Add Datadog write operations for workflow builder
2 parents 7b8e3b0 + 99ef791 commit b23ce6c

55 files changed

Lines changed: 8500 additions & 2673 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Dockerfile

Lines changed: 84 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,76 +1,85 @@
1-
FROM golang:1.24-alpine AS builder
2-
3-
WORKDIR /app
4-
5-
# Copy the source code
6-
COPY . .
7-
8-
# Build the operator application
9-
WORKDIR /app
10-
RUN CGO_ENABLED=0 GOOS=linux go build -o /app/bin/client ./cmd/client
11-
12-
# Use a small alpine image for the final container
13-
FROM alpine:latest
14-
15-
# Install basic dependencies
16-
RUN apk --no-cache add \
17-
ca-certificates \
18-
bash \
19-
curl \
20-
wget \
21-
tar \
22-
gzip \
23-
jq
24-
25-
# Install kubectl (multi-arch)
26-
RUN ARCH=$(uname -m) && \
27-
if [ "$ARCH" = "x86_64" ]; then ARCH="amd64"; elif [ "$ARCH" = "aarch64" ]; then ARCH="arm64"; fi && \
28-
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/${ARCH}/kubectl" && \
29-
chmod +x kubectl && \
30-
mv kubectl /usr/local/bin/
31-
32-
# Install Cilium CLI (multi-arch)
33-
RUN ARCH=$(uname -m) && \
34-
if [ "$ARCH" = "x86_64" ]; then ARCH="amd64"; elif [ "$ARCH" = "aarch64" ]; then ARCH="arm64"; fi && \
35-
CILIUM_CLI_VERSION=$(curl -s https://raw.githubusercontent.com/cilium/cilium-cli/main/stable.txt) && \
36-
curl -L --fail --remote-name-all https://github.com/cilium/cilium-cli/releases/download/${CILIUM_CLI_VERSION}/cilium-linux-${ARCH}.tar.gz{,.sha256sum} && \
37-
sha256sum -c cilium-linux-${ARCH}.tar.gz.sha256sum && \
38-
tar xzvfC cilium-linux-${ARCH}.tar.gz /usr/local/bin && \
39-
rm cilium-linux-${ARCH}.tar.gz cilium-linux-${ARCH}.tar.gz.sha256sum
40-
41-
# Install Trivy (multi-arch)
42-
RUN ARCH=$(uname -m) && \
43-
if [ "$ARCH" = "x86_64" ]; then TRIVY_ARCH="64bit"; elif [ "$ARCH" = "aarch64" ]; then TRIVY_ARCH="ARM64"; fi && \
44-
TRIVY_VERSION=$(curl -s "https://api.github.com/repos/aquasecurity/trivy/releases/latest" | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/v//') && \
45-
wget https://github.com/aquasecurity/trivy/releases/download/v${TRIVY_VERSION}/trivy_${TRIVY_VERSION}_Linux-${TRIVY_ARCH}.tar.gz && \
46-
tar zxvf trivy_${TRIVY_VERSION}_Linux-${TRIVY_ARCH}.tar.gz && \
47-
mv trivy /usr/local/bin/ && \
48-
rm trivy_${TRIVY_VERSION}_Linux-${TRIVY_ARCH}.tar.gz
49-
50-
# Pre-download Trivy vulnerability database during build
51-
# This ensures the operator doesn't need internet access at runtime
52-
RUN mkdir -p /root/.cache/trivy && \
53-
trivy image --download-db-only --cache-dir /root/.cache/trivy && \
54-
chmod -R 755 /root/.cache/trivy
55-
56-
WORKDIR /app
57-
58-
# Set environment variables for Trivy to use offline mode with pre-downloaded DB
59-
ENV TRIVY_OFFLINE=true
60-
ENV TRIVY_CACHE_DIR=/root/.cache/trivy
61-
ENV TRIVY_DB_REPOSITORY=""
62-
63-
# Copy the binary from the builder stage
64-
COPY --from=builder /app/bin/client .
65-
66-
# Verify tools are installed and Trivy database is ready
67-
RUN kubectl version --client=true && \
68-
cilium version --client && \
69-
trivy --version && \
70-
echo "Testing Trivy offline mode..." && \
71-
trivy image --offline-scan --skip-db-update alpine:latest || echo "Trivy offline test completed (exit code expected for test image)" && \
72-
jq --version && \
73-
bash --version
74-
75-
# Run the client
1+
FROM golang:1.25-alpine AS builder
2+
3+
WORKDIR /app
4+
5+
# Copy the source code
6+
COPY . .
7+
8+
# Build the operator application
9+
WORKDIR /app
10+
RUN CGO_ENABLED=0 GOOS=linux go build -o /app/bin/client ./cmd/client
11+
12+
# Use a small alpine image for the final container
13+
FROM alpine:latest
14+
15+
# Install basic dependencies
16+
RUN apk --no-cache add \
17+
ca-certificates \
18+
bash \
19+
curl \
20+
wget \
21+
tar \
22+
gzip \
23+
jq
24+
25+
# Install kubectl (multi-arch) - pinned to v1.31.0 for reliability; --fail aborts the build on an HTTP error instead of saving the error page as the kubectl binary
26+
RUN ARCH=$(uname -m) && \
27+
if [ "$ARCH" = "x86_64" ]; then ARCH="amd64"; elif [ "$ARCH" = "aarch64" ]; then ARCH="arm64"; fi && \
28+
curl --fail -LO "https://dl.k8s.io/release/v1.31.0/bin/linux/${ARCH}/kubectl" && \
29+
chmod +x kubectl && \
30+
mv kubectl /usr/local/bin/
31+
32+
# Install Cilium CLI (multi-arch)
33+
RUN ARCH=$(uname -m) && \
34+
if [ "$ARCH" = "x86_64" ]; then ARCH="amd64"; elif [ "$ARCH" = "aarch64" ]; then ARCH="arm64"; fi && \
35+
CILIUM_CLI_VERSION=$(curl -s https://raw.githubusercontent.com/cilium/cilium-cli/main/stable.txt) && \
36+
curl -L --fail --remote-name-all https://github.com/cilium/cilium-cli/releases/download/${CILIUM_CLI_VERSION}/cilium-linux-${ARCH}.tar.gz{,.sha256sum} && \
37+
sha256sum -c cilium-linux-${ARCH}.tar.gz.sha256sum && \
38+
tar xzvfC cilium-linux-${ARCH}.tar.gz /usr/local/bin && \
39+
rm cilium-linux-${ARCH}.tar.gz cilium-linux-${ARCH}.tar.gz.sha256sum
40+
41+
# Install Trivy (multi-arch)
42+
RUN ARCH=$(uname -m) && \
43+
if [ "$ARCH" = "x86_64" ]; then TRIVY_ARCH="64bit"; elif [ "$ARCH" = "aarch64" ]; then TRIVY_ARCH="ARM64"; fi && \
44+
TRIVY_VERSION=$(curl -s "https://api.github.com/repos/aquasecurity/trivy/releases/latest" | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/v//') && \
45+
wget https://github.com/aquasecurity/trivy/releases/download/v${TRIVY_VERSION}/trivy_${TRIVY_VERSION}_Linux-${TRIVY_ARCH}.tar.gz && \
46+
tar zxvf trivy_${TRIVY_VERSION}_Linux-${TRIVY_ARCH}.tar.gz && \
47+
mv trivy /usr/local/bin/ && \
48+
rm trivy_${TRIVY_VERSION}_Linux-${TRIVY_ARCH}.tar.gz
49+
50+
# Install Helm (multi-arch)
51+
RUN ARCH=$(uname -m) && \
52+
if [ "$ARCH" = "x86_64" ]; then ARCH="amd64"; elif [ "$ARCH" = "aarch64" ]; then ARCH="arm64"; fi && \
53+
curl -fsSL https://get.helm.sh/helm-v3.16.3-linux-${ARCH}.tar.gz -o helm.tar.gz && \
54+
tar xzf helm.tar.gz && \
55+
mv linux-${ARCH}/helm /usr/local/bin/ && \
56+
rm -rf helm.tar.gz linux-${ARCH}
57+
58+
# Pre-download Trivy vulnerability database during build
59+
# This ensures the operator doesn't need internet access at runtime
60+
RUN mkdir -p /root/.cache/trivy && \
61+
trivy image --download-db-only --cache-dir /root/.cache/trivy && \
62+
chmod -R 755 /root/.cache/trivy
63+
64+
WORKDIR /app
65+
66+
# Set environment variables for Trivy to use offline mode with pre-downloaded DB
67+
ENV TRIVY_OFFLINE=true
68+
ENV TRIVY_CACHE_DIR=/root/.cache/trivy
69+
ENV TRIVY_DB_REPOSITORY=""
70+
71+
# Copy the binary from the builder stage
72+
COPY --from=builder /app/bin/client .
73+
74+
# Verify tools are installed and Trivy database is ready. The Trivy smoke test and its fallback are grouped in a subshell: `&&` and `||` have equal precedence in sh, so an ungrouped `|| echo` would also swallow a failure of the kubectl/cilium/helm/trivy checks before it.
75+
RUN kubectl version --client=true && \
76+
cilium version --client && \
77+
helm version --short && \
78+
trivy --version && \
79+
echo "Testing Trivy offline mode..." && \
80+
( trivy image --offline-scan --skip-db-update alpine:latest || echo "Trivy offline test completed (exit code expected for test image)" ) && \
81+
jq --version && \
82+
bash --version
83+
84+
# Run the client
7685
CMD ["/app/client"]

api/cloud/v1/message.proto

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ package cloud.v1;
44

55
import "google/protobuf/timestamp.proto";
66

7-
option go_package = "operator/api/cloud/v1";
7+
option go_package = "operator/api/gen/cloud/v1";
88

99
// Action types for resource operations
1010
enum Action {
@@ -44,6 +44,10 @@ message StreamDataRequest {
4444
MetricsQueryResponse metrics_query_response = 24;
4545
// Operator resource usage stats for Grafana dashboards
4646
OperatorStats operator_stats = 25;
47+
// Datadog query response (fallback when sent on data stream)
48+
DatadogQueryResponse datadog_query_response = 26;
49+
// Datadog monitor alert (emitted when a monitor transitions state)
50+
DatadogMonitorAlert datadog_monitor_alert = 27;
4751
}
4852
}
4953

@@ -139,6 +143,8 @@ message StreamDataResponse {
139143
MetricsQueryRequest metrics_query_request = 9;
140144
// Heartbeat keepalive to prevent LB idle-stream timeouts
141145
Heartbeat heartbeat = 10;
146+
// Datadog query request (fallback for operators without control stream)
147+
DatadogQueryRequest datadog_query_request = 11;
142148
}
143149
}
144150

@@ -236,6 +242,8 @@ message InventoryCommit {
236242
// True if the operator detected a Datadog installation in the cluster.
237243
// Old operators send false (default) — the server treats it as "no Datadog".
238244
bool has_datadog = 1;
245+
// True if the operator detected an ArgoCD installation in the cluster.
246+
bool has_argocd = 2;
239247
}
240248

241249
// Network policy definition matching Kubernetes NetworkPolicy format
@@ -1313,15 +1321,27 @@ message DatadogQueryRequest {
13131321
int32 max_results = 8; // limit returned items (0 = API default)
13141322

13151323
int32 timeout_seconds = 9;
1324+
1325+
// JSON body for write operations (create-monitor, send-event, mute-monitor).
1326+
// Ignored for read queries.
1327+
string json_body = 10;
13161328
}
13171329

13181330
enum DatadogQueryType {
13191331
DATADOG_QUERY_UNSPECIFIED = 0;
1320-
DATADOG_QUERY_METRICS = 1; // GET /api/v1/query
1321-
DATADOG_QUERY_EVENTS = 2; // GET /api/v1/events
1322-
DATADOG_QUERY_HOSTS = 3; // GET /api/v1/hosts
1323-
DATADOG_QUERY_LOGS = 4; // POST /api/v2/logs/events/search
1324-
DATADOG_QUERY_LIST_METRICS = 5; // GET /api/v1/metrics — list available metric names
1332+
DATADOG_QUERY_METRICS = 1; // GET /api/v1/query
1333+
DATADOG_QUERY_EVENTS = 2; // GET /api/v1/events
1334+
DATADOG_QUERY_HOSTS = 3; // GET /api/v1/hosts
1335+
DATADOG_QUERY_LOGS = 4; // POST /api/v2/logs/events/search
1336+
DATADOG_QUERY_LIST_METRICS = 5; // GET /api/v1/metrics
1337+
DATADOG_CREATE_MONITOR = 6; // POST /api/v1/monitor
1338+
DATADOG_SEND_EVENT = 7; // POST /api/v1/events
1339+
DATADOG_MUTE_MONITOR = 8; // POST /api/v1/monitor/{id}/mute
1340+
DATADOG_LIST_MONITORS = 9; // GET /api/v1/monitor
1341+
ARGOCD_SYNC_APP = 10; // POST /api/v1/applications/{name}/sync
1342+
ARGOCD_GET_APP_STATUS = 11; // GET /api/v1/applications/{name}
1343+
ARGOCD_LIST_APPS = 12; // GET /api/v1/applications
1344+
ARGOCD_ROLLBACK_APP = 13; // POST /api/v1/applications/{name}/rollback
13251345
}
13261346

13271347
// Datadog query response from operator to server.
@@ -1415,13 +1435,27 @@ message StreamFlowsResponse {
14151435
}
14161436
}
14171437

1438+
// DatadogMonitorAlert is emitted by the operator when a Datadog monitor transitions state.
1439+
message DatadogMonitorAlert {
1440+
string monitor_id = 1;
1441+
string monitor_name = 2;
1442+
string current_status = 3; // "Alert", "Warn", "OK", "No Data"
1443+
string previous_status = 4;
1444+
string monitor_type = 5; // "metric alert", "service check", etc.
1445+
string query = 6;
1446+
string message = 7;
1447+
repeated string tags = 8;
1448+
int64 timestamp = 9;
1449+
}
1450+
14181451
// StreamEventsRequest carries event/status/condition data from operator to server.
14191452
message StreamEventsRequest {
14201453
oneof request {
14211454
KubernetesEvent kubernetes_event = 1;
14221455
PodStatusChange pod_status_change = 2;
14231456
NodeConditionChange node_condition_change = 3;
14241457
WorkloadRolloutStatus workload_rollout_status = 4;
1458+
DatadogMonitorAlert datadog_monitor_alert = 5;
14251459
}
14261460
}
14271461

0 commit comments

Comments
 (0)