Skip to content

Commit 726ae6f

Browse files
authored
chore: configure alerting and monitoring (anomalyco#25857)
1 parent 773078e commit 726ae6f

13 files changed

Lines changed: 471 additions & 0 deletions

File tree

.github/workflows/deploy.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ jobs:
3636
PLANETSCALE_SERVICE_TOKEN_NAME: ${{ secrets.PLANETSCALE_SERVICE_TOKEN_NAME }}
3737
PLANETSCALE_SERVICE_TOKEN: ${{ secrets.PLANETSCALE_SERVICE_TOKEN }}
3838
STRIPE_SECRET_KEY: ${{ github.ref_name == 'production' && secrets.STRIPE_SECRET_KEY_PROD || secrets.STRIPE_SECRET_KEY_DEV }}
39+
HONEYCOMB_API_KEY: ${{ secrets.HONEYCOMB_API_KEY }}
40+
INCIDENT_API_KEY: ${{ secrets.INCIDENT_API_KEY }}
3941
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
4042
SENTRY_ORG: ${{ vars.SENTRY_ORG }}
4143
SENTRY_PROJECT: ${{ vars.WEB_SENTRY_PROJECT }}

bun.lock

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

infra/console.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,9 @@ const AUTH_API_URL = new sst.Linkable("AUTH_API_URL", {
221221
// Linkable wrapping `stripeWebhook.secret` under the name "STRIPE_WEBHOOK_SECRET".
const STRIPE_WEBHOOK_SECRET = new sst.Linkable("STRIPE_WEBHOOK_SECRET", {
  properties: { value: stripeWebhook.secret },
})
// SST secrets added for incident alerting. Their values are set outside this file;
// presumably the signing secret verifies incoming incident webhooks and the URL is
// a Discord webhook that incidents are forwarded to — confirm against the consumer.
const INCIDENT_WEBHOOK_SIGNING_SECRET = new sst.Secret("INCIDENT_WEBHOOK_SIGNING_SECRET")
const DISCORD_INCIDENT_WEBHOOK_URL = new sst.Secret("DISCORD_INCIDENT_WEBHOOK_URL")

// Cloudflare KV namespace registered as "GatewayKv".
const gatewayKv = new sst.cloudflare.Kv("GatewayKv")
225228

226229
////////////////
@@ -251,6 +254,8 @@ new sst.cloudflare.x.SolidStart("Console", {
251254
database,
252255
AUTH_API_URL,
253256
STRIPE_WEBHOOK_SECRET,
257+
INCIDENT_WEBHOOK_SIGNING_SECRET,
258+
DISCORD_INCIDENT_WEBHOOK_URL,
254259
STRIPE_SECRET_KEY,
255260
EMAILOCTOPUS_API_KEY,
256261
AWS_SES_ACCESS_KEY_ID,

infra/monitoring.ts

Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
/**
 * Turns a hyphenated slug into a human-readable title.
 *
 * Each "-"-separated word gets its first character upper-cased and the words
 * are joined with single spaces. A space that sits directly between two digits
 * is then replaced by a dot, so hyphenated version numbers read naturally
 * (e.g. "claude-opus-4-7" -> "Claude Opus 4.7").
 *
 * @param s - Hyphen-separated identifier such as a model id or product name.
 * @returns The title-cased display form of `s`.
 */
const displayName = (s: string): string =>
  s
    .split("-")
    .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
    .join(" ")
    // Lookbehind/lookahead keep the digits out of the match, so only the space is replaced.
    .replace(/(?<=\d) (?=\d)/g, ".")
7+
8+
/**
 * Builds a resource name from a slug: the slug is first passed through
 * `displayName`, then every character that is not an ASCII letter or digit
 * is removed.
 *
 * @param s - Hyphen-separated identifier.
 * @returns The display name of `s` reduced to ASCII letters and digits.
 */
const resourceName = (s: string) => {
  const title = displayName(s)
  return title.replace(/[^a-zA-Z0-9]/g, "")
}
9+
10+
/**
 * Serialises a small rich-text document that contains a single variable
 * reference: a "doc" node wrapping one "paragraph" node wrapping one
 * "varSpec" node.
 *
 * The result is used as the `literal` value of the incident.io alert source
 * and alert route templates in this file.
 *
 * @param label - Human-readable label stored in the varSpec node's attrs.
 * @param name - Variable reference stored in the varSpec node's attrs
 *   (e.g. "title" or "alert.title").
 * @returns The JSON produced by `$jsonStringify` for that document.
 */
const varSpec = (label: string, name: string) => {
  // Key order is kept as-is so the serialised JSON does not change.
  const variable = {
    attrs: {
      name,
      label,
      missing: false,
    },
    type: "varSpec",
  }
  const paragraph = {
    content: [variable],
    type: "paragraph",
  }
  return $jsonStringify({
    content: [paragraph],
    type: "doc",
  })
}
29+
30+
// incident.io alert attributes, looked up by name. Their `id` and `type` are
// referenced by the alert source template and the alert route grouping keys below.
// NOTE(review): these are lookups, not creations — presumably the "Model" and
// "Product" attributes must already exist in incident.io; confirm.
const fields = {
  model: incident.getAlertAttributeOutput({ name: "Model" }),
  product: incident.getAlertAttributeOutput({ name: "Product" }),
}
34+
35+
// incident.io alert source of type "honeycomb". Non-production stages get the
// stage name appended to the display name.
const alertSource = new incident.AlertSource("HoneycombAlertSource", {
  name: $app.stage === "production" ? "Honeycomb" : `Honeycomb (${$app.stage})`,
  sourceType: "honeycomb",
  template: {
    // Alert title/description are taken from the payload's `title` and `description`.
    title: {
      literal: varSpec("Payload -> Title", "title"),
    },
    description: {
      literal: varSpec("Payload -> Description", "description"),
    },
    // Bind the "Model" and "Product" alert attributes to the expressions defined below.
    attributes: [
      {
        alertAttributeId: fields.model.id,
        binding: {
          value: {
            reference: 'expressions["model"]',
          },
          mergeStrategy: "first_wins",
        },
      },
      {
        alertAttributeId: fields.product.id,
        binding: {
          value: {
            reference: 'expressions["product"]',
          },
          mergeStrategy: "first_wins",
        },
      },
    ],
    // Each expression parses one top-level key of the payload ($['model'] / $['product'])
    // into a single (non-array) value of the matching attribute's type.
    expressions: [
      {
        label: "Model",
        operations: [
          {
            operationType: "parse",
            parse: {
              returns: {
                array: false,
                type: fields.model.type,
              },
              source: "$['model']",
            },
          },
        ],
        reference: "model",
        rootReference: "payload",
      },
      {
        label: "Product",
        operations: [
          {
            operationType: "parse",
            parse: {
              returns: {
                array: false,
                type: fields.product.type,
              },
              source: "$['product']",
            },
          },
        ],
        reference: "product",
        rootReference: "payload",
      },
    ],
  },
})
103+
104+
// Honeycomb webhook recipient that posts to the incident.io alert source's
// events URL, using the alert source's secret token as the webhook secret.
const webhookRecipient = new honeycomb.WebhookRecipient("IncidentWebhook", {
  name: $app.stage === "production" ? "Incident.io" : `Incident.io (${$app.stage})`,
  url: alertSource.alertEventsUrl,
  secret: alertSource.secretToken,
  templates: [
    {
      type: "trigger",
      // Body sent for trigger notifications. The `{{ ... }}` placeholders are
      // presumably substituted by Honeycomb when the trigger fires. `model` and
      // `product` come from the recipient variables declared below and are the
      // payload keys the alert source parses.
      body: $jsonStringify({
        title: "{{ .Name }}",
        description: "{{ .Description }}",
        status: "{{ .Alert.Status }}",
        deduplication_key: "{{ .Alert.InstanceID }}",
        source_url: "{{ .Result.URL }}",
        model: "{{ .Vars.model }}",
        product: "{{ .Vars.product }}",
      }),
    },
  ],
  // Variables each trigger supplies a value for when it attaches this recipient.
  variables: [
    {
      name: "model",
    },
    {
      name: "product",
    },
  ],
})
131+
132+
// incident.io alert route for alerts coming from the Honeycomb alert source.
// Non-production stages get the stage name appended to the display name.
new incident.AlertRoute("HoneycombAlertRoute", {
  name: $app.stage === "production" ? "Honeycomb" : `Honeycomb (${$app.stage})`,
  enabled: true,
  isPrivate: false,
  // Accept alerts from the Honeycomb alert source whenever the alert has a title.
  alertSources: [
    {
      alertSourceId: alertSource.id,
      conditionGroups: [
        {
          conditions: [
            {
              subject: "alert.title",
              operation: "is_set",
              paramBindings: [],
            },
          ],
        },
      ],
    },
  ],
  // Route-level condition: same "title is set" check.
  conditionGroups: [
    {
      conditions: [
        {
          subject: "alert.title",
          operation: "is_set",
          paramBindings: [],
        },
      ],
    },
  ],
  expressions: [],
  // No escalation targets are configured.
  escalationConfig: {
    autoCancelEscalations: true,
    escalationTargets: [],
  },
  incidentConfig: {
    autoDeclineEnabled: true,
    enabled: true,
    conditionGroups: [],
    deferTimeSeconds: 0,
    // Alerts are grouped by their Model and Product attributes
    // within a 900-second window.
    groupingKeys: [
      {
        reference: $interpolate`alert.attributes.${fields.model.id}`,
      },
      {
        reference: $interpolate`alert.attributes.${fields.product.id}`,
      },
    ],
    groupingWindowSeconds: 900,
  },
  incidentTemplate: {
    // Incident name and summary mirror the alert's title and description.
    name: {
      value: {
        literal: varSpec("Alert -> Title", "alert.title"),
      },
    },
    summary: {
      value: {
        literal: varSpec("Alert -> Description", "alert.description"),
      },
    },
    startInTriage: {
      value: {
        literal: "true",
      },
    },
    // NOTE(review): hyphenated "first-wins" here, while the alert source attribute
    // bindings use "first_wins" — confirm against the provider schema before normalising.
    severity: {
      mergeStrategy: "first-wins",
    },
    // Only the production stage uses the "standard" incident mode; other stages use "test".
    incidentMode: {
      value: {
        literal: $app.stage === "production" ? "standard" : "test",
      },
    },
  },
})
209+
210+
/** Product a model is monitored under. */
type Product = "go" | "zen"

/**
 * Factory describing one Honeycomb trigger for a given model/product pair.
 *
 * The returned spec carries:
 * - `id`: slug combined with the product and model id to name the resource
 * - `title` / `description`: used as the trigger's name and description
 * - `json`: query specification passed to `honeycomb.getQuerySpecificationOutput`
 * - `threshold`: comparison operator and value the trigger alerts on
 * - `baseline`: comparison offset in seconds (3600 or 86400); it is divided
 *   by 60 to produce the trigger's `offsetMinutes`
 */
type Trigger = (opts: { model: string; product: Product }) => {
  id: string
  title: string
  description: string
  json: honeycomb.GetQuerySpecificationOutputArgs
  threshold: { op: ">=" | "<="; value: number }
  baseline: 3600 | 86400
}

/** A monitored model: its id, the products it is monitored under, and the triggers to create per product. */
type Model = { id: string; products: Product[]; triggers: Trigger[] }
222+
223+
/**
 * Trigger spec for an increased HTTP error rate of one model on one product.
 *
 * The query counts all matching events (TOTAL) and those with
 * status >= 400 other than 401 (FAILED), and derives ERROR = FAILED / TOTAL.
 *
 * @param opts.model - Model id to filter on; also used in the title/description.
 * @param opts.product - Product; "go" maps to isGoTier = "true", anything else to "false".
 * @returns The trigger spec (id, title, description, query, threshold, baseline).
 */
const httpErrors: Trigger = ({ model, product }) => {
  // Both calculations are scoped to the same model and product tier.
  const scope = [
    { column: "model", op: "=", value: model },
    { column: "isGoTier", op: "=", value: product === "go" ? "true" : "false" },
  ]

  return {
    id: "increased-http-errors",
    title: `Increased HTTP Errors for ${displayName(model)} on ${displayName(product)}`,
    description: `Detected increased rate of HTTP errors for ${displayName(model)} on OpenCode ${displayName(product)}`,
    json: {
      calculations: [
        {
          op: "COUNT",
          name: "TOTAL",
          filterCombination: "AND",
          filters: scope,
        },
        {
          op: "COUNT",
          name: "FAILED",
          filterCombination: "AND",
          filters: [
            ...scope,
            // Any status >= 400 counts as failed, except 401.
            { column: "status", op: ">=", value: "400" },
            { column: "status", op: "!=", value: "401" },
          ],
        },
      ],
      formulas: [{ name: "ERROR", expression: "$FAILED / $TOTAL" }],
      timeRange: 900,
    },
    // Alert when errors surge 50% compared to the previous period
    threshold: { op: ">=", value: 50 },
    // Previous time period to compare against, in seconds (3600 = one hour)
    baseline: 3600,
  }
}
258+
259+
// Models to monitor, with the products each is monitored under and the triggers
// to create for every model/product pair. Commented-out entries are kept for
// reference and create no triggers.
const models: Model[] = [
  { id: "kimi-k2.6", products: ["go", "zen"], triggers: [httpErrors] },
  { id: "kimi-k2.5", products: ["go", "zen"], triggers: [httpErrors] },
  { id: "deepseek-v4-flash", products: ["go", "zen"], triggers: [httpErrors] },
  { id: "deepseek-v4-pro", products: ["go", "zen"], triggers: [httpErrors] },
  { id: "glm-5.1", products: ["go", "zen"], triggers: [httpErrors] },
  // { id: "glm-5", products: ["go"], triggers: [httpErrors] },
  { id: "qwen3.6-plus", products: ["go", "zen"], triggers: [httpErrors] },
  { id: "qwen3.5-plus", products: ["go"], triggers: [httpErrors] },
  { id: "minimax-m2.7", products: ["go", "zen"], triggers: [httpErrors] },
  // { id: "minimax-m2.5", products: ["go", "zen"], triggers: [httpErrors] },
  { id: "mimo-v2.5-pro", products: ["go"], triggers: [httpErrors] },
  // { id: "mimo-v2.5", products: ["go"], triggers: [httpErrors] },
  // { id: "mimo-v2-omni", products: ["go"], triggers: [httpErrors] },
  // { id: "mimo-v2-pro", products: ["go"], triggers: [httpErrors] },
  { id: "claude-opus-4-7", products: ["zen"], triggers: [httpErrors] },
  // { id: "claude-opus-4-6", products: ["zen"], triggers: [httpErrors] },
  // { id: "claude-sonnet-4-6", products: ["zen"], triggers: [httpErrors] },
  { id: "gpt-5.5", products: ["zen"], triggers: [httpErrors] },
  { id: "big-pickle", products: ["zen"], triggers: [httpErrors] },
  // { id: "minimax-m2.5-free", products: ["zen"], triggers: [httpErrors] },
  // { id: "hy3-preview-free", products: ["zen"], triggers: [httpErrors] },
  // { id: "nemotron-3-super-free", products: ["zen"], triggers: [httpErrors] },
  // { id: "trinity-large-preview-free", products: ["zen"], triggers: [httpErrors] },
  // { id: "ling-2.6-flash-free", products: ["zen"], triggers: [httpErrors] },
]
285+
286+
// Outside production keep only the first model in the list, so non-production
// stages create triggers for a single model. `splice(1)` trims `models` in place.
if ($app.stage !== "production") {
  models.splice(1)
}
289+
290+
// Create one Honeycomb trigger per (model, product, trigger factory) combination.
for (const model of models) {
  for (const product of model.products) {
    for (const trigger of model.triggers) {
      const spec = trigger({ model: model.id, product })

      // The resource name combines the trigger id, product and model id
      // and is reduced to alphanumerics by `resourceName`.
      new honeycomb.Trigger(resourceName(`${spec.id}-${product}-${model.id}`), {
        name: spec.title,
        description: spec.description,
        queryJson: honeycomb.getQuerySpecificationOutput(spec.json).json,
        alertType: "on_change",
        // This is the minimum when using % change detection
        frequency: 900,
        // `spec.baseline` is in seconds; the trigger takes the offset in minutes.
        baselineDetails: [{ type: "percentage", offsetMinutes: spec.baseline / 60 }],
        thresholds: [{ ...spec.threshold, exceededLimit: 1 }],
        recipients: [
          {
            id: webhookRecipient.id,
            notificationDetails: [
              {
                // Values for the webhook recipient's `model` / `product` variables.
                variables: [
                  { name: "model", value: model.id },
                  { name: "product", value: product },
                ],
              },
            ],
          },
        ],
      })
    }
  }
}

0 commit comments

Comments
 (0)