Skip to content

Commit ad2f25c

Browse files
djslyCopilot
andauthored
feat: add CSE timing regression tests for all Linux VHDs (Ubuntu 22.04/24.04, Azure Linux V3) (#8284)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 0567b7c commit ad2f25c

5 files changed

Lines changed: 773 additions & 2 deletions

File tree

e2e/cse_timing.go

Lines changed: 377 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,377 @@
1+
package e2e
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"sort"
8+
"strings"
9+
"testing"
10+
"time"
11+
12+
"github.com/Azure/agentbaker/e2e/toolkit"
13+
)
14+
15+
const (
16+
// cseEventsDir is the directory where CSE task timing events are stored on the VM.
17+
// This matches EVENTS_LOGGING_DIR defined in both cse_helpers.sh and cse_start.sh.
18+
// Events are written directly here (not in per-handler subdirectories) — each file
19+
// is a single-line JSON object named <epoch-ms>.json.
20+
cseEventsDir = "/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/"
21+
// provisionJSONPath is the path to the provision.json file with overall boot timing.
22+
provisionJSONPath = "/var/log/azure/aks/provision.json"
23+
)
24+
25+
// CSETaskTiming represents the timing of a single CSE task.
26+
type CSETaskTiming struct {
27+
TaskName string
28+
StartTime time.Time
29+
EndTime time.Time
30+
Duration time.Duration
31+
Message string
32+
}
33+
34+
// CSEProvisionTiming represents the overall provisioning timing from provision.json.
35+
type CSEProvisionTiming struct {
36+
ExitCode string `json:"ExitCode"`
37+
ExecDuration string `json:"ExecDuration"`
38+
KernelStartTime string `json:"KernelStartTime"`
39+
CloudInitLocalStart string `json:"CloudInitLocalStartTime"`
40+
CloudInitStart string `json:"CloudInitStartTime"`
41+
CloudFinalStart string `json:"CloudFinalStartTime"`
42+
CSEStartTime string `json:"CSEStartTime"`
43+
GuestAgentStartTime string `json:"GuestAgentStartTime"`
44+
SystemdSummary string `json:"SystemdSummary"`
45+
BootDatapoints json.RawMessage `json:"BootDatapoints"`
46+
}
47+
48+
// CSETimingReport holds all parsed timing data from a VM.
49+
type CSETimingReport struct {
50+
Tasks []CSETaskTiming
51+
Provision *CSEProvisionTiming
52+
taskIndex map[string]*CSETaskTiming
53+
}
54+
55+
// cseEventJSON matches the JSON structure written by logs_to_events() in cse_helpers.sh.
56+
// Despite its name, OperationId stores the task *end* timestamp (not a GUID).
57+
// This is by design: GA (Guest Agent) requires specific field names, and OperationId
58+
// was repurposed to carry the end time. See cse_helpers.sh logs_to_events():
59+
//
60+
// --arg Timestamp "${startTime}"
61+
// --arg OperationId "${endTime}"
62+
type cseEventJSON struct {
63+
Timestamp string `json:"Timestamp"`
64+
OperationId string `json:"OperationId"` // end timestamp, not a GUID — see logs_to_events() in cse_helpers.sh
65+
TaskName string `json:"TaskName"`
66+
EventLevel string `json:"EventLevel"`
67+
Message string `json:"Message"`
68+
}
69+
70+
// GetTask returns the timing for a specific task, or nil if not found.
71+
func (r *CSETimingReport) GetTask(name string) *CSETaskTiming {
72+
if r.taskIndex == nil {
73+
r.taskIndex = make(map[string]*CSETaskTiming, len(r.Tasks))
74+
for i := range r.Tasks {
75+
r.taskIndex[r.Tasks[i].TaskName] = &r.Tasks[i]
76+
}
77+
}
78+
return r.taskIndex[name]
79+
}
80+
81+
// TotalCSEDuration returns the duration of the cse_start task if present.
82+
func (r *CSETimingReport) TotalCSEDuration() time.Duration {
83+
if t := r.GetTask("AKS.CSE.cse_start"); t != nil {
84+
return t.Duration
85+
}
86+
return 0
87+
}
88+
89+
// LogReport logs all task timings to the test logger.
90+
func (r *CSETimingReport) LogReport(_ context.Context, t interface{ Logf(string, ...any) }) {
91+
t.Logf("=== CSE Task Timing Report ===")
92+
t.Logf("%-60s %12s %12s", "Task", "Duration", "Start→End")
93+
t.Logf("%s", strings.Repeat("-", 90))
94+
95+
sorted := make([]CSETaskTiming, len(r.Tasks))
96+
copy(sorted, r.Tasks)
97+
sort.Slice(sorted, func(i, j int) bool {
98+
return sorted[i].StartTime.Before(sorted[j].StartTime)
99+
})
100+
101+
for _, task := range sorted {
102+
t.Logf("%-60s %10.2fs %s → %s",
103+
task.TaskName,
104+
task.Duration.Seconds(),
105+
task.StartTime.Format("15:04:05.000"),
106+
task.EndTime.Format("15:04:05.000"),
107+
)
108+
}
109+
110+
if total := r.TotalCSEDuration(); total > 0 {
111+
t.Logf("%s", strings.Repeat("-", 90))
112+
t.Logf("%-60s %10.2fs", "TOTAL (cse_start)", total.Seconds())
113+
}
114+
115+
if r.Provision != nil {
116+
t.Logf("\n=== Provision Summary ===")
117+
t.Logf("ExitCode: %s, ExecDuration: %ss", r.Provision.ExitCode, r.Provision.ExecDuration)
118+
t.Logf("KernelStart: %s, CSEStart: %s, GuestAgent: %s",
119+
r.Provision.KernelStartTime, r.Provision.CSEStartTime, r.Provision.GuestAgentStartTime)
120+
}
121+
}
122+
123+
// ExtractCSETimings SSHes into the scenario VM and extracts all CSE task timings.
124+
// Returns an error if no tasks could be parsed, since an empty report would make
125+
// regression detection ineffective.
126+
func ExtractCSETimings(ctx context.Context, s *Scenario) (*CSETimingReport, error) {
127+
report := &CSETimingReport{}
128+
129+
// Read all event JSON files from the CSE events directory, explicitly
130+
// appending a newline after each file so each JSON document is separated.
131+
// Search both the primary events directory and any handler-version subdirectories,
132+
// as the Guest Agent may move events between these locations.
133+
listCmd := fmt.Sprintf(
134+
"sudo find %s /var/log/azure/Microsoft.Azure.Extensions.CustomScript/ -name '*.json' -path '*/events/*' -exec sh -c 'cat \"$1\"; echo' _ {} \\; 2>/dev/null",
135+
cseEventsDir,
136+
)
137+
result, err := execScriptOnVm(ctx, s, s.Runtime.VM, listCmd)
138+
if err != nil {
139+
return nil, fmt.Errorf("failed to read CSE events: %w", err)
140+
}
141+
142+
// Parse event JSON objects using json.Decoder for robustness — handles both
143+
// single-line and multi-line JSON, and doesn't break on embedded newlines.
144+
decoder := json.NewDecoder(strings.NewReader(strings.TrimSpace(result.stdout)))
145+
var parseErrors int
146+
for decoder.More() {
147+
var event cseEventJSON
148+
if err := decoder.Decode(&event); err != nil {
149+
parseErrors++
150+
s.T.Logf("WARNING: failed to decode CSE event JSON: %v", err)
151+
continue
152+
}
153+
if event.TaskName == "" || event.Timestamp == "" || event.OperationId == "" {
154+
continue
155+
}
156+
157+
startTime, err := parseCSETimestamp(event.Timestamp)
158+
if err != nil {
159+
parseErrors++
160+
s.T.Logf("WARNING: failed to parse CSE start timestamp for task %s: %v", event.TaskName, err)
161+
continue
162+
}
163+
endTime, err := parseCSETimestamp(event.OperationId)
164+
if err != nil {
165+
parseErrors++
166+
s.T.Logf("WARNING: failed to parse CSE end timestamp for task %s: %v", event.TaskName, err)
167+
continue
168+
}
169+
170+
report.Tasks = append(report.Tasks, CSETaskTiming{
171+
TaskName: event.TaskName,
172+
StartTime: startTime,
173+
EndTime: endTime,
174+
Duration: endTime.Sub(startTime),
175+
Message: event.Message,
176+
})
177+
}
178+
179+
if parseErrors > 0 {
180+
s.T.Logf("WARNING: %d CSE event parse errors encountered", parseErrors)
181+
}
182+
if len(report.Tasks) == 0 {
183+
return report, fmt.Errorf("no CSE task timings were parsed (%d parse errors)", parseErrors)
184+
}
185+
186+
// Read provision.json for overall boot timing
187+
provResult, err := execScriptOnVm(ctx, s, s.Runtime.VM, fmt.Sprintf("sudo cat %s", provisionJSONPath))
188+
if err == nil && provResult.stdout != "" {
189+
var prov CSEProvisionTiming
190+
if json.Unmarshal([]byte(strings.TrimSpace(provResult.stdout)), &prov) == nil {
191+
report.Provision = &prov
192+
}
193+
}
194+
195+
return report, nil
196+
}
197+
198+
// parseCSETimestamp parses the timestamp format used by logs_to_events: "YYYY-MM-DD HH:MM:SS.mmm"
199+
func parseCSETimestamp(s string) (time.Time, error) {
200+
layouts := []string{
201+
"2006-01-02 15:04:05.000",
202+
"2006-01-02 15:04:05",
203+
}
204+
for _, layout := range layouts {
205+
if t, err := time.Parse(layout, s); err == nil {
206+
return t, nil
207+
}
208+
}
209+
return time.Time{}, fmt.Errorf("cannot parse CSE timestamp %q", s)
210+
}
211+
212+
// CSETimingThresholds defines maximum acceptable durations for CSE tasks.
213+
type CSETimingThresholds struct {
214+
// TaskThresholds maps task name suffixes to maximum duration.
215+
// Task names are matched by suffix to allow flexible matching
216+
// (e.g., "installDebPackageFromFile" matches "AKS.CSE.installkubelet.installDebPackageFromFile").
217+
TaskThresholds map[string]time.Duration
218+
219+
// TotalCSEThreshold is the maximum acceptable total CSE duration.
220+
TotalCSEThreshold time.Duration
221+
222+
// DefaultTaskThreshold is the threshold applied to any task that exceeds it
223+
// but has no specific entry in TaskThresholds. This ensures that ALL slow tasks
224+
// appear as sub-tests in ADO Pipeline Analytics, even newly added ones.
225+
// Tasks below this threshold are silently skipped.
226+
// Set to 0 to disable dynamic tracking.
227+
DefaultTaskThreshold time.Duration
228+
}
229+
230+
// ValidateCSETimings extracts CSE task timings from the VM, logs them, and validates
231+
// against thresholds. Each threshold check runs as a t.Run() sub-test so that ADO
232+
// Pipeline Analytics (via gotestsum → JUnit XML → PublishTestResults) can track
233+
// individual CSE task pass/fail and duration trends over time.
234+
func ValidateCSETimings(ctx context.Context, s *Scenario, thresholds CSETimingThresholds) *CSETimingReport {
235+
s.T.Helper()
236+
defer toolkit.LogStep(s.T, "validating CSE task timings")()
237+
238+
// Unwrap the underlying *testing.T from the toolkit logger wrapper
239+
// so we can use t.Run() for sub-tests (ADO Pipeline Analytics tracking).
240+
tRunner := toolkit.UnwrapTestingT(s.T)
241+
if tRunner == nil {
242+
s.T.Fatalf("ValidateCSETimings requires *testing.T for sub-test support, got %T", s.T)
243+
}
244+
245+
// Use pre-cached report if available (extracted eagerly before GA swept events).
246+
// Fall back to live extraction if no cached report exists.
247+
report := s.Runtime.CSETimingReport
248+
if report == nil {
249+
var err error
250+
report, err = ExtractCSETimings(ctx, s)
251+
if err != nil {
252+
s.T.Fatalf("failed to extract CSE timings: %v", err)
253+
return nil
254+
}
255+
}
256+
257+
// Always log the full timing report
258+
report.LogReport(ctx, s.T)
259+
260+
// Fail if no tasks were parsed — an empty report makes regression detection ineffective.
261+
if len(report.Tasks) == 0 {
262+
s.T.Fatalf("no CSE task timings were parsed; cannot validate performance thresholds")
263+
return nil
264+
}
265+
266+
// Fail if the critical cse_start task is missing — without it TotalCSEDuration()
267+
// returns 0 and the total duration threshold check would silently pass.
268+
if report.GetTask("AKS.CSE.cse_start") == nil {
269+
s.T.Fatalf("AKS.CSE.cse_start task not found in timing report; cannot validate total CSE duration")
270+
return nil
271+
}
272+
273+
// Validate total CSE duration as a sub-test for ADO tracking
274+
if thresholds.TotalCSEThreshold > 0 {
275+
tRunner.Run("TotalCSEDuration", func(t *testing.T) {
276+
totalDuration := report.TotalCSEDuration()
277+
t.Logf("total CSE duration: %s (threshold: %s)", totalDuration, thresholds.TotalCSEThreshold)
278+
if totalDuration > thresholds.TotalCSEThreshold {
279+
toolkit.LogDuration(ctx, totalDuration, thresholds.TotalCSEThreshold,
280+
fmt.Sprintf("CSE total duration %s exceeds threshold %s", totalDuration, thresholds.TotalCSEThreshold))
281+
t.Errorf("CSE total duration %s exceeds threshold %s", totalDuration, thresholds.TotalCSEThreshold)
282+
}
283+
})
284+
}
285+
286+
// Validate individual task thresholds — each as a sub-test for ADO tracking.
287+
// ADO Test Analytics will show per-task pass/fail trends and flag regressions.
288+
// Sort suffixes by length descending so longer (more specific) suffixes match first,
289+
// making matching deterministic when multiple suffixes could match the same task.
290+
sortedSuffixes := make([]string, 0, len(thresholds.TaskThresholds))
291+
for suffix := range thresholds.TaskThresholds {
292+
sortedSuffixes = append(sortedSuffixes, suffix)
293+
}
294+
sort.Slice(sortedSuffixes, func(i, j int) bool {
295+
return len(sortedSuffixes[i]) > len(sortedSuffixes[j])
296+
})
297+
298+
matchedTasks := make(map[string]bool)
299+
matchedSuffixes := make(map[string]bool)
300+
for _, task := range report.Tasks {
301+
for _, suffix := range sortedSuffixes {
302+
maxDuration := thresholds.TaskThresholds[suffix]
303+
if strings.HasSuffix(task.TaskName, suffix) {
304+
matchedTasks[task.TaskName] = true
305+
matchedSuffixes[suffix] = true
306+
task := task
307+
suffix := suffix
308+
maxDuration := maxDuration
309+
// Include sanitized task name to avoid collisions when multiple tasks match different suffixes
310+
shortTask := task.TaskName
311+
if idx := strings.LastIndex(shortTask, "."); idx >= 0 {
312+
shortTask = shortTask[idx+1:]
313+
}
314+
testName := suffix
315+
if shortTask != suffix {
316+
testName = fmt.Sprintf("%s/%s", shortTask, suffix)
317+
}
318+
tRunner.Run(fmt.Sprintf("Task_%s", testName), func(t *testing.T) {
319+
t.Logf("task %s duration: %s (threshold: %s)", task.TaskName, task.Duration, maxDuration)
320+
if task.Duration > maxDuration {
321+
toolkit.LogDuration(ctx, task.Duration, maxDuration,
322+
fmt.Sprintf("CSE task %s took %s (threshold: %s)", task.TaskName, task.Duration, maxDuration))
323+
t.Errorf("CSE task %s took %s, exceeds threshold %s", task.TaskName, task.Duration, maxDuration)
324+
}
325+
})
326+
break
327+
}
328+
}
329+
}
330+
331+
// Log warnings for configured threshold suffixes that didn't match any task.
332+
// This helps detect task renames/removals without hard-failing, since some tasks
333+
// only fire on specific install paths (cached vs full) or OS variants.
334+
for _, suffix := range sortedSuffixes {
335+
if !matchedSuffixes[suffix] {
336+
s.T.Logf("⚠️ threshold suffix %q did not match any CSE task — task may not fire on this install path, or may have been renamed", suffix)
337+
}
338+
}
339+
340+
// Dynamic tracking: create sub-tests for any CSE task that exceeds DefaultTaskThreshold
341+
// but wasn't matched by a specific threshold above. This ensures newly added CSE tasks
342+
// automatically appear in ADO Pipeline Analytics without code changes.
343+
// Skip cse_start (validated by TotalCSEThreshold) and non-CSE events (e.g., AKS.Runtime.*).
344+
if thresholds.DefaultTaskThreshold > 0 {
345+
for _, task := range report.Tasks {
346+
if matchedTasks[task.TaskName] {
347+
continue
348+
}
349+
if task.TaskName == "AKS.CSE.cse_start" {
350+
continue
351+
}
352+
if !strings.HasPrefix(task.TaskName, "AKS.CSE.") {
353+
continue
354+
}
355+
if task.Duration < thresholds.DefaultTaskThreshold {
356+
continue
357+
}
358+
task := task
359+
// Extract short name: "AKS.CSE.foo.bar" → "bar", or use full name if no dots
360+
shortName := task.TaskName
361+
if idx := strings.LastIndex(shortName, "."); idx >= 0 {
362+
shortName = shortName[idx+1:]
363+
}
364+
defaultThreshold := thresholds.DefaultTaskThreshold
365+
tRunner.Run(fmt.Sprintf("Task_%s", shortName), func(t *testing.T) {
366+
t.Logf("task %s duration: %s (default threshold: %s — no specific threshold configured)",
367+
task.TaskName, task.Duration, defaultThreshold)
368+
if task.Duration > defaultThreshold {
369+
t.Errorf("CSE task %s took %s, exceeds default threshold %s (consider adding a specific threshold)",
370+
task.TaskName, task.Duration, defaultThreshold)
371+
}
372+
})
373+
}
374+
}
375+
376+
return report
377+
}

0 commit comments

Comments
 (0)