|
| 1 | +package e2e |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "encoding/json" |
| 6 | + "fmt" |
| 7 | + "sort" |
| 8 | + "strings" |
| 9 | + "testing" |
| 10 | + "time" |
| 11 | + |
| 12 | + "github.com/Azure/agentbaker/e2e/toolkit" |
| 13 | +) |
| 14 | + |
| 15 | +const ( |
| 16 | + // cseEventsDir is the directory where CSE task timing events are stored on the VM. |
| 17 | + // This matches EVENTS_LOGGING_DIR defined in both cse_helpers.sh and cse_start.sh. |
| 18 | + // Events are written directly here (not in per-handler subdirectories) — each file |
| 19 | + // is a single-line JSON object named <epoch-ms>.json. |
| 20 | + cseEventsDir = "/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/" |
| 21 | + // provisionJSONPath is the path to the provision.json file with overall boot timing. |
| 22 | + provisionJSONPath = "/var/log/azure/aks/provision.json" |
| 23 | +) |
| 24 | + |
| 25 | +// CSETaskTiming represents the timing of a single CSE task. |
| 26 | +type CSETaskTiming struct { |
| 27 | + TaskName string |
| 28 | + StartTime time.Time |
| 29 | + EndTime time.Time |
| 30 | + Duration time.Duration |
| 31 | + Message string |
| 32 | +} |
| 33 | + |
| 34 | +// CSEProvisionTiming represents the overall provisioning timing from provision.json. |
| 35 | +type CSEProvisionTiming struct { |
| 36 | + ExitCode string `json:"ExitCode"` |
| 37 | + ExecDuration string `json:"ExecDuration"` |
| 38 | + KernelStartTime string `json:"KernelStartTime"` |
| 39 | + CloudInitLocalStart string `json:"CloudInitLocalStartTime"` |
| 40 | + CloudInitStart string `json:"CloudInitStartTime"` |
| 41 | + CloudFinalStart string `json:"CloudFinalStartTime"` |
| 42 | + CSEStartTime string `json:"CSEStartTime"` |
| 43 | + GuestAgentStartTime string `json:"GuestAgentStartTime"` |
| 44 | + SystemdSummary string `json:"SystemdSummary"` |
| 45 | + BootDatapoints json.RawMessage `json:"BootDatapoints"` |
| 46 | +} |
| 47 | + |
| 48 | +// CSETimingReport holds all parsed timing data from a VM. |
| 49 | +type CSETimingReport struct { |
| 50 | + Tasks []CSETaskTiming |
| 51 | + Provision *CSEProvisionTiming |
| 52 | + taskIndex map[string]*CSETaskTiming |
| 53 | +} |
| 54 | + |
| 55 | +// cseEventJSON matches the JSON structure written by logs_to_events() in cse_helpers.sh. |
| 56 | +// Despite its name, OperationId stores the task *end* timestamp (not a GUID). |
| 57 | +// This is by design: GA (Guest Agent) requires specific field names, and OperationId |
| 58 | +// was repurposed to carry the end time. See cse_helpers.sh logs_to_events(): |
| 59 | +// |
| 60 | +// --arg Timestamp "${startTime}" |
| 61 | +// --arg OperationId "${endTime}" |
| 62 | +type cseEventJSON struct { |
| 63 | + Timestamp string `json:"Timestamp"` |
| 64 | + OperationId string `json:"OperationId"` // end timestamp, not a GUID — see logs_to_events() in cse_helpers.sh |
| 65 | + TaskName string `json:"TaskName"` |
| 66 | + EventLevel string `json:"EventLevel"` |
| 67 | + Message string `json:"Message"` |
| 68 | +} |
| 69 | + |
| 70 | +// GetTask returns the timing for a specific task, or nil if not found. |
| 71 | +func (r *CSETimingReport) GetTask(name string) *CSETaskTiming { |
| 72 | + if r.taskIndex == nil { |
| 73 | + r.taskIndex = make(map[string]*CSETaskTiming, len(r.Tasks)) |
| 74 | + for i := range r.Tasks { |
| 75 | + r.taskIndex[r.Tasks[i].TaskName] = &r.Tasks[i] |
| 76 | + } |
| 77 | + } |
| 78 | + return r.taskIndex[name] |
| 79 | +} |
| 80 | + |
| 81 | +// TotalCSEDuration returns the duration of the cse_start task if present. |
| 82 | +func (r *CSETimingReport) TotalCSEDuration() time.Duration { |
| 83 | + if t := r.GetTask("AKS.CSE.cse_start"); t != nil { |
| 84 | + return t.Duration |
| 85 | + } |
| 86 | + return 0 |
| 87 | +} |
| 88 | + |
| 89 | +// LogReport logs all task timings to the test logger. |
| 90 | +func (r *CSETimingReport) LogReport(_ context.Context, t interface{ Logf(string, ...any) }) { |
| 91 | + t.Logf("=== CSE Task Timing Report ===") |
| 92 | + t.Logf("%-60s %12s %12s", "Task", "Duration", "Start→End") |
| 93 | + t.Logf("%s", strings.Repeat("-", 90)) |
| 94 | + |
| 95 | + sorted := make([]CSETaskTiming, len(r.Tasks)) |
| 96 | + copy(sorted, r.Tasks) |
| 97 | + sort.Slice(sorted, func(i, j int) bool { |
| 98 | + return sorted[i].StartTime.Before(sorted[j].StartTime) |
| 99 | + }) |
| 100 | + |
| 101 | + for _, task := range sorted { |
| 102 | + t.Logf("%-60s %10.2fs %s → %s", |
| 103 | + task.TaskName, |
| 104 | + task.Duration.Seconds(), |
| 105 | + task.StartTime.Format("15:04:05.000"), |
| 106 | + task.EndTime.Format("15:04:05.000"), |
| 107 | + ) |
| 108 | + } |
| 109 | + |
| 110 | + if total := r.TotalCSEDuration(); total > 0 { |
| 111 | + t.Logf("%s", strings.Repeat("-", 90)) |
| 112 | + t.Logf("%-60s %10.2fs", "TOTAL (cse_start)", total.Seconds()) |
| 113 | + } |
| 114 | + |
| 115 | + if r.Provision != nil { |
| 116 | + t.Logf("\n=== Provision Summary ===") |
| 117 | + t.Logf("ExitCode: %s, ExecDuration: %ss", r.Provision.ExitCode, r.Provision.ExecDuration) |
| 118 | + t.Logf("KernelStart: %s, CSEStart: %s, GuestAgent: %s", |
| 119 | + r.Provision.KernelStartTime, r.Provision.CSEStartTime, r.Provision.GuestAgentStartTime) |
| 120 | + } |
| 121 | +} |
| 122 | + |
| 123 | +// ExtractCSETimings SSHes into the scenario VM and extracts all CSE task timings. |
| 124 | +// Returns an error if no tasks could be parsed, since an empty report would make |
| 125 | +// regression detection ineffective. |
| 126 | +func ExtractCSETimings(ctx context.Context, s *Scenario) (*CSETimingReport, error) { |
| 127 | + report := &CSETimingReport{} |
| 128 | + |
| 129 | + // Read all event JSON files from the CSE events directory, explicitly |
| 130 | + // appending a newline after each file so each JSON document is separated. |
| 131 | + // Search both the primary events directory and any handler-version subdirectories, |
| 132 | + // as the Guest Agent may move events between these locations. |
| 133 | + listCmd := fmt.Sprintf( |
| 134 | + "sudo find %s /var/log/azure/Microsoft.Azure.Extensions.CustomScript/ -name '*.json' -path '*/events/*' -exec sh -c 'cat \"$1\"; echo' _ {} \\; 2>/dev/null", |
| 135 | + cseEventsDir, |
| 136 | + ) |
| 137 | + result, err := execScriptOnVm(ctx, s, s.Runtime.VM, listCmd) |
| 138 | + if err != nil { |
| 139 | + return nil, fmt.Errorf("failed to read CSE events: %w", err) |
| 140 | + } |
| 141 | + |
| 142 | + // Parse event JSON objects using json.Decoder for robustness — handles both |
| 143 | + // single-line and multi-line JSON, and doesn't break on embedded newlines. |
| 144 | + decoder := json.NewDecoder(strings.NewReader(strings.TrimSpace(result.stdout))) |
| 145 | + var parseErrors int |
| 146 | + for decoder.More() { |
| 147 | + var event cseEventJSON |
| 148 | + if err := decoder.Decode(&event); err != nil { |
| 149 | + parseErrors++ |
| 150 | + s.T.Logf("WARNING: failed to decode CSE event JSON: %v", err) |
| 151 | + continue |
| 152 | + } |
| 153 | + if event.TaskName == "" || event.Timestamp == "" || event.OperationId == "" { |
| 154 | + continue |
| 155 | + } |
| 156 | + |
| 157 | + startTime, err := parseCSETimestamp(event.Timestamp) |
| 158 | + if err != nil { |
| 159 | + parseErrors++ |
| 160 | + s.T.Logf("WARNING: failed to parse CSE start timestamp for task %s: %v", event.TaskName, err) |
| 161 | + continue |
| 162 | + } |
| 163 | + endTime, err := parseCSETimestamp(event.OperationId) |
| 164 | + if err != nil { |
| 165 | + parseErrors++ |
| 166 | + s.T.Logf("WARNING: failed to parse CSE end timestamp for task %s: %v", event.TaskName, err) |
| 167 | + continue |
| 168 | + } |
| 169 | + |
| 170 | + report.Tasks = append(report.Tasks, CSETaskTiming{ |
| 171 | + TaskName: event.TaskName, |
| 172 | + StartTime: startTime, |
| 173 | + EndTime: endTime, |
| 174 | + Duration: endTime.Sub(startTime), |
| 175 | + Message: event.Message, |
| 176 | + }) |
| 177 | + } |
| 178 | + |
| 179 | + if parseErrors > 0 { |
| 180 | + s.T.Logf("WARNING: %d CSE event parse errors encountered", parseErrors) |
| 181 | + } |
| 182 | + if len(report.Tasks) == 0 { |
| 183 | + return report, fmt.Errorf("no CSE task timings were parsed (%d parse errors)", parseErrors) |
| 184 | + } |
| 185 | + |
| 186 | + // Read provision.json for overall boot timing |
| 187 | + provResult, err := execScriptOnVm(ctx, s, s.Runtime.VM, fmt.Sprintf("sudo cat %s", provisionJSONPath)) |
| 188 | + if err == nil && provResult.stdout != "" { |
| 189 | + var prov CSEProvisionTiming |
| 190 | + if json.Unmarshal([]byte(strings.TrimSpace(provResult.stdout)), &prov) == nil { |
| 191 | + report.Provision = &prov |
| 192 | + } |
| 193 | + } |
| 194 | + |
| 195 | + return report, nil |
| 196 | +} |
| 197 | + |
| 198 | +// parseCSETimestamp parses the timestamp format used by logs_to_events: "YYYY-MM-DD HH:MM:SS.mmm" |
| 199 | +func parseCSETimestamp(s string) (time.Time, error) { |
| 200 | + layouts := []string{ |
| 201 | + "2006-01-02 15:04:05.000", |
| 202 | + "2006-01-02 15:04:05", |
| 203 | + } |
| 204 | + for _, layout := range layouts { |
| 205 | + if t, err := time.Parse(layout, s); err == nil { |
| 206 | + return t, nil |
| 207 | + } |
| 208 | + } |
| 209 | + return time.Time{}, fmt.Errorf("cannot parse CSE timestamp %q", s) |
| 210 | +} |
| 211 | + |
| 212 | +// CSETimingThresholds defines maximum acceptable durations for CSE tasks. |
| 213 | +type CSETimingThresholds struct { |
| 214 | + // TaskThresholds maps task name suffixes to maximum duration. |
| 215 | + // Task names are matched by suffix to allow flexible matching |
| 216 | + // (e.g., "installDebPackageFromFile" matches "AKS.CSE.installkubelet.installDebPackageFromFile"). |
| 217 | + TaskThresholds map[string]time.Duration |
| 218 | + |
| 219 | + // TotalCSEThreshold is the maximum acceptable total CSE duration. |
| 220 | + TotalCSEThreshold time.Duration |
| 221 | + |
| 222 | + // DefaultTaskThreshold is the threshold applied to any task that exceeds it |
| 223 | + // but has no specific entry in TaskThresholds. This ensures that ALL slow tasks |
| 224 | + // appear as sub-tests in ADO Pipeline Analytics, even newly added ones. |
| 225 | + // Tasks below this threshold are silently skipped. |
| 226 | + // Set to 0 to disable dynamic tracking. |
| 227 | + DefaultTaskThreshold time.Duration |
| 228 | +} |
| 229 | + |
| 230 | +// ValidateCSETimings extracts CSE task timings from the VM, logs them, and validates |
| 231 | +// against thresholds. Each threshold check runs as a t.Run() sub-test so that ADO |
| 232 | +// Pipeline Analytics (via gotestsum → JUnit XML → PublishTestResults) can track |
| 233 | +// individual CSE task pass/fail and duration trends over time. |
| 234 | +func ValidateCSETimings(ctx context.Context, s *Scenario, thresholds CSETimingThresholds) *CSETimingReport { |
| 235 | + s.T.Helper() |
| 236 | + defer toolkit.LogStep(s.T, "validating CSE task timings")() |
| 237 | + |
| 238 | + // Unwrap the underlying *testing.T from the toolkit logger wrapper |
| 239 | + // so we can use t.Run() for sub-tests (ADO Pipeline Analytics tracking). |
| 240 | + tRunner := toolkit.UnwrapTestingT(s.T) |
| 241 | + if tRunner == nil { |
| 242 | + s.T.Fatalf("ValidateCSETimings requires *testing.T for sub-test support, got %T", s.T) |
| 243 | + } |
| 244 | + |
| 245 | + // Use pre-cached report if available (extracted eagerly before GA swept events). |
| 246 | + // Fall back to live extraction if no cached report exists. |
| 247 | + report := s.Runtime.CSETimingReport |
| 248 | + if report == nil { |
| 249 | + var err error |
| 250 | + report, err = ExtractCSETimings(ctx, s) |
| 251 | + if err != nil { |
| 252 | + s.T.Fatalf("failed to extract CSE timings: %v", err) |
| 253 | + return nil |
| 254 | + } |
| 255 | + } |
| 256 | + |
| 257 | + // Always log the full timing report |
| 258 | + report.LogReport(ctx, s.T) |
| 259 | + |
| 260 | + // Fail if no tasks were parsed — an empty report makes regression detection ineffective. |
| 261 | + if len(report.Tasks) == 0 { |
| 262 | + s.T.Fatalf("no CSE task timings were parsed; cannot validate performance thresholds") |
| 263 | + return nil |
| 264 | + } |
| 265 | + |
| 266 | + // Fail if the critical cse_start task is missing — without it TotalCSEDuration() |
| 267 | + // returns 0 and the total duration threshold check would silently pass. |
| 268 | + if report.GetTask("AKS.CSE.cse_start") == nil { |
| 269 | + s.T.Fatalf("AKS.CSE.cse_start task not found in timing report; cannot validate total CSE duration") |
| 270 | + return nil |
| 271 | + } |
| 272 | + |
| 273 | + // Validate total CSE duration as a sub-test for ADO tracking |
| 274 | + if thresholds.TotalCSEThreshold > 0 { |
| 275 | + tRunner.Run("TotalCSEDuration", func(t *testing.T) { |
| 276 | + totalDuration := report.TotalCSEDuration() |
| 277 | + t.Logf("total CSE duration: %s (threshold: %s)", totalDuration, thresholds.TotalCSEThreshold) |
| 278 | + if totalDuration > thresholds.TotalCSEThreshold { |
| 279 | + toolkit.LogDuration(ctx, totalDuration, thresholds.TotalCSEThreshold, |
| 280 | + fmt.Sprintf("CSE total duration %s exceeds threshold %s", totalDuration, thresholds.TotalCSEThreshold)) |
| 281 | + t.Errorf("CSE total duration %s exceeds threshold %s", totalDuration, thresholds.TotalCSEThreshold) |
| 282 | + } |
| 283 | + }) |
| 284 | + } |
| 285 | + |
| 286 | + // Validate individual task thresholds — each as a sub-test for ADO tracking. |
| 287 | + // ADO Test Analytics will show per-task pass/fail trends and flag regressions. |
| 288 | + // Sort suffixes by length descending so longer (more specific) suffixes match first, |
| 289 | + // making matching deterministic when multiple suffixes could match the same task. |
| 290 | + sortedSuffixes := make([]string, 0, len(thresholds.TaskThresholds)) |
| 291 | + for suffix := range thresholds.TaskThresholds { |
| 292 | + sortedSuffixes = append(sortedSuffixes, suffix) |
| 293 | + } |
| 294 | + sort.Slice(sortedSuffixes, func(i, j int) bool { |
| 295 | + return len(sortedSuffixes[i]) > len(sortedSuffixes[j]) |
| 296 | + }) |
| 297 | + |
| 298 | + matchedTasks := make(map[string]bool) |
| 299 | + matchedSuffixes := make(map[string]bool) |
| 300 | + for _, task := range report.Tasks { |
| 301 | + for _, suffix := range sortedSuffixes { |
| 302 | + maxDuration := thresholds.TaskThresholds[suffix] |
| 303 | + if strings.HasSuffix(task.TaskName, suffix) { |
| 304 | + matchedTasks[task.TaskName] = true |
| 305 | + matchedSuffixes[suffix] = true |
| 306 | + task := task |
| 307 | + suffix := suffix |
| 308 | + maxDuration := maxDuration |
| 309 | + // Include sanitized task name to avoid collisions when multiple tasks match different suffixes |
| 310 | + shortTask := task.TaskName |
| 311 | + if idx := strings.LastIndex(shortTask, "."); idx >= 0 { |
| 312 | + shortTask = shortTask[idx+1:] |
| 313 | + } |
| 314 | + testName := suffix |
| 315 | + if shortTask != suffix { |
| 316 | + testName = fmt.Sprintf("%s/%s", shortTask, suffix) |
| 317 | + } |
| 318 | + tRunner.Run(fmt.Sprintf("Task_%s", testName), func(t *testing.T) { |
| 319 | + t.Logf("task %s duration: %s (threshold: %s)", task.TaskName, task.Duration, maxDuration) |
| 320 | + if task.Duration > maxDuration { |
| 321 | + toolkit.LogDuration(ctx, task.Duration, maxDuration, |
| 322 | + fmt.Sprintf("CSE task %s took %s (threshold: %s)", task.TaskName, task.Duration, maxDuration)) |
| 323 | + t.Errorf("CSE task %s took %s, exceeds threshold %s", task.TaskName, task.Duration, maxDuration) |
| 324 | + } |
| 325 | + }) |
| 326 | + break |
| 327 | + } |
| 328 | + } |
| 329 | + } |
| 330 | + |
| 331 | + // Log warnings for configured threshold suffixes that didn't match any task. |
| 332 | + // This helps detect task renames/removals without hard-failing, since some tasks |
| 333 | + // only fire on specific install paths (cached vs full) or OS variants. |
| 334 | + for _, suffix := range sortedSuffixes { |
| 335 | + if !matchedSuffixes[suffix] { |
| 336 | + s.T.Logf("⚠️ threshold suffix %q did not match any CSE task — task may not fire on this install path, or may have been renamed", suffix) |
| 337 | + } |
| 338 | + } |
| 339 | + |
| 340 | + // Dynamic tracking: create sub-tests for any CSE task that exceeds DefaultTaskThreshold |
| 341 | + // but wasn't matched by a specific threshold above. This ensures newly added CSE tasks |
| 342 | + // automatically appear in ADO Pipeline Analytics without code changes. |
| 343 | + // Skip cse_start (validated by TotalCSEThreshold) and non-CSE events (e.g., AKS.Runtime.*). |
| 344 | + if thresholds.DefaultTaskThreshold > 0 { |
| 345 | + for _, task := range report.Tasks { |
| 346 | + if matchedTasks[task.TaskName] { |
| 347 | + continue |
| 348 | + } |
| 349 | + if task.TaskName == "AKS.CSE.cse_start" { |
| 350 | + continue |
| 351 | + } |
| 352 | + if !strings.HasPrefix(task.TaskName, "AKS.CSE.") { |
| 353 | + continue |
| 354 | + } |
| 355 | + if task.Duration < thresholds.DefaultTaskThreshold { |
| 356 | + continue |
| 357 | + } |
| 358 | + task := task |
| 359 | + // Extract short name: "AKS.CSE.foo.bar" → "bar", or use full name if no dots |
| 360 | + shortName := task.TaskName |
| 361 | + if idx := strings.LastIndex(shortName, "."); idx >= 0 { |
| 362 | + shortName = shortName[idx+1:] |
| 363 | + } |
| 364 | + defaultThreshold := thresholds.DefaultTaskThreshold |
| 365 | + tRunner.Run(fmt.Sprintf("Task_%s", shortName), func(t *testing.T) { |
| 366 | + t.Logf("task %s duration: %s (default threshold: %s — no specific threshold configured)", |
| 367 | + task.TaskName, task.Duration, defaultThreshold) |
| 368 | + if task.Duration > defaultThreshold { |
| 369 | + t.Errorf("CSE task %s took %s, exceeds default threshold %s (consider adding a specific threshold)", |
| 370 | + task.TaskName, task.Duration, defaultThreshold) |
| 371 | + } |
| 372 | + }) |
| 373 | + } |
| 374 | + } |
| 375 | + |
| 376 | + return report |
| 377 | +} |
0 commit comments