Skip to content

Commit 371195f

Browse files
committed
fix: improve logging on nil-resource case
Signed-off-by: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
1 parent d3bb7cb commit 371195f

2 files changed

Lines changed: 451 additions & 22 deletions

File tree

main.go

Lines changed: 238 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"fmt"
1212
"io"
1313
"io/fs"
14+
"maps"
1415
"net"
1516
"net/http"
1617
"net/url"
@@ -318,6 +319,7 @@ type CustodianExecutionResult struct {
318319
ArtifactPath string
319320
ResourcesPath string
320321
LogPaths []string
322+
LogTail string
321323
DiagnosticWarnings []string
322324
}
323325

@@ -507,10 +509,12 @@ func (e *CommandCustodianExecutor) Execute(ctx context.Context, req CustodianExe
507509
e.Logger.Info("Custodian AWS API trace enabled", "check_name", req.Check.Name, "trace_log_path", traceLogPath, "pythonpath_dir", traceDir)
508510
}
509511
}
510-
e.Logger.Debug("Executing custodian command",
512+
e.Logger.Info("Executing custodian command",
511513
"check_name", req.Check.Name,
514+
"resource", req.Check.Resource,
512515
"command", req.BinaryPath,
513516
"args", args,
517+
"aws_regions", regions,
514518
)
515519
stdoutBuf := &lockedBuffer{}
516520
stderrBuf := &lockedBuffer{}
@@ -637,14 +641,14 @@ commandFinished:
637641
if resources != nil {
638642
result.Resources = resources
639643
}
640-
var logTail string
641-
var logErr error
642-
if req.LogTailDuringRun || err != nil || runCtx.Err() != nil || resourcesErr != nil {
643-
logPaths, tail, readErr := readCustodianLogArtifacts(req.OutputDir, custodianOutputTailBytes)
644-
result.LogPaths = logPaths
645-
logTail = tail
646-
logErr = readErr
647-
}
644+
// Always capture the tail of custodian's own run log, regardless of exit
645+
// code or LogTailDuringRun. Custodian frequently exits 0 with an empty
646+
// resources.json after logging AccessDenied / EndpointConnectionError to
647+
// custodian-run.log; capturing the tail here (before the temp dir is
648+
// deleted) is the only non-invasive way to surface that cause to Eval.
649+
logPaths, logTail, logErr := readCustodianLogArtifacts(req.OutputDir, custodianOutputTailBytes)
650+
result.LogPaths = logPaths
651+
result.LogTail = logTail
648652

649653
if err != nil {
650654
result.Err = fmt.Errorf("custodian execution failed: %w", err)
@@ -2455,8 +2459,13 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
24552459
successfulPolicyRuns := 0
24562460
hadCheckExecutionFailures := false
24572461
defer func() {
2458-
if p.parsedConfig.PreserveArtifacts && hadCheckExecutionFailures {
2459-
p.Logger.Warn("Preserving cloud custodian execution artifacts after check execution failure", "execution_root", executionRoot)
2462+
zeroEvidence := successfulPolicyRuns == 0 && totalEvidenceCount == 0
2463+
if p.parsedConfig.PreserveArtifacts && (hadCheckExecutionFailures || zeroEvidence) {
2464+
p.Logger.Warn("Preserving cloud custodian execution artifacts for troubleshooting",
2465+
"execution_root", executionRoot,
2466+
"had_check_execution_failures", hadCheckExecutionFailures,
2467+
"zero_evidence", zeroEvidence,
2468+
)
24602469
return
24612470
}
24622471
if err := os.RemoveAll(executionRoot); err != nil {
@@ -2465,9 +2474,21 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
24652474
}()
24662475
p.Logger.Debug("Created temporary execution root", "execution_root", executionRoot)
24672476

2477+
// Retain a compact diagnostic per execution (baselines + checks) and a
2478+
// per-check summary row so the zero-evidence failure branch and the
2479+
// consolidated end-of-run summary can explain why nothing was produced.
2480+
var executionDiagnostics []executionDiagnostic
2481+
var runSummaries []checkRunSummary
2482+
24682483
baselines := p.collectInventoryBaselines(ctx, executionRoot)
2469-
for resourceType, baseline := range baselines {
2470-
if baseline == nil || baseline.Err != nil || len(baseline.Execution.DiagnosticWarnings) == 0 {
2484+
for _, resourceType := range slices.Sorted(maps.Keys(baselines)) {
2485+
baseline := baselines[resourceType]
2486+
if baseline == nil {
2487+
continue
2488+
}
2489+
baselineName := fmt.Sprintf("inventory-%s", sanitizeIdentifier(resourceType))
2490+
executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(baselineName, resourceType, true, baseline.Execution, baseline.Err != nil))
2491+
if baseline.Err != nil || len(baseline.Execution.DiagnosticWarnings) == 0 {
24712492
continue
24722493
}
24732494
err := formatExecutionDiagnosticWarnings(baseline.Execution.DiagnosticWarnings)
@@ -2481,6 +2502,7 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
24812502
p.Logger.Warn("Skipping custodian execution due to check parse issues", "check_name", check.Name, "parse_errors", check.ParseErrors)
24822503
accumulatedErrors = errors.Join(accumulatedErrors, fmt.Errorf("check %s has parse errors: %s", check.Name, strings.Join(check.ParseErrors, "; ")))
24832504
hadCheckExecutionFailures = true
2505+
runSummaries = append(runSummaries, checkRunSummary{Check: check.Name, Resource: check.Resource, ExitCode: -1, HadError: true, Regions: p.parsedConfig.AWSRegions})
24842506
continue
24852507
}
24862508

@@ -2495,6 +2517,11 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
24952517
p.Logger.Error("Skipping check due to unavailable inventory baseline", "check_name", check.Name, "resource", check.Resource, "error", err)
24962518
accumulatedErrors = errors.Join(accumulatedErrors, err)
24972519
hadCheckExecutionFailures = true
2520+
baselineRecordCount := 0
2521+
if baseline != nil {
2522+
baselineRecordCount = len(baseline.Records)
2523+
}
2524+
runSummaries = append(runSummaries, checkRunSummary{Check: check.Name, Resource: check.Resource, ExitCode: -1, BaselineResourceCount: baselineRecordCount, HadError: true, Regions: p.parsedConfig.AWSRegions})
24982525
continue
24992526
}
25002527

@@ -2517,6 +2544,8 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
25172544
p.Logger.Error("Skipping resource evaluation due to check execution error", "check_name", check.Name, "error", err)
25182545
accumulatedErrors = errors.Join(accumulatedErrors, err)
25192546
hadCheckExecutionFailures = true
2547+
executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(check.Name, check.Resource, false, execution, true))
2548+
runSummaries = append(runSummaries, checkRunSummary{Check: check.Name, Resource: check.Resource, ExitCode: execution.ExitCode, BaselineResourceCount: len(baseline.Records), MatchedResourceCount: len(execution.Resources), HadError: true, Regions: p.parsedConfig.AWSRegions})
25202549
continue
25212550
}
25222551

@@ -2547,10 +2576,13 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
25472576
}
25482577
}
25492578

2579+
checkEvidenceCount := 0
2580+
checkHadError := false
25502581
for _, payload := range payloads {
25512582
evidences, evalErr, successfulRuns := p.evaluateResourcePolicies(ctx, payload, req.GetPolicyPaths())
25522583
pendingEvidences = append(pendingEvidences, evidences...)
25532584
totalEvidenceCount += len(evidences)
2585+
checkEvidenceCount += len(evidences)
25542586
successfulPolicyRuns += successfulRuns
25552587
p.Logger.Debug("Completed policy evaluations for resource",
25562588
"check_name", payload.Check.Name,
@@ -2561,6 +2593,7 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
25612593
"had_eval_error", evalErr != nil,
25622594
)
25632595
if evalErr != nil {
2596+
checkHadError = true
25642597
accumulatedErrors = errors.Join(accumulatedErrors, evalErr)
25652598
}
25662599
for len(pendingEvidences) >= evidenceBatchSize {
@@ -2575,7 +2608,20 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
25752608
p.Logger.Warn("Check completed with unavailable AWS service endpoints", "check_name", check.Name, "error", err)
25762609
accumulatedErrors = errors.Join(accumulatedErrors, err)
25772610
hadCheckExecutionFailures = true
2578-
}
2611+
checkHadError = true
2612+
}
2613+
executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(check.Name, check.Resource, false, execution, checkHadError))
2614+
runSummaries = append(runSummaries, checkRunSummary{
2615+
Check: check.Name,
2616+
Resource: check.Resource,
2617+
ExitCode: execution.ExitCode,
2618+
BaselineResourceCount: len(baseline.Records),
2619+
MatchedResourceCount: len(execution.Resources),
2620+
PayloadCount: len(payloads),
2621+
EvidenceCount: checkEvidenceCount,
2622+
HadError: checkHadError,
2623+
Regions: p.parsedConfig.AWSRegions,
2624+
})
25792625
}
25802626

25812627
if len(pendingEvidences) > 0 {
@@ -2596,11 +2642,25 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
25962642
p.Logger.Warn("No evidence generated by current evaluation run")
25972643
}
25982644

2599-
if successfulPolicyRuns == 0 && totalEvidenceCount == 0 {
2600-
if accumulatedErrors == nil {
2601-
accumulatedErrors = errors.New("policy evaluation failed for all checks")
2602-
}
2603-
return &proto.EvalResponse{Status: proto.ExecutionStatus_FAILURE}, accumulatedErrors
2645+
// Consolidated, one-glance picture across every resource type. Emitted at
2646+
// Warn when no evidence was produced, otherwise at Debug since a run that
2647+
// produced evidence is the healthy case.
2648+
zeroEvidenceOutcome := successfulPolicyRuns == 0 && totalEvidenceCount == 0
2649+
summaryLog := p.Logger.Debug
2650+
if zeroEvidenceOutcome {
2651+
summaryLog = p.Logger.Warn
2652+
}
2653+
summaryLog("Cloud custodian evaluation summary",
2654+
"check_count", len(p.checks),
2655+
"successful_policy_runs", successfulPolicyRuns,
2656+
"total_evidence_count", totalEvidenceCount,
2657+
"had_check_execution_failures", hadCheckExecutionFailures,
2658+
"aws_regions", p.parsedConfig.AWSRegions,
2659+
"per_check", runSummaries,
2660+
)
2661+
2662+
if zeroEvidenceOutcome {
2663+
return &proto.EvalResponse{Status: proto.ExecutionStatus_FAILURE}, composeZeroEvidenceError(accumulatedErrors, executionDiagnostics)
26042664
}
26052665
if hadCheckExecutionFailures {
26062666
if accumulatedErrors == nil {
@@ -2648,6 +2708,7 @@ func (p *CloudCustodianPlugin) collectInventoryBaselines(ctx context.Context, ex
26482708
if execution.Err != nil || execution.Error != "" {
26492709
baselineErr = formatExecutionFailure(check.Name, execution)
26502710
}
2711+
p.warnZeroResourceBaseline(check.Name, resourceType, execution)
26512712
records := make([]ResourceRecord, 0, len(execution.Resources))
26522713
for _, resource := range execution.Resources {
26532714
records = append(records, p.buildResourceRecord(resourceType, resource))
@@ -3185,6 +3246,24 @@ func isReservedResourceLabel(label string) bool {
31853246
}
31863247
}
31873248

3249+
// warnZeroResourceBaseline emits a Warn naming the likely causes whenever an
3250+
// inventory baseline run exits 0 but returned no resources, so that an operator
3251+
// can distinguish a permissions problem from a network/endpoint problem without
3252+
// re-running with invasive tracing. This is not done for policy checks: a fully
3253+
// compliant estate legitimately matches zero resources.
3254+
func (p *CloudCustodianPlugin) warnZeroResourceBaseline(name, resource string, execution CustodianExecutionResult) {
3255+
if execution.ExitCode != 0 || len(execution.Resources) > 0 {
3256+
return
3257+
}
3258+
p.Logger.Warn("Custodian inventory baseline exited successfully but returned zero resources; likely insufficient IAM read permissions or unreachable/cross-region service endpoints",
3259+
"name", name,
3260+
"resource", resource,
3261+
"exit_code", execution.ExitCode,
3262+
"stderr_tail", tailString(execution.Stderr, custodianOutputTailBytes),
3263+
"log_tail", execution.LogTail,
3264+
)
3265+
}
3266+
31883267
func formatExecutionFailure(checkName string, execution CustodianExecutionResult) error {
31893268
switch {
31903269
case execution.Error != "" && execution.Err != nil:
@@ -3210,6 +3289,146 @@ func formatExecutionDiagnosticWarnings(messages []string) error {
32103289
return err
32113290
}
32123291

3292+
// custodianDiagnosticDetailCap bounds the total size of the per-execution
3293+
// diagnostic detail appended to a zero-evidence failure error so that a policy
3294+
// pack with many resource types can never produce an unbounded gRPC error.
3295+
const custodianDiagnosticDetailCap = 32 * 1024
3296+
3297+
// tailString returns at most the last maxBytes bytes of s.
3298+
func tailString(s string, maxBytes int) string {
3299+
if maxBytes <= 0 || len(s) <= maxBytes {
3300+
return s
3301+
}
3302+
return s[len(s)-maxBytes:]
3303+
}
3304+
3305+
// executionDiagnostic is a compact, bounded snapshot of a single custodian
3306+
// execution (an inventory baseline or a policy check) retained so that the
3307+
// zero-evidence failure branch can explain why nothing was produced.
3308+
type executionDiagnostic struct {
3309+
name string
3310+
resource string
3311+
isBaseline bool
3312+
exitCode int
3313+
resourceCount int
3314+
logTail string
3315+
stderrTail string
3316+
warnings []string
3317+
hadError bool
3318+
}
3319+
3320+
func newExecutionDiagnostic(name, resource string, isBaseline bool, execution CustodianExecutionResult, hadError bool) executionDiagnostic {
3321+
return executionDiagnostic{
3322+
name: name,
3323+
resource: resource,
3324+
isBaseline: isBaseline,
3325+
exitCode: execution.ExitCode,
3326+
resourceCount: len(execution.Resources),
3327+
logTail: execution.LogTail,
3328+
stderrTail: tailString(execution.Stderr, custodianOutputTailBytes),
3329+
warnings: execution.DiagnosticWarnings,
3330+
hadError: hadError,
3331+
}
3332+
}
3333+
3334+
func (d executionDiagnostic) kind() string {
3335+
if d.isBaseline {
3336+
return "baseline"
3337+
}
3338+
return "check"
3339+
}
3340+
3341+
// formatExecutionDiagnosticDetail renders the full (already capped) detail for
3342+
// one execution: its identity, exit code, resource count, diagnostic warnings,
3343+
// and the tails of stderr and custodian-run.log.
3344+
func formatExecutionDiagnosticDetail(d executionDiagnostic) string {
3345+
var sb strings.Builder
3346+
fmt.Fprintf(&sb, "\n--- %s %s [%s] exit=%d resources=%d ---\n", d.name, d.resource, d.kind(), d.exitCode, d.resourceCount)
3347+
if len(d.warnings) > 0 {
3348+
sb.WriteString("diagnostic warnings:\n")
3349+
for _, w := range d.warnings {
3350+
w = strings.TrimSpace(w)
3351+
if w == "" {
3352+
continue
3353+
}
3354+
sb.WriteString(" " + w + "\n")
3355+
}
3356+
}
3357+
if tail := strings.TrimSpace(d.stderrTail); tail != "" {
3358+
sb.WriteString("stderr tail:\n")
3359+
sb.WriteString(tail + "\n")
3360+
}
3361+
if tail := strings.TrimSpace(d.logTail); tail != "" {
3362+
sb.WriteString("custodian-run.log tail:\n")
3363+
sb.WriteString(tail + "\n")
3364+
}
3365+
return sb.String()
3366+
}
3367+
3368+
// composeZeroEvidenceError builds the error returned when no evidence was
3369+
// produced for any check. It preserves any accumulated per-policy/exec errors
3370+
// and appends, for every custodian execution, a compact one-line summary plus
3371+
// full log/stderr tails for the suspicious (zero-resource) executions. The
3372+
// generic sentence survives only as a last-resort fallback when there is
3373+
// genuinely nothing else to report.
3374+
func composeZeroEvidenceError(accumulatedErrors error, diagnostics []executionDiagnostic) error {
3375+
var sb strings.Builder
3376+
3377+
if len(diagnostics) > 0 {
3378+
fmt.Fprintf(&sb, "custodian execution summary (%d execution(s)):\n", len(diagnostics))
3379+
for _, d := range diagnostics {
3380+
fmt.Fprintf(&sb, " - %s %s [%s] exit=%d resources=%d had_error=%t\n",
3381+
d.name, d.resource, d.kind(), d.exitCode, d.resourceCount, d.hadError)
3382+
}
3383+
}
3384+
3385+
omitted := 0
3386+
for _, d := range diagnostics {
3387+
// Only the zero-resource executions are suspicious; emit their full
3388+
// tails. Executions that returned resources get the one-line summary
3389+
// above only, keeping the total bounded.
3390+
if d.resourceCount > 0 {
3391+
continue
3392+
}
3393+
section := formatExecutionDiagnosticDetail(d)
3394+
if sb.Len()+len(section) > custodianDiagnosticDetailCap {
3395+
omitted++
3396+
continue
3397+
}
3398+
sb.WriteString(section)
3399+
}
3400+
if omitted > 0 {
3401+
fmt.Fprintf(&sb, "\n(%d additional zero-resource diagnostic section(s) omitted to bound error size)\n", omitted)
3402+
}
3403+
3404+
detail := strings.TrimSpace(sb.String())
3405+
3406+
switch {
3407+
case detail != "" && accumulatedErrors != nil:
3408+
return errors.Join(accumulatedErrors, errors.New(detail))
3409+
case detail != "":
3410+
return errors.New(detail)
3411+
case accumulatedErrors != nil:
3412+
return accumulatedErrors
3413+
default:
3414+
return errors.New("policy evaluation failed for all checks")
3415+
}
3416+
}
3417+
3418+
// checkRunSummary is one row of the consolidated end-of-run summary emitted by
3419+
// Eval, giving an operator a one-glance picture across every resource type.
3420+
type checkRunSummary struct {
3421+
Check string `json:"check"`
3422+
Resource string `json:"resource"`
3423+
ExitCode int `json:"exit_code"`
3424+
BaselineResourceCount int `json:"baseline_resource_count"`
3425+
MatchedResourceCount int `json:"matched_resource_count"`
3426+
PayloadCount int `json:"payload_count"`
3427+
EvidenceCount int `json:"evidence_count"`
3428+
HadError bool `json:"had_error"`
3429+
Regions []string `json:"regions,omitempty"`
3430+
}
3431+
32133432
func (p *CloudCustodianPlugin) logPolicyPayload(payload *StandardizedResourcePayload) {
32143433
if payload == nil || !p.Logger.IsDebug() {
32153434
return

0 commit comments

Comments
 (0)