@@ -11,6 +11,7 @@ import (
1111 "fmt"
1212 "io"
1313 "io/fs"
14+ "maps"
1415 "net"
1516 "net/http"
1617 "net/url"
@@ -318,6 +319,7 @@ type CustodianExecutionResult struct {
318319 ArtifactPath string
319320 ResourcesPath string
320321 LogPaths []string
322+ LogTail string
321323 DiagnosticWarnings []string
322324}
323325
@@ -507,10 +509,12 @@ func (e *CommandCustodianExecutor) Execute(ctx context.Context, req CustodianExe
507509 e .Logger .Info ("Custodian AWS API trace enabled" , "check_name" , req .Check .Name , "trace_log_path" , traceLogPath , "pythonpath_dir" , traceDir )
508510 }
509511 }
510- e .Logger .Debug ("Executing custodian command" ,
512+ e .Logger .Info ("Executing custodian command" ,
511513 "check_name" , req .Check .Name ,
514+ "resource" , req .Check .Resource ,
512515 "command" , req .BinaryPath ,
513516 "args" , args ,
517+ "aws_regions" , regions ,
514518 )
515519 stdoutBuf := & lockedBuffer {}
516520 stderrBuf := & lockedBuffer {}
@@ -637,14 +641,14 @@ commandFinished:
637641 if resources != nil {
638642 result .Resources = resources
639643 }
640- var logTail string
641- var logErr error
642- if req . LogTailDuringRun || err != nil || runCtx . Err () != nil || resourcesErr != nil {
643- logPaths , tail , readErr := readCustodianLogArtifacts ( req . OutputDir , custodianOutputTailBytes )
644- result . LogPaths = logPaths
645- logTail = tail
646- logErr = readErr
647- }
644+ // Always capture the tail of custodian's own run log, regardless of exit
645+ // code or LogTailDuringRun. Custodian frequently exits 0 with an empty
646+ // resources.json after logging AccessDenied / EndpointConnectionError to
647+ // custodian-run.log; capturing the tail here (before the temp dir is
648+ // deleted) is the only non-invasive way to surface that cause to Eval.
649+ logPaths , logTail , logErr := readCustodianLogArtifacts ( req . OutputDir , custodianOutputTailBytes )
650+ result . LogPaths = logPaths
651+ result . LogTail = logTail
648652
649653 if err != nil {
650654 result .Err = fmt .Errorf ("custodian execution failed: %w" , err )
@@ -2455,8 +2459,13 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
24552459 successfulPolicyRuns := 0
24562460 hadCheckExecutionFailures := false
24572461 defer func () {
2458- if p .parsedConfig .PreserveArtifacts && hadCheckExecutionFailures {
2459- p .Logger .Warn ("Preserving cloud custodian execution artifacts after check execution failure" , "execution_root" , executionRoot )
2462+ zeroEvidence := successfulPolicyRuns == 0 && totalEvidenceCount == 0
2463+ if p .parsedConfig .PreserveArtifacts && (hadCheckExecutionFailures || zeroEvidence ) {
2464+ p .Logger .Warn ("Preserving cloud custodian execution artifacts for troubleshooting" ,
2465+ "execution_root" , executionRoot ,
2466+ "had_check_execution_failures" , hadCheckExecutionFailures ,
2467+ "zero_evidence" , zeroEvidence ,
2468+ )
24602469 return
24612470 }
24622471 if err := os .RemoveAll (executionRoot ); err != nil {
@@ -2465,9 +2474,21 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
24652474 }()
24662475 p .Logger .Debug ("Created temporary execution root" , "execution_root" , executionRoot )
24672476
2477+ // Retain a compact diagnostic per execution (baselines + checks) and a
2478+ // per-check summary row so the zero-evidence failure branch and the
2479+ // consolidated end-of-run summary can explain why nothing was produced.
2480+ var executionDiagnostics []executionDiagnostic
2481+ var runSummaries []checkRunSummary
2482+
24682483 baselines := p .collectInventoryBaselines (ctx , executionRoot )
2469- for resourceType , baseline := range baselines {
2470- if baseline == nil || baseline .Err != nil || len (baseline .Execution .DiagnosticWarnings ) == 0 {
2484+ for _ , resourceType := range slices .Sorted (maps .Keys (baselines )) {
2485+ baseline := baselines [resourceType ]
2486+ if baseline == nil {
2487+ continue
2488+ }
2489+ baselineName := fmt .Sprintf ("inventory-%s" , sanitizeIdentifier (resourceType ))
2490+ executionDiagnostics = append (executionDiagnostics , newExecutionDiagnostic (baselineName , resourceType , true , baseline .Execution , baseline .Err != nil ))
2491+ if baseline .Err != nil || len (baseline .Execution .DiagnosticWarnings ) == 0 {
24712492 continue
24722493 }
24732494 err := formatExecutionDiagnosticWarnings (baseline .Execution .DiagnosticWarnings )
@@ -2481,6 +2502,7 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
24812502 p .Logger .Warn ("Skipping custodian execution due to check parse issues" , "check_name" , check .Name , "parse_errors" , check .ParseErrors )
24822503 accumulatedErrors = errors .Join (accumulatedErrors , fmt .Errorf ("check %s has parse errors: %s" , check .Name , strings .Join (check .ParseErrors , "; " )))
24832504 hadCheckExecutionFailures = true
2505+ runSummaries = append (runSummaries , checkRunSummary {Check : check .Name , Resource : check .Resource , ExitCode : - 1 , HadError : true , Regions : p .parsedConfig .AWSRegions })
24842506 continue
24852507 }
24862508
@@ -2495,6 +2517,11 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
24952517 p .Logger .Error ("Skipping check due to unavailable inventory baseline" , "check_name" , check .Name , "resource" , check .Resource , "error" , err )
24962518 accumulatedErrors = errors .Join (accumulatedErrors , err )
24972519 hadCheckExecutionFailures = true
2520+ baselineRecordCount := 0
2521+ if baseline != nil {
2522+ baselineRecordCount = len (baseline .Records )
2523+ }
2524+ runSummaries = append (runSummaries , checkRunSummary {Check : check .Name , Resource : check .Resource , ExitCode : - 1 , BaselineResourceCount : baselineRecordCount , HadError : true , Regions : p .parsedConfig .AWSRegions })
24982525 continue
24992526 }
25002527
@@ -2517,6 +2544,8 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
25172544 p .Logger .Error ("Skipping resource evaluation due to check execution error" , "check_name" , check .Name , "error" , err )
25182545 accumulatedErrors = errors .Join (accumulatedErrors , err )
25192546 hadCheckExecutionFailures = true
2547+ executionDiagnostics = append (executionDiagnostics , newExecutionDiagnostic (check .Name , check .Resource , false , execution , true ))
2548+ runSummaries = append (runSummaries , checkRunSummary {Check : check .Name , Resource : check .Resource , ExitCode : execution .ExitCode , BaselineResourceCount : len (baseline .Records ), MatchedResourceCount : len (execution .Resources ), HadError : true , Regions : p .parsedConfig .AWSRegions })
25202549 continue
25212550 }
25222551
@@ -2547,10 +2576,13 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
25472576 }
25482577 }
25492578
2579+ checkEvidenceCount := 0
2580+ checkHadError := false
25502581 for _ , payload := range payloads {
25512582 evidences , evalErr , successfulRuns := p .evaluateResourcePolicies (ctx , payload , req .GetPolicyPaths ())
25522583 pendingEvidences = append (pendingEvidences , evidences ... )
25532584 totalEvidenceCount += len (evidences )
2585+ checkEvidenceCount += len (evidences )
25542586 successfulPolicyRuns += successfulRuns
25552587 p .Logger .Debug ("Completed policy evaluations for resource" ,
25562588 "check_name" , payload .Check .Name ,
@@ -2561,6 +2593,7 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
25612593 "had_eval_error" , evalErr != nil ,
25622594 )
25632595 if evalErr != nil {
2596+ checkHadError = true
25642597 accumulatedErrors = errors .Join (accumulatedErrors , evalErr )
25652598 }
25662599 for len (pendingEvidences ) >= evidenceBatchSize {
@@ -2575,7 +2608,20 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
25752608 p .Logger .Warn ("Check completed with unavailable AWS service endpoints" , "check_name" , check .Name , "error" , err )
25762609 accumulatedErrors = errors .Join (accumulatedErrors , err )
25772610 hadCheckExecutionFailures = true
2578- }
2611+ checkHadError = true
2612+ }
2613+ executionDiagnostics = append (executionDiagnostics , newExecutionDiagnostic (check .Name , check .Resource , false , execution , checkHadError ))
2614+ runSummaries = append (runSummaries , checkRunSummary {
2615+ Check : check .Name ,
2616+ Resource : check .Resource ,
2617+ ExitCode : execution .ExitCode ,
2618+ BaselineResourceCount : len (baseline .Records ),
2619+ MatchedResourceCount : len (execution .Resources ),
2620+ PayloadCount : len (payloads ),
2621+ EvidenceCount : checkEvidenceCount ,
2622+ HadError : checkHadError ,
2623+ Regions : p .parsedConfig .AWSRegions ,
2624+ })
25792625 }
25802626
25812627 if len (pendingEvidences ) > 0 {
@@ -2596,11 +2642,25 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
25962642 p .Logger .Warn ("No evidence generated by current evaluation run" )
25972643 }
25982644
2599- if successfulPolicyRuns == 0 && totalEvidenceCount == 0 {
2600- if accumulatedErrors == nil {
2601- accumulatedErrors = errors .New ("policy evaluation failed for all checks" )
2602- }
2603- return & proto.EvalResponse {Status : proto .ExecutionStatus_FAILURE }, accumulatedErrors
2645+ // Consolidated, one-glance picture across every resource type. Emitted at
2646+ // Warn when no evidence was produced, otherwise at Debug since a run that
2647+ // produced evidence is the healthy case.
2648+ zeroEvidenceOutcome := successfulPolicyRuns == 0 && totalEvidenceCount == 0
2649+ summaryLog := p .Logger .Debug
2650+ if zeroEvidenceOutcome {
2651+ summaryLog = p .Logger .Warn
2652+ }
2653+ summaryLog ("Cloud custodian evaluation summary" ,
2654+ "check_count" , len (p .checks ),
2655+ "successful_policy_runs" , successfulPolicyRuns ,
2656+ "total_evidence_count" , totalEvidenceCount ,
2657+ "had_check_execution_failures" , hadCheckExecutionFailures ,
2658+ "aws_regions" , p .parsedConfig .AWSRegions ,
2659+ "per_check" , runSummaries ,
2660+ )
2661+
2662+ if zeroEvidenceOutcome {
2663+ return & proto.EvalResponse {Status : proto .ExecutionStatus_FAILURE }, composeZeroEvidenceError (accumulatedErrors , executionDiagnostics )
26042664 }
26052665 if hadCheckExecutionFailures {
26062666 if accumulatedErrors == nil {
@@ -2648,6 +2708,7 @@ func (p *CloudCustodianPlugin) collectInventoryBaselines(ctx context.Context, ex
26482708 if execution .Err != nil || execution .Error != "" {
26492709 baselineErr = formatExecutionFailure (check .Name , execution )
26502710 }
2711+ p .warnZeroResourceBaseline (check .Name , resourceType , execution )
26512712 records := make ([]ResourceRecord , 0 , len (execution .Resources ))
26522713 for _ , resource := range execution .Resources {
26532714 records = append (records , p .buildResourceRecord (resourceType , resource ))
@@ -3185,6 +3246,24 @@ func isReservedResourceLabel(label string) bool {
31853246 }
31863247}
31873248
3249+ // warnZeroResourceBaseline emits a Warn naming the likely causes whenever an
3250+ // inventory baseline run exits 0 but returned no resources, so that an operator
3251+ // can distinguish a permissions problem from a network/endpoint problem without
3252+ // re-running with invasive tracing. This is not done for policy checks: a fully
3253+ // compliant estate legitimately matches zero resources.
3254+ func (p * CloudCustodianPlugin ) warnZeroResourceBaseline (name , resource string , execution CustodianExecutionResult ) {
3255+ if execution .ExitCode != 0 || len (execution .Resources ) > 0 {
3256+ return
3257+ }
3258+ p .Logger .Warn ("Custodian inventory baseline exited successfully but returned zero resources; likely insufficient IAM read permissions or unreachable/cross-region service endpoints" ,
3259+ "name" , name ,
3260+ "resource" , resource ,
3261+ "exit_code" , execution .ExitCode ,
3262+ "stderr_tail" , tailString (execution .Stderr , custodianOutputTailBytes ),
3263+ "log_tail" , execution .LogTail ,
3264+ )
3265+ }
3266+
31883267func formatExecutionFailure (checkName string , execution CustodianExecutionResult ) error {
31893268 switch {
31903269 case execution .Error != "" && execution .Err != nil :
@@ -3210,6 +3289,146 @@ func formatExecutionDiagnosticWarnings(messages []string) error {
32103289 return err
32113290}
32123291
3292+ // custodianDiagnosticDetailCap bounds the total size of the per-execution
3293+ // diagnostic detail appended to a zero-evidence failure error so that a policy
3294+ // pack with many resource types can never produce an unbounded gRPC error.
3295+ const custodianDiagnosticDetailCap = 32 * 1024
3296+
3297+ // tailString returns at most the last maxBytes bytes of s.
3298+ func tailString (s string , maxBytes int ) string {
3299+ if maxBytes <= 0 || len (s ) <= maxBytes {
3300+ return s
3301+ }
3302+ return s [len (s )- maxBytes :]
3303+ }
3304+
3305+ // executionDiagnostic is a compact, bounded snapshot of a single custodian
3306+ // execution (an inventory baseline or a policy check) retained so that the
3307+ // zero-evidence failure branch can explain why nothing was produced.
3308+ type executionDiagnostic struct {
3309+ name string
3310+ resource string
3311+ isBaseline bool
3312+ exitCode int
3313+ resourceCount int
3314+ logTail string
3315+ stderrTail string
3316+ warnings []string
3317+ hadError bool
3318+ }
3319+
3320+ func newExecutionDiagnostic (name , resource string , isBaseline bool , execution CustodianExecutionResult , hadError bool ) executionDiagnostic {
3321+ return executionDiagnostic {
3322+ name : name ,
3323+ resource : resource ,
3324+ isBaseline : isBaseline ,
3325+ exitCode : execution .ExitCode ,
3326+ resourceCount : len (execution .Resources ),
3327+ logTail : execution .LogTail ,
3328+ stderrTail : tailString (execution .Stderr , custodianOutputTailBytes ),
3329+ warnings : execution .DiagnosticWarnings ,
3330+ hadError : hadError ,
3331+ }
3332+ }
3333+
3334+ func (d executionDiagnostic ) kind () string {
3335+ if d .isBaseline {
3336+ return "baseline"
3337+ }
3338+ return "check"
3339+ }
3340+
3341+ // formatExecutionDiagnosticDetail renders the full (already capped) detail for
3342+ // one execution: its identity, exit code, resource count, diagnostic warnings,
3343+ // and the tails of stderr and custodian-run.log.
3344+ func formatExecutionDiagnosticDetail (d executionDiagnostic ) string {
3345+ var sb strings.Builder
3346+ fmt .Fprintf (& sb , "\n --- %s %s [%s] exit=%d resources=%d ---\n " , d .name , d .resource , d .kind (), d .exitCode , d .resourceCount )
3347+ if len (d .warnings ) > 0 {
3348+ sb .WriteString ("diagnostic warnings:\n " )
3349+ for _ , w := range d .warnings {
3350+ w = strings .TrimSpace (w )
3351+ if w == "" {
3352+ continue
3353+ }
3354+ sb .WriteString (" " + w + "\n " )
3355+ }
3356+ }
3357+ if tail := strings .TrimSpace (d .stderrTail ); tail != "" {
3358+ sb .WriteString ("stderr tail:\n " )
3359+ sb .WriteString (tail + "\n " )
3360+ }
3361+ if tail := strings .TrimSpace (d .logTail ); tail != "" {
3362+ sb .WriteString ("custodian-run.log tail:\n " )
3363+ sb .WriteString (tail + "\n " )
3364+ }
3365+ return sb .String ()
3366+ }
3367+
3368+ // composeZeroEvidenceError builds the error returned when no evidence was
3369+ // produced for any check. It preserves any accumulated per-policy/exec errors
3370+ // and appends, for every custodian execution, a compact one-line summary plus
3371+ // full log/stderr tails for the suspicious (zero-resource) executions. The
3372+ // generic sentence survives only as a last-resort fallback when there is
3373+ // genuinely nothing else to report.
3374+ func composeZeroEvidenceError (accumulatedErrors error , diagnostics []executionDiagnostic ) error {
3375+ var sb strings.Builder
3376+
3377+ if len (diagnostics ) > 0 {
3378+ fmt .Fprintf (& sb , "custodian execution summary (%d execution(s)):\n " , len (diagnostics ))
3379+ for _ , d := range diagnostics {
3380+ fmt .Fprintf (& sb , " - %s %s [%s] exit=%d resources=%d had_error=%t\n " ,
3381+ d .name , d .resource , d .kind (), d .exitCode , d .resourceCount , d .hadError )
3382+ }
3383+ }
3384+
3385+ omitted := 0
3386+ for _ , d := range diagnostics {
3387+ // Only the zero-resource executions are suspicious; emit their full
3388+ // tails. Executions that returned resources get the one-line summary
3389+ // above only, keeping the total bounded.
3390+ if d .resourceCount > 0 {
3391+ continue
3392+ }
3393+ section := formatExecutionDiagnosticDetail (d )
3394+ if sb .Len ()+ len (section ) > custodianDiagnosticDetailCap {
3395+ omitted ++
3396+ continue
3397+ }
3398+ sb .WriteString (section )
3399+ }
3400+ if omitted > 0 {
3401+ fmt .Fprintf (& sb , "\n (%d additional zero-resource diagnostic section(s) omitted to bound error size)\n " , omitted )
3402+ }
3403+
3404+ detail := strings .TrimSpace (sb .String ())
3405+
3406+ switch {
3407+ case detail != "" && accumulatedErrors != nil :
3408+ return errors .Join (accumulatedErrors , errors .New (detail ))
3409+ case detail != "" :
3410+ return errors .New (detail )
3411+ case accumulatedErrors != nil :
3412+ return accumulatedErrors
3413+ default :
3414+ return errors .New ("policy evaluation failed for all checks" )
3415+ }
3416+ }
3417+
3418+ // checkRunSummary is one row of the consolidated end-of-run summary emitted by
3419+ // Eval, giving an operator a one-glance picture across every resource type.
3420+ type checkRunSummary struct {
3421+ Check string `json:"check"`
3422+ Resource string `json:"resource"`
3423+ ExitCode int `json:"exit_code"`
3424+ BaselineResourceCount int `json:"baseline_resource_count"`
3425+ MatchedResourceCount int `json:"matched_resource_count"`
3426+ PayloadCount int `json:"payload_count"`
3427+ EvidenceCount int `json:"evidence_count"`
3428+ HadError bool `json:"had_error"`
3429+ Regions []string `json:"regions,omitempty"`
3430+ }
3431+
32133432func (p * CloudCustodianPlugin ) logPolicyPayload (payload * StandardizedResourcePayload ) {
32143433 if payload == nil || ! p .Logger .IsDebug () {
32153434 return
0 commit comments