diff --git a/README.md b/README.md index 9519344..459afd5 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ All plugin config fields are strings (agent gRPC `map` contract). | `custodian_debug` | No | Boolean (`true`/`false`) toggle to pass `--debug` to Cloud Custodian. This increases Cloud Custodian diagnostic output on stderr. Default: `false`. | | `custodian_verbose` | No | Boolean (`true`/`false`) toggle to pass `-v` to Cloud Custodian. This increases Cloud Custodian diagnostic output on stderr. Default: `false`. | | `custodian_aws_api_trace` | No | Boolean (`true`/`false`) toggle to inject a temporary Python `sitecustomize.py` into the Custodian child process. Logs botocore API start/end/error events to stderr and `custodian-aws-api-trace.jsonl` in the check output directory. Default: `false`. | -| `custodian_network_diagnostics` | No | Boolean (`true`/`false`) toggle to run Go DNS/TLS preflight probes for relevant AWS service endpoints before Custodian starts and log child process TCP socket snapshots while Custodian is running. Preflight failures stop the Custodian check before execution. If `aws_regions` is unset or only `all`, service-derived probes are skipped unless `custodian_network_diagnostic_endpoints` is configured. For AWS resource types not mapped to diagnostic services, diagnostics require explicit `custodian_network_diagnostic_endpoints`; otherwise the preflight fails. Default: `false`. | +| `custodian_network_diagnostics` | No | Boolean (`true`/`false`) toggle to run Go DNS/TLS preflight probes for relevant AWS service endpoints before Custodian starts and log child process TCP socket snapshots while Custodian is running. DNS/TLS probe failures are logged as warnings and surfaced in evidence as `execution.warnings`. Concrete `aws_regions` are narrowed to regions with reachable resource service endpoints before Custodian runs; if none are reachable for a check, that check is skipped, later checks continue, and the accumulated diagnostic warnings are returned after reachable evidence is submitted. If no concrete `aws_regions` are configured, service-derived probes are skipped unless `custodian_network_diagnostic_endpoints` is configured. For AWS resource types not mapped to diagnostic services, diagnostics warn and continue unless explicit endpoints are configured. Invalid configured endpoints still fail before execution. Default: `false`. | | `custodian_network_diagnostic_endpoints` | No | Comma or whitespace separated list of additional endpoint hostnames, `host:port` values, or HTTPS URLs to DNS/TLS probe when `custodian_network_diagnostics` is enabled. Non-HTTPS URL schemes are rejected. Use this for AWS VPC endpoint DNS names such as `vpce-123.backup.eu-west-1.vpce.amazonaws.com`. Default: unset. | | `custodian_log_tail_during_run` | No | Boolean (`true`/`false`) toggle to tail discovered `custodian-run.log` artifacts during the monitor loop, not only after process exit. Default: `false`. | | `aws_regions` | No | Comma or whitespace separated AWS regions passed as repeated `--region` flags. Duplicate entries are removed while preserving order. Default: unset, which falls back to `--region all` for AWS checks. | diff --git a/main.go b/main.go index c3db01b..7426496 100644 --- a/main.go +++ b/main.go @@ -306,18 +306,19 @@ type CustodianExecutionRequest struct { // CustodianExecutionResult captures runtime output and artifacts from one check run. type CustodianExecutionResult struct { - StartedAt time.Time - EndedAt time.Time - ExitCode int - Stdout string - Stderr string - Error string - Errors []string - Err error - Resources []interface{} - ArtifactPath string - ResourcesPath string - LogPaths []string + StartedAt time.Time + EndedAt time.Time + ExitCode int + Stdout string + Stderr string + Error string + Errors []string + Err error + Resources []interface{} + ArtifactPath string + ResourcesPath string + LogPaths []string + DiagnosticWarnings []string } // CustodianExecutor runs one Cloud Custodian check and captures execution artifacts. @@ -434,34 +435,64 @@ func (e *CommandCustodianExecutor) Execute(ctx context.Context, req CustodianExe runCtx, cancel := context.WithTimeout(ctx, req.Timeout) defer cancel() - args := []string{"run"} - if req.Debug { - args = append(args, "--debug") - } - if req.Verbose { - args = append(args, "-v") - } - args = append(args, "--dryrun", "-s", req.OutputDir, policyPath) + regions := req.AWSRegions if strings.EqualFold(req.Check.Provider, "aws") { // Ensure AWS policies evaluate across all regions by default while // allowing operators to narrow problematic service/region scans. - regions := req.AWSRegions if len(regions) == 0 { regions = []string{"all"} } - for _, region := range regions { - args = append(args, "--region", region) - } } if req.NetworkDiagnostics && strings.EqualFold(req.Check.Provider, "aws") { - if diagErr := e.runAWSEndpointDiagnostics(runCtx, req); diagErr != nil { + diagnostics, diagErr := e.runAWSEndpointDiagnostics(runCtx, req) + result.DiagnosticWarnings = append(result.DiagnosticWarnings, diagnostics.executionWarnings(req.Check)...) + if diagErr != nil { result.Err = fmt.Errorf("aws endpoint network diagnostics failed: %w", diagErr) - result.Error = result.Err.Error() - result.Errors = []string{result.Error} + result.Errors = []string{result.Err.Error()} + result.Error = executionErrorString(result) result.EndedAt = time.Now().UTC() e.Logger.Error("Skipping custodian command because AWS endpoint diagnostics failed", "check_name", req.Check.Name, "error", result.Error) return result } + if availableRegions, ok := diagnostics.availableAWSRegions(regions); ok { + if len(availableRegions) == 0 { + result.Err = fmt.Errorf("cloud custodian policy %s could not be checked because no AWS service endpoints were reachable for resource %s in requested regions %s", req.Check.Name, req.Check.Resource, strings.Join(regions, ",")) + result.Errors = append([]string{result.Err.Error()}, result.Errors...) + result.Error = executionErrorString(result) + e.Logger.Warn("Skipping custodian command because no AWS service endpoints were reachable for policy", + "check_name", req.Check.Name, + "resource", req.Check.Resource, + "aws_regions", regions, + "unavailable_endpoint_count", len(diagnostics.Failures), + ) + result.EndedAt = time.Now().UTC() + return result + } + if !slices.Equal(regions, availableRegions) { + e.Logger.Warn("Running custodian command only for AWS regions with reachable service endpoints", + "check_name", req.Check.Name, + "resource", req.Check.Resource, + "requested_aws_regions", regions, + "available_aws_regions", availableRegions, + "unavailable_endpoint_count", len(diagnostics.Failures), + ) + regions = availableRegions + } + } + } + + args := []string{"run"} + if req.Debug { + args = append(args, "--debug") + } + if req.Verbose { + args = append(args, "-v") + } + args = append(args, "--dryrun", "-s", req.OutputDir, policyPath) + if strings.EqualFold(req.Check.Provider, "aws") { + for _, region := range regions { + args = append(args, "--region", region) + } } cmd := exec.CommandContext(runCtx, req.BinaryPath, args...) cmd.Env = os.Environ() @@ -650,7 +681,7 @@ commandFinished: } if result.Err != nil { - result.Error = strings.Join(result.Errors, "; ") + result.Error = executionErrorString(result) e.Logger.Warn("Custodian execution completed with errors", "check_name", req.Check.Name, "error_count", len(result.Errors), @@ -784,40 +815,83 @@ type networkDiagnosticEndpoint struct { Port string ServerName string Source string + Service string + Region string +} + +type awsEndpointDiagnosticFailure struct { + Endpoint networkDiagnosticEndpoint + Stage string + Err error +} + +type awsEndpointDiagnosticResult struct { + Failures []awsEndpointDiagnosticFailure + regionProbeSucceeded map[string]bool + regionProbeFailed map[string]bool +} + +func (r awsEndpointDiagnosticResult) availableAWSRegions(regions []string) ([]string, bool) { + if len(r.regionProbeSucceeded) == 0 && len(r.regionProbeFailed) == 0 { + return nil, false + } + available := make([]string, 0, len(regions)) + for _, region := range regions { + region = strings.TrimSpace(region) + if region == "" || strings.EqualFold(region, "all") { + continue + } + if r.regionProbeSucceeded[region] && !r.regionProbeFailed[region] { + available = append(available, region) + } + } + return available, true } -func (e *CommandCustodianExecutor) runAWSEndpointDiagnostics(ctx context.Context, req CustodianExecutionRequest) error { +func (e *CommandCustodianExecutor) runAWSEndpointDiagnostics(ctx context.Context, req CustodianExecutionRequest) (awsEndpointDiagnosticResult, error) { + result := awsEndpointDiagnosticResult{ + Failures: []awsEndpointDiagnosticFailure{}, + regionProbeSucceeded: map[string]bool{}, + regionProbeFailed: map[string]bool{}, + } endpoints, knownResource, endpointErr := awsDiagnosticEndpointsForCheck(req.Check.Resource, req.AWSRegions, req.NetworkDiagnosticEndpoints) if endpointErr != nil { e.Logger.Error("AWS endpoint diagnostics configuration is invalid", "check_name", req.Check.Name, "resource", req.Check.Resource, "error", endpointErr) - return endpointErr + return result, endpointErr } if !knownResource && len(endpoints) == 0 { - err := fmt.Errorf("resource service is not mapped for AWS endpoint diagnostics: %s", req.Check.Resource) - e.Logger.Error("AWS endpoint diagnostics failed before custodian execution", "check_name", req.Check.Name, "resource", req.Check.Resource, "error", err) - return err + e.Logger.Warn("Skipping AWS endpoint diagnostics because resource service is not mapped and no explicit endpoints are configured", "check_name", req.Check.Name, "resource", req.Check.Resource) + return result, nil } if len(endpoints) == 0 { e.Logger.Warn("Skipping AWS endpoint diagnostics because no concrete endpoint hosts are available; configure aws_regions or custodian_network_diagnostic_endpoints for preflight probes", "check_name", req.Check.Name, "resource", req.Check.Resource, "aws_regions", req.AWSRegions) - return nil + return result, nil } if !knownResource { e.Logger.Warn("AWS endpoint diagnostics will use only configured endpoints because resource service is not mapped", "check_name", req.Check.Name, "resource", req.Check.Resource) } - var diagnosticsErr error for _, endpoint := range endpoints { if err := ctx.Err(); err != nil { - return errors.Join(diagnosticsErr, err) + return result, err } lookupCtx, cancel := context.WithTimeout(ctx, 5*time.Second) lookupStarted := time.Now() ips, err := lookupHost(lookupCtx, endpoint.Host) cancel() if err != nil { - probeErr := fmt.Errorf("DNS lookup failed for %s endpoint %s: %w", endpoint.Source, endpoint.Host, err) - diagnosticsErr = errors.Join(diagnosticsErr, probeErr) - e.Logger.Warn("AWS endpoint DNS lookup failed", "check_name", req.Check.Name, "host", endpoint.Host, "source", endpoint.Source, "elapsed", time.Since(lookupStarted).Round(time.Millisecond).String(), "error", err) + result.recordFailure(endpoint, "DNS lookup", err) + e.Logger.Warn("AWS endpoint diagnostics detected an unreachable service endpoint; evaluation may be partial", + "check_name", req.Check.Name, + "resource", req.Check.Resource, + "service", endpoint.Service, + "region", endpoint.Region, + "host", endpoint.Host, + "stage", "DNS lookup", + "source", endpoint.Source, + "elapsed", time.Since(lookupStarted).Round(time.Millisecond).String(), + "error", err, + ) continue } else { e.Logger.Info("AWS endpoint DNS lookup succeeded", "check_name", req.Check.Name, "host", endpoint.Host, "source", endpoint.Source, "ips", ips, "elapsed", time.Since(lookupStarted).Round(time.Millisecond).String()) @@ -826,14 +900,90 @@ func (e *CommandCustodianExecutor) runAWSEndpointDiagnostics(ctx context.Context tlsStarted := time.Now() tlsResult, err := tlsProbeEndpoint(ctx, endpoint) if err != nil { - probeErr := fmt.Errorf("TLS handshake failed for %s endpoint %s:%s: %w", endpoint.Source, endpoint.Host, endpoint.Port, err) - diagnosticsErr = errors.Join(diagnosticsErr, probeErr) - e.Logger.Warn("AWS endpoint TLS probe failed", "check_name", req.Check.Name, "host", endpoint.Host, "port", endpoint.Port, "server_name", endpoint.ServerName, "source", endpoint.Source, "elapsed", time.Since(tlsStarted).Round(time.Millisecond).String(), "error", err) + result.recordFailure(endpoint, "TLS handshake", err) + e.Logger.Warn("AWS endpoint diagnostics detected an unreachable service endpoint; evaluation may be partial", + "check_name", req.Check.Name, + "resource", req.Check.Resource, + "service", endpoint.Service, + "region", endpoint.Region, + "host", endpoint.Host, + "stage", "TLS handshake", + "port", endpoint.Port, + "server_name", endpoint.ServerName, + "source", endpoint.Source, + "elapsed", time.Since(tlsStarted).Round(time.Millisecond).String(), + "error", err, + ) continue } + if endpoint.Source == "aws-service" && endpoint.Region != "" { + if result.regionProbeSucceeded == nil { + result.regionProbeSucceeded = map[string]bool{} + } + result.regionProbeSucceeded[endpoint.Region] = true + } e.Logger.Info("AWS endpoint TLS probe succeeded", "check_name", req.Check.Name, "host", endpoint.Host, "port", endpoint.Port, "server_name", endpoint.ServerName, "source", endpoint.Source, "remote_addr", tlsResult.RemoteAddr, "tls_version", tlsResult.TLSVersion, "elapsed", time.Since(tlsStarted).Round(time.Millisecond).String()) } - return diagnosticsErr + return result, nil +} + +func (r *awsEndpointDiagnosticResult) recordFailure(endpoint networkDiagnosticEndpoint, stage string, err error) { + if r.regionProbeFailed == nil { + r.regionProbeFailed = map[string]bool{} + } + r.Failures = append(r.Failures, awsEndpointDiagnosticFailure{ + Endpoint: endpoint, + Stage: stage, + Err: err, + }) + if endpoint.Source == "aws-service" && endpoint.Region != "" { + r.regionProbeFailed[endpoint.Region] = true + } +} + +func (r awsEndpointDiagnosticResult) executionWarnings(check CustodianCheck) []string { + if len(r.Failures) == 0 { + return nil + } + messages := make([]string, 0, len(r.Failures)) + for _, failure := range r.Failures { + endpoint := failure.Endpoint + service := endpoint.Service + if service == "" { + service = endpoint.Source + } + region := endpoint.Region + if region == "" { + region = "global" + } + messages = append(messages, fmt.Sprintf( + "unreachable AWS service endpoint %s.%s (%s:%s) detected while evaluating cloud custodian policy %s; evaluation may be partial: %s failed: %v", + service, + region, + endpoint.Host, + endpoint.Port, + check.Name, + failure.Stage, + failure.Err, + )) + } + return messages +} + +func executionErrorString(result CustodianExecutionResult) string { + messages := make([]string, 0, len(result.Errors)+len(result.DiagnosticWarnings)) + seen := map[string]bool{} + for _, values := range [][]string{result.Errors, result.DiagnosticWarnings} { + for _, message := range values { + message = strings.TrimSpace(message) + if message == "" || seen[message] { + continue + } + messages = append(messages, message) + seen[message] = true + } + } + return strings.Join(messages, "; ") } func awsEndpointHostsForCheck(resource string, regions []string) ([]string, bool) { @@ -877,11 +1027,17 @@ func awsDiagnosticEndpointsForCheck(resource string, regions []string, configure for _, region := range diagnosticRegions { for _, service := range services { host := awsServiceEndpointHost(service, region) + endpointRegion := region + if _, ok := awsGlobalEndpointServices[service]; ok { + endpointRegion = "" + } endpoints = append(endpoints, networkDiagnosticEndpoint{ Host: host, Port: "443", ServerName: host, Source: "aws-service", + Service: service, + Region: endpointRegion, }) } } @@ -960,14 +1116,20 @@ func networkDiagnosticEndpointSource(host string) string { } func compactUniqueNetworkDiagnosticEndpoints(values []networkDiagnosticEndpoint) []networkDiagnosticEndpoint { - seen := map[string]bool{} + seen := map[string]int{} result := make([]networkDiagnosticEndpoint, 0, len(values)) for _, value := range values { key := strings.ToLower(net.JoinHostPort(value.Host, value.Port)) - if value.Host == "" || value.Port == "" || seen[key] { + if value.Host == "" || value.Port == "" { continue } - seen[key] = true + if existingIndex, ok := seen[key]; ok { + if result[existingIndex].Source != "aws-service" && value.Source == "aws-service" { + result[existingIndex] = value + } + continue + } + seen[key] = len(result) result = append(result, value) } return result @@ -1015,7 +1177,7 @@ func awsServicesForResource(resource string) ([]string, bool) { if !ok { return nil, false } - return compactUniqueStrings(append([]string{"sts", "ec2", "tagging"}, services...)), true + return compactUniqueStrings(services), true } func tlsVersionString(version uint16) string { @@ -1387,6 +1549,7 @@ type StandardizedExecution struct { Stderr string `json:"stderr,omitempty"` Error string `json:"error,omitempty"` Errors []string `json:"errors,omitempty"` + Warnings []string `json:"warnings,omitempty"` } // StandardizedResourcePayload is the per-resource OPA input contract. @@ -1498,6 +1661,10 @@ func buildExecutionInfo(execution CustodianExecutionResult) StandardizedExecutio if len(execution.Errors) > 0 { executionErrors = append([]string{}, execution.Errors...) } + var executionWarnings []string + if len(execution.DiagnosticWarnings) > 0 { + executionWarnings = append([]string{}, execution.DiagnosticWarnings...) + } return StandardizedExecution{ Status: status, @@ -1510,6 +1677,7 @@ func buildExecutionInfo(execution CustodianExecutionResult) StandardizedExecutio Stderr: execution.Stderr, Error: execution.Error, Errors: executionErrors, + Warnings: executionWarnings, } } @@ -2298,6 +2466,15 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api p.Logger.Debug("Created temporary execution root", "execution_root", executionRoot) baselines := p.collectInventoryBaselines(ctx, executionRoot) + for resourceType, baseline := range baselines { + if baseline == nil || baseline.Err != nil || len(baseline.Execution.DiagnosticWarnings) == 0 { + continue + } + err := formatExecutionDiagnosticWarnings(baseline.Execution.DiagnosticWarnings) + p.Logger.Warn("Inventory baseline completed with unavailable AWS service endpoints", "resource", resourceType, "error", err) + accumulatedErrors = errors.Join(accumulatedErrors, err) + hadCheckExecutionFailures = true + } for _, check := range p.checks { p.Logger.Debug("Processing check", "check_name", check.Name, "check_index", check.Index, "resource", check.Resource, "provider", check.Provider) if len(check.ParseErrors) > 0 { @@ -2393,6 +2570,12 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api pendingEvidences = pendingEvidences[evidenceBatchSize:] } } + if len(execution.DiagnosticWarnings) > 0 { + err := formatExecutionDiagnosticWarnings(execution.DiagnosticWarnings) + p.Logger.Warn("Check completed with unavailable AWS service endpoints", "check_name", check.Name, "error", err) + accumulatedErrors = errors.Join(accumulatedErrors, err) + hadCheckExecutionFailures = true + } } if len(pendingEvidences) > 0 { @@ -3015,6 +3198,18 @@ func formatExecutionFailure(checkName string, execution CustodianExecutionResult } } +func formatExecutionDiagnosticWarnings(messages []string) error { + var err error + for _, message := range messages { + message = strings.TrimSpace(message) + if message == "" { + continue + } + err = errors.Join(err, fmt.Errorf("custodian policy execution warning: %s", message)) + } + return err +} + func (p *CloudCustodianPlugin) logPolicyPayload(payload *StandardizedResourcePayload) { if payload == nil || !p.Logger.IsDebug() { return diff --git a/main_test.go b/main_test.go index 3dc4600..21109f9 100644 --- a/main_test.go +++ b/main_test.go @@ -515,7 +515,7 @@ printf '[]' > "$out/test-policy/resources.json" } }) - t.Run("network diagnostics failure prevents custodian execution", func(t *testing.T) { + t.Run("network diagnostics failure warns and still executes custodian", func(t *testing.T) { executedFile := filepath.Join(t.TempDir(), "executed.txt") t.Setenv("EXECUTED_FILE", executedFile) stubNetworkDiagnostics( @@ -537,6 +537,17 @@ printf '[]' > "$out/test-policy/resources.json" script := `#!/bin/sh set -eu touch "$EXECUTED_FILE" +out="" +while [ "$#" -gt 0 ]; do + if [ "$1" = "-s" ]; then + out="$2" + shift 2 + continue + fi + shift +done +mkdir -p "$out/test-policy" +printf '[]' > "$out/test-policy/resources.json" ` binary := writeExecutableScript(t, script) executor := &CommandCustodianExecutor{Logger: hclog.NewNullLogger()} @@ -555,11 +566,192 @@ touch "$EXECUTED_FILE" NetworkDiagnosticEndpoints: []string{"https://vpce-123.backup.eu-west-1.vpce.amazonaws.com"}, }) + if result.Err != nil { + t.Fatalf("expected diagnostic failure to be non-fatal, got: %v", result.Err) + } + if _, err := os.Stat(executedFile); err != nil { + t.Fatalf("expected custodian command to execute, stat err: %v", err) + } + if len(result.DiagnosticWarnings) != 1 || !strings.Contains(result.DiagnosticWarnings[0], "handshake failed") { + t.Fatalf("expected diagnostic warning to be captured, got %#v", result.DiagnosticWarnings) + } + if len(result.Errors) != 0 { + t.Fatalf("did not expect successful execution warnings to be surfaced as errors, got %#v", result.Errors) + } + if result.Error != "" { + t.Fatalf("did not expect successful execution to set Error, got %q", result.Error) + } + executionInfo := buildExecutionInfo(result) + if executionInfo.Status != "success" { + t.Fatalf("expected successful execution status, got %q", executionInfo.Status) + } + if len(executionInfo.Warnings) != 1 || !strings.Contains(executionInfo.Warnings[0], "handshake failed") { + t.Fatalf("expected diagnostic warning in execution warnings, got %#v", executionInfo.Warnings) + } + if len(executionInfo.Errors) != 0 { + t.Fatalf("did not expect diagnostic warning in execution errors, got %#v", executionInfo.Errors) + } + }) + + t.Run("network diagnostics warnings are included when custodian execution fails", func(t *testing.T) { + stubNetworkDiagnostics( + t, + func(ctx context.Context, host string) ([]string, error) { + return []string{"10.0.0.10"}, nil + }, + func(ctx context.Context, endpoint networkDiagnosticEndpoint) (tlsProbeResult, error) { + return tlsProbeResult{}, errors.New("handshake failed") + }, + ) + + script := `#!/bin/sh +set -eu +exit 7 +` + binary := writeExecutableScript(t, script) + executor := &CommandCustodianExecutor{Logger: hclog.NewNullLogger()} + + result := executor.Execute(context.Background(), CustodianExecutionRequest{ + BinaryPath: binary, + Check: CustodianCheck{ + Name: "test-policy", + Resource: "aws.backup-vault", + Provider: "aws", + RawPolicy: map[string]interface{}{"name": "test-policy", "resource": "aws.backup-vault"}, + }, + Timeout: 5 * time.Second, + OutputDir: filepath.Join(t.TempDir(), "out"), + NetworkDiagnostics: true, + NetworkDiagnosticEndpoints: []string{"https://vpce-123.backup.eu-west-1.vpce.amazonaws.com"}, + }) if result.Err == nil { - t.Fatalf("expected network diagnostics failure") + t.Fatalf("expected custodian execution failure") + } + if !strings.Contains(result.Error, "custodian execution failed") { + t.Fatalf("expected custodian execution error, got %q", result.Error) } - if !strings.Contains(result.Error, "aws endpoint network diagnostics failed") || !strings.Contains(result.Error, "handshake failed") { - t.Fatalf("expected diagnostic failure detail, got: %s", result.Error) + if !strings.Contains(result.Error, "handshake failed") { + t.Fatalf("expected diagnostic warning to be included in execution error, got %q", result.Error) + } + for _, executionError := range result.Errors { + if strings.Contains(executionError, "handshake failed") { + t.Fatalf("did not expect diagnostic warning to be classified as execution error, got %#v", result.Errors) + } + } + executionInfo := buildExecutionInfo(result) + if len(executionInfo.Warnings) != 1 || !strings.Contains(executionInfo.Warnings[0], "handshake failed") { + t.Fatalf("expected diagnostic warning in execution warnings, got %#v", executionInfo.Warnings) + } + }) + + t.Run("network diagnostics filters unavailable concrete aws regions", func(t *testing.T) { + argsFile := filepath.Join(t.TempDir(), "args.txt") + t.Setenv("ARGS_FILE", argsFile) + stubNetworkDiagnostics( + t, + func(ctx context.Context, host string) ([]string, error) { + return []string{"10.0.0.10"}, nil + }, + func(ctx context.Context, endpoint networkDiagnosticEndpoint) (tlsProbeResult, error) { + if endpoint.Host == "s3.eu-west-2.amazonaws.com" { + return tlsProbeResult{}, errors.New("handshake failed") + } + return tlsProbeResult{RemoteAddr: "10.0.0.10:443", TLSVersion: "TLS1.3"}, nil + }, + ) + + script := `#!/bin/sh +set -eu +echo "$@" > "$ARGS_FILE" +out="" +while [ "$#" -gt 0 ]; do + if [ "$1" = "-s" ]; then + out="$2" + shift 2 + continue + fi + shift +done +mkdir -p "$out/test-policy" +printf '[]' > "$out/test-policy/resources.json" +` + binary := writeExecutableScript(t, script) + executor := &CommandCustodianExecutor{Logger: hclog.NewNullLogger()} + + result := executor.Execute(context.Background(), CustodianExecutionRequest{ + BinaryPath: binary, + Check: CustodianCheck{ + Name: "test-policy", + Resource: "aws.s3", + Provider: "aws", + RawPolicy: map[string]interface{}{"name": "test-policy", "resource": "aws.s3"}, + }, + Timeout: 5 * time.Second, + OutputDir: filepath.Join(t.TempDir(), "out"), + AWSRegions: []string{"eu-west-1", "eu-west-2"}, + NetworkDiagnostics: true, + }) + if result.Err != nil { + t.Fatalf("expected successful execution, got error: %v", result.Err) + } + if len(result.DiagnosticWarnings) != 1 || !strings.Contains(result.DiagnosticWarnings[0], "s3.eu-west-2.amazonaws.com") { + t.Fatalf("expected unavailable region diagnostic warning, got %#v", result.DiagnosticWarnings) + } + + argsContent, err := os.ReadFile(argsFile) + if err != nil { + t.Fatalf("failed to read args capture file: %v", err) + } + argsStr := string(argsContent) + if !strings.Contains(argsStr, "--region eu-west-1") { + t.Fatalf("expected available region to be passed, got: %s", argsStr) + } + if strings.Contains(argsStr, "--region eu-west-2") { + t.Fatalf("did not expect unavailable region to be passed, got: %s", argsStr) + } + }) + + t.Run("network diagnostics skips custodian when all concrete aws regions are unavailable", func(t *testing.T) { + executedFile := filepath.Join(t.TempDir(), "executed.txt") + t.Setenv("EXECUTED_FILE", executedFile) + stubNetworkDiagnostics( + t, + func(ctx context.Context, host string) ([]string, error) { + return []string{"10.0.0.10"}, nil + }, + func(ctx context.Context, endpoint networkDiagnosticEndpoint) (tlsProbeResult, error) { + return tlsProbeResult{}, fmt.Errorf("%s unavailable", endpoint.Host) + }, + ) + + script := `#!/bin/sh +set -eu +touch "$EXECUTED_FILE" +` + binary := writeExecutableScript(t, script) + executor := &CommandCustodianExecutor{Logger: hclog.NewNullLogger()} + + result := executor.Execute(context.Background(), CustodianExecutionRequest{ + BinaryPath: binary, + Check: CustodianCheck{ + Name: "test-policy", + Resource: "aws.backup-vault", + Provider: "aws", + RawPolicy: map[string]interface{}{"name": "test-policy", "resource": "aws.backup-vault"}, + }, + Timeout: 5 * time.Second, + OutputDir: filepath.Join(t.TempDir(), "out"), + AWSRegions: []string{"eu-west-1", "eu-west-2"}, + NetworkDiagnostics: true, + }) + if result.Err == nil { + t.Fatalf("expected all-unavailable diagnostics to skip with execution error") + } + if !strings.Contains(result.Error, "no AWS service endpoints were reachable") { + t.Fatalf("expected all-unavailable error detail, got %q", result.Error) + } + if len(result.DiagnosticWarnings) != 2 { + t.Fatalf("expected both unavailable endpoint diagnostics, got %#v", result.DiagnosticWarnings) } if _, err := os.Stat(executedFile); !errors.Is(err, os.ErrNotExist) { t.Fatalf("expected custodian command not to execute, stat err: %v", err) @@ -632,7 +824,7 @@ printf '[]' > "$out/test-policy/resources.json" ctx, cancel := context.WithCancel(context.Background()) cancel() - err := executor.runAWSEndpointDiagnostics(ctx, CustodianExecutionRequest{ + _, err := executor.runAWSEndpointDiagnostics(ctx, CustodianExecutionRequest{ Check: CustodianCheck{ Name: "test-policy", Resource: "aws.backup-vault", @@ -1152,9 +1344,6 @@ func TestDiagnosticHelpers(t *testing.T) { t.Fatalf("expected backup-vault to be mapped") } want := []string{ - "sts.eu-west-1.amazonaws.com", - "ec2.eu-west-1.amazonaws.com", - "tagging.eu-west-1.amazonaws.com", "backup.eu-west-1.amazonaws.com", } if strings.Join(hosts, ",") != strings.Join(want, ",") { @@ -1168,13 +1357,7 @@ func TestDiagnosticHelpers(t *testing.T) { t.Fatalf("expected iam-role to be mapped") } want := []string{ - "sts.eu-west-1.amazonaws.com", - "ec2.eu-west-1.amazonaws.com", - "tagging.eu-west-1.amazonaws.com", "iam.amazonaws.com", - "sts.us-east-1.amazonaws.com", - "ec2.us-east-1.amazonaws.com", - "tagging.us-east-1.amazonaws.com", } if strings.Join(hosts, ",") != strings.Join(want, ",") { t.Fatalf("unexpected hosts: %#v", hosts) @@ -1192,9 +1375,6 @@ func TestDiagnosticHelpers(t *testing.T) { t.Fatalf("expected iam-role to be mapped") } want := []string{ - "sts.cn-north-1.amazonaws.com.cn", - "ec2.cn-north-1.amazonaws.com.cn", - "tagging.cn-north-1.amazonaws.com.cn", "iam.amazonaws.com.cn", } if strings.Join(hosts, ",") != strings.Join(want, ",") { @@ -1208,9 +1388,6 @@ func TestDiagnosticHelpers(t *testing.T) { t.Fatalf("expected iam-role to be mapped") } want := []string{ - "sts.us-gov-west-1.amazonaws.com", - "ec2.us-gov-west-1.amazonaws.com", - "tagging.us-gov-west-1.amazonaws.com", "iam.us-gov.amazonaws.com", } if strings.Join(hosts, ",") != strings.Join(want, ",") { @@ -1244,6 +1421,29 @@ func TestDiagnosticHelpers(t *testing.T) { } }) + t.Run("prefers derived aws service endpoint metadata over duplicate configured endpoints", func(t *testing.T) { + endpoints, known, err := awsDiagnosticEndpointsForCheck( + "aws.s3", + []string{"eu-west-1"}, + []string{"https://s3.eu-west-1.amazonaws.com"}, + ) + if err != nil { + t.Fatalf("unexpected endpoint parse error: %v", err) + } + if !known { + t.Fatalf("expected s3 to be mapped") + } + if len(endpoints) != 1 { + t.Fatalf("expected duplicate endpoint to compact to one entry, got %#v", endpoints) + } + if endpoints[0].Source != "aws-service" { + t.Fatalf("expected derived aws-service metadata to win, got %#v", endpoints[0]) + } + if endpoints[0].Service != "s3" || endpoints[0].Region != "eu-west-1" { + t.Fatalf("expected service/region metadata to be preserved, got %#v", endpoints[0]) + } + }) + t.Run("uses strict vpc endpoint suffix classification", func(t *testing.T) { if got := networkDiagnosticEndpointSource("evil.vpce.amazonaws.com.attacker.com"); got != "configured" { t.Fatalf("expected attacker suffix not to be classified as vpc endpoint, got %s", got) @@ -1560,6 +1760,61 @@ func TestEvalLoopBehavior(t *testing.T) { } }) + t.Run("returns failure after submitting evidence for partial diagnostic warnings", func(t *testing.T) { + executor := &fakeExecutor{results: map[string]CustodianExecutionResult{ + "inventory-aws-s3": { + StartedAt: now, + EndedAt: now.Add(5 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{map[string]interface{}{"id": "bucket-1"}}, + DiagnosticWarnings: []string{"unreachable AWS service endpoint s3.eu-west-1 detected while evaluating cloud custodian policy inventory-aws-s3; evaluation may be partial"}, + }, + "check-a": { + StartedAt: now, + EndedAt: now.Add(20 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{}, + DiagnosticWarnings: []string{"unreachable AWS service endpoint s3.eu-west-1 detected while evaluating cloud custodian policy check-a; evaluation may be partial"}, + }, + }} + + evaluator := &fakePolicyEvaluator{} + apiHelper := &fakeAPIHelper{} + + plugin := &CloudCustodianPlugin{ + Logger: hclog.NewNullLogger(), + parsedConfig: &ParsedConfig{ + PolicyLabels: map[string]string{}, + CheckTimeout: 30 * time.Second, + }, + checks: []CustodianCheck{ + {Index: 0, Name: "check-a", Resource: "aws.s3", Provider: "aws", RawPolicy: map[string]interface{}{"name": "check-a", "resource": "aws.s3"}}, + }, + executor: executor, + evaluator: evaluator, + } + + resp, err := plugin.Eval(&proto.EvalRequest{PolicyPaths: []string{"bundle-a"}}, apiHelper) + if err == nil { + t.Fatalf("expected eval failure to capture diagnostic execution errors") + } + if resp.GetStatus() != proto.ExecutionStatus_FAILURE { + t.Fatalf("expected failure status, got %s", resp.GetStatus().String()) + } + if apiHelper.calls != 1 { + t.Fatalf("expected CreateEvidence once before returning diagnostic failure, got %d", apiHelper.calls) + } + if len(apiHelper.evidence) == 0 { + t.Fatalf("expected evidence to be submitted before diagnostic failure is returned") + } + if len(evaluator.calls) == 0 { + t.Fatalf("expected evaluator to run for available diagnostic endpoints") + } + if !strings.Contains(err.Error(), "s3.eu-west-1") { + t.Fatalf("expected diagnostic warning detail, got %v", err) + } + }) + t.Run("fails when all policy evaluations fail", func(t *testing.T) { executor := &fakeExecutor{results: map[string]CustodianExecutionResult{ "inventory-aws-s3": {