Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
305183c
feat(resilience): implement enhanced error classification system for …
starbops Aug 17, 2025
97be12d
feat(monitoring): implement comprehensive resource monitoring and ale…
starbops Aug 17, 2025
5d80d98
feat(resilience): implement circuit breaker pattern and graceful degr…
starbops Aug 17, 2025
68b62a3
feat(resilience): implement enhanced retry logic with jitter, backoff…
starbops Aug 17, 2025
c2089de
test(resilience): add comprehensive test suite for circuit breaker, l…
starbops Aug 17, 2025
63cd369
chore: update Claude Code settings configuration
starbops Aug 17, 2025
e462071
feat(reporting): implement comprehensive error reporting and aggregat…
starbops Aug 17, 2025
6ccb57f
feat(chaos): implement comprehensive chaos engineering framework for …
starbops Aug 17, 2025
745ac95
feat(reporting): finalize comprehensive error handling and cleanup me…
starbops Aug 20, 2025
65e1093
fix(tests): resolve CI failures - lint, race conditions, and security…
starbops Aug 20, 2025
c2b03a5
fix: comprehensive CI fixes - lint, tests, race conditions, and security
starbops Aug 20, 2025
821fcfa
fix: comprehensive CI fixes - formatting, race conditions, and securi…
starbops Aug 20, 2025
e53208d
fix: resolve lint CI formatting issue in metrics_collector.go
starbops Aug 20, 2025
4c1f291
fix(chaos): resolve compilation errors in chaos framework
starbops Aug 20, 2025
5e16ce4
fix(security): prevent integer overflow in disk metrics collection
starbops Aug 21, 2025
7a492f7
fix(format): remove trailing whitespace in chaos experiments
starbops Aug 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .claude/settings.local.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,10 @@
"mcp__github__get_pull_request_status",
"mcp__github__list_workflow_runs",
"mcp__github__get_job_logs",
"mcp__github__list_workflow_jobs"
"mcp__github__list_workflow_jobs",
"mcp__github__add_issue_comment",
"mcp__github__list_pull_requests",
"mcp__github__create_pull_request"
],
"deny": []
}
Expand Down
302 changes: 302 additions & 0 deletions internal/executor/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,42 @@ package executor
import (
"errors"
"fmt"
"strings"
"time"

"github.com/google/uuid"
)

// ErrorType is the string-backed category carried in ExecutionError.Type.
// The string values are what appears in the "type" field when an
// ExecutionError is JSON-encoded and at the start of ExecutionError.Error().
type ErrorType string

// Error categories assigned by ClassifyError and reported by GetErrorType.
const (
// ErrorTypeInfrastructure indicates infrastructure-related failures.
// It is also the fallback category: ClassifyError starts from it for
// unrecognized errors and GetErrorType returns it when nothing else matches.
ErrorTypeInfrastructure ErrorType = "infrastructure"

// ErrorTypeResource indicates resource exhaustion or limits
// (ClassifyError uses it for out-of-memory, disk-space and CPU-quota messages).
ErrorTypeResource ErrorType = "resource"

// ErrorTypeTimeout indicates execution timeout.
// ClassifyError also assigns it to cancelled executions (code EXECUTION_CANCELLED).
ErrorTypeTimeout ErrorType = "timeout"

// ErrorTypeUserCode indicates user script execution errors
ErrorTypeUserCode ErrorType = "user_code"

// ErrorTypeValidation indicates input validation errors.
// GetErrorType also reports it for configuration errors.
ErrorTypeValidation ErrorType = "validation"

// ErrorTypeSecurity indicates security policy violations
// (ClassifyError uses it for permission/access-denied messages).
ErrorTypeSecurity ErrorType = "security"

// ErrorTypeNetwork indicates network connectivity issues
ErrorTypeNetwork ErrorType = "network"

// ErrorTypeRateLimit indicates rate limiting enforcement
ErrorTypeRateLimit ErrorType = "rate_limit"

// ErrorTypeQuota indicates quota enforcement
ErrorTypeQuota ErrorType = "quota"
)

// Common executor errors
Expand Down Expand Up @@ -36,6 +72,15 @@ var (

// ErrNetworkUnavailable indicates network connectivity issues
ErrNetworkUnavailable = errors.New("network unavailable")

// ErrRateLimitExceeded indicates rate limit has been exceeded
ErrRateLimitExceeded = errors.New("rate limit exceeded")

// ErrQuotaExceeded indicates quota has been exceeded
ErrQuotaExceeded = errors.New("quota exceeded")

// ErrSystemOverloaded indicates system is under high load
ErrSystemOverloaded = errors.New("system overloaded")
)

// ExecutorError represents a structured error from the executor
Expand Down Expand Up @@ -154,6 +199,85 @@ func NewSecurityError(operation, reason string, cause error) *SecurityError {
}
}

// ExecutionError represents a comprehensive error with classification and context.
//
// It implements error (see Error) and supports errors.Is / errors.As chains
// through Unwrap. Instances are normally created with NewExecutionError and
// then enriched through the chainable With*/SetRetryable methods, all of
// which mutate the receiver in place.
type ExecutionError struct {
// Type is the error category.
Type ErrorType `json:"type"`
// Code is a machine-readable identifier; ClassifyError uses upper-case
// values such as "EXECUTION_TIMEOUT" or "UNKNOWN_ERROR".
Code string `json:"code"`
// Message is the short human-readable summary included in Error().
Message string `json:"message"`
// Details is optional extra text; omitted from JSON when empty.
Details string `json:"details,omitempty"`
// Retryable is the flag IsRetryableError reports for this error.
Retryable bool `json:"retryable"`
// TaskID is the string form of the task UUID passed to NewExecutionError.
TaskID string `json:"task_id"`
// ExecutionID is the string form of the UUID given to WithExecutionID;
// omitted from JSON when empty.
ExecutionID string `json:"execution_id,omitempty"`
// ContainerID is set via WithContainerID; omitted from JSON when empty.
ContainerID string `json:"container_id,omitempty"`
// Timestamp is set to time.Now() by NewExecutionError.
Timestamp time.Time `json:"timestamp"`
// Context holds free-form key/value annotations added via WithContext.
Context map[string]interface{} `json:"context,omitempty"`
// Cause is the underlying error returned by Unwrap; it is excluded from
// JSON output and only appears in the Error() string.
Cause error `json:"-"`
}

// Error implements the error interface.
//
// The result has the form "<type> [<code>:<task id>]: <message>"; when a
// cause is attached, ": <cause>" is appended.
func (e *ExecutionError) Error() string {
	// No cause: the classification header and message are the whole string.
	if e.Cause == nil {
		return fmt.Sprintf("%s [%s:%s]: %s", e.Type, e.Code, e.TaskID, e.Message)
	}

	return fmt.Sprintf("%s [%s:%s]: %s: %v", e.Type, e.Code, e.TaskID, e.Message, e.Cause)
}

// Unwrap exposes the wrapped cause (nil when none was attached) so that
// errors.Is and errors.As can look through an ExecutionError.
func (e *ExecutionError) Unwrap() error {
	cause := e.Cause
	return cause
}

// NewExecutionError builds an ExecutionError of the given category.
//
// The task ID is stored in its string form, Timestamp is set to the current
// time, and Context starts as an empty (non-nil) map. Retryable is left
// false and no cause, details, execution ID or container ID are set.
func NewExecutionError(errorType ErrorType, code, message string, taskID uuid.UUID) *ExecutionError {
	execErr := new(ExecutionError)
	execErr.Type = errorType
	execErr.Code = code
	execErr.Message = message
	execErr.TaskID = taskID.String()
	execErr.Timestamp = time.Now()
	execErr.Context = map[string]interface{}{}
	return execErr
}

// WithCause records cause as the underlying error, replacing any previous
// one, and returns the same receiver so calls can be chained.
func (e *ExecutionError) WithCause(cause error) *ExecutionError {
	e.Cause = cause

	return e
}

// WithDetails stores details as the error's Details text, replacing any
// previous value, and returns the same receiver so calls can be chained.
func (e *ExecutionError) WithDetails(details string) *ExecutionError {
	e.Details = details

	return e
}

// WithContext stores value under key in the error's Context map, creating
// the map first when it is nil, and returns the same receiver so calls can
// be chained. An existing entry for key is overwritten.
func (e *ExecutionError) WithContext(key string, value interface{}) *ExecutionError {
	if e.Context == nil {
		// Writing to a nil map would panic, so start a fresh one holding
		// just this entry.
		e.Context = map[string]interface{}{key: value}
		return e
	}

	e.Context[key] = value
	return e
}

// WithExecutionID stores the string form of executionID on the error and
// returns the same receiver so calls can be chained.
func (e *ExecutionError) WithExecutionID(executionID uuid.UUID) *ExecutionError {
	id := executionID.String()
	e.ExecutionID = id
	return e
}

// WithContainerID stores containerID on the error, replacing any previous
// value, and returns the same receiver so calls can be chained.
func (e *ExecutionError) WithContainerID(containerID string) *ExecutionError {
	e.ContainerID = containerID

	return e
}

// SetRetryable overwrites the Retryable flag with retryable and returns the
// same receiver so calls can be chained.
func (e *ExecutionError) SetRetryable(retryable bool) *ExecutionError {
	e.Retryable = retryable

	return e
}

// IsTimeoutError checks if an error is a timeout error
func IsTimeoutError(err error) bool {
return errors.Is(err, ErrExecutionTimeout)
Expand Down Expand Up @@ -187,3 +311,181 @@ func IsConfigError(err error) bool {
var confErr *ConfigError
return errors.As(err, &confErr)
}

// IsExecutionError reports whether err is, or wraps anywhere in its chain,
// an *ExecutionError. A nil err yields false.
func IsExecutionError(err error) bool {
	// errors.As only needs a non-nil pointer to the target type; the value
	// it would extract is not used here.
	return errors.As(err, new(*ExecutionError))
}

// ClassifyError converts err into a classified *ExecutionError for taskID.
//
// A nil err yields nil. If err is, or wraps anywhere in its chain, an
// *ExecutionError (detected with errors.As, matching IsExecutionError), that
// existing error is returned unchanged so a classification made closer to
// the failure is not overwritten by substring matching on a wrapper's text.
//
// Otherwise a new ExecutionError is created with err as its cause and the
// context argument stored under the "classification_context" key. The
// lower-cased error message is then tested against substring patterns; the
// first matching case wins, so the order of the cases is significant (for
// example "cpu quota" must be tested before the generic "quota", and
// "pull access denied" before "access denied"). Errors that match no pattern
// keep type infrastructure with code UNKNOWN_ERROR and are marked retryable.
func ClassifyError(err error, taskID uuid.UUID, context string) *ExecutionError {
	if err == nil {
		return nil
	}

	// Reuse an existing classification, even when it has been wrapped.
	var existing *ExecutionError
	if errors.As(err, &existing) {
		return existing
	}

	errMsgLower := strings.ToLower(err.Error())

	// containsAny reports whether the lower-cased message contains at least
	// one of the given substrings.
	containsAny := func(patterns ...string) bool {
		for _, p := range patterns {
			if strings.Contains(errMsgLower, p) {
				return true
			}
		}
		return false
	}

	// Default classification, refined below when a pattern matches.
	execErr := NewExecutionError(ErrorTypeInfrastructure, "UNKNOWN_ERROR", "Unknown execution error", taskID).
		WithCause(err).
		WithContext("classification_context", context)

	// classify overwrites the classification fields in one step.
	classify := func(errorType ErrorType, code, message string, retryable bool) {
		execErr.Type = errorType
		execErr.Code = code
		execErr.Message = message
		execErr.Retryable = retryable
	}

	switch {
	// Docker daemon errors
	case containsAny("docker daemon", "connection refused"):
		classify(ErrorTypeInfrastructure, "DOCKER_DAEMON_UNAVAILABLE", "Docker daemon is unavailable", true)

	// Container not found errors
	case containsAny("no such container"):
		classify(ErrorTypeInfrastructure, "CONTAINER_NOT_FOUND", "Container not found", false)

	// Image not found errors
	case containsAny("no such image", "pull access denied"):
		classify(ErrorTypeInfrastructure, "IMAGE_NOT_FOUND", "Container image not found or access denied", false)

	// Timeout errors
	case containsAny("timeout", "deadline exceeded"):
		classify(ErrorTypeTimeout, "EXECUTION_TIMEOUT", "Task execution timed out", true)

	// Resource exhaustion errors
	case containsAny("out of memory", "oom"):
		classify(ErrorTypeResource, "OUT_OF_MEMORY", "Container ran out of memory", false)

	case containsAny("no space left"):
		classify(ErrorTypeResource, "OUT_OF_DISK_SPACE", "Insufficient disk space", true)

	case containsAny("cpu quota"):
		classify(ErrorTypeResource, "CPU_QUOTA_EXCEEDED", "CPU quota exceeded", true)

	// Network errors
	case containsAny("network", "dns"):
		classify(ErrorTypeNetwork, "NETWORK_ERROR", "Network connectivity issue", true)

	// Permission errors
	case containsAny("permission denied", "access denied"):
		classify(ErrorTypeSecurity, "PERMISSION_DENIED", "Permission denied", false)

	// Rate limiting errors
	case containsAny("rate limit", "too many requests"):
		classify(ErrorTypeRateLimit, "RATE_LIMIT_EXCEEDED", "Rate limit exceeded", true)

	// Quota errors
	case containsAny("quota", "limit exceeded"):
		classify(ErrorTypeQuota, "QUOTA_EXCEEDED", "Quota exceeded", false)

	// Validation errors
	case containsAny("invalid", "malformed"):
		classify(ErrorTypeValidation, "VALIDATION_ERROR", "Input validation failed", false)

	// User code errors (script execution failures)
	case containsAny("exit status", "command failed"):
		classify(ErrorTypeUserCode, "USER_CODE_ERROR", "User script execution failed", false)

	// Cancellation errors (reported under the timeout category)
	case containsAny("cancelled", "canceled"):
		classify(ErrorTypeTimeout, "EXECUTION_CANCELLED", "Task execution was cancelled", false)

	default:
		// Keep the default classification for unknown errors.
		// Conservative approach - assume retryable unless known otherwise.
		execErr.Retryable = true
	}

	return execErr
}

// ClassifyDockerError classifies err via ClassifyError (with the
// classification context "docker_operation") and annotates the result as
// coming from the Docker client.
//
// A nil err yields nil. Otherwise the returned error has its ContainerID set
// when containerID is non-empty and carries the context entry
// "error_source" = "docker_client". If err already is an ExecutionError, that
// same instance is annotated and returned.
func ClassifyDockerError(err error, taskID uuid.UUID, containerID string) *ExecutionError {
	execErr := ClassifyError(err, taskID, "docker_operation")
	if execErr == nil {
		// ClassifyError returns nil for a nil err; annotating a nil
		// *ExecutionError would dereference it and panic.
		return nil
	}

	if containerID != "" {
		execErr.ContainerID = containerID
	}

	// Add Docker-specific context. WithContext returns its receiver, which
	// is already held in execErr, so the result is deliberately discarded.
	_ = execErr.WithContext("error_source", "docker_client")

	return execErr
}

// IsRetryableError determines if an error should trigger a retry.
//
// When err is, or wraps anywhere in its chain, a non-nil *ExecutionError,
// that error's Retryable flag is authoritative. errors.As is used rather
// than a type assertion so that wrapping a classified error (for example
// with fmt.Errorf and %w) does not bypass its explicit retry decision.
// For all other errors the legacy rule applies: timeout, Docker and
// resource errors are considered retryable.
func IsRetryableError(err error) bool {
	var execErr *ExecutionError
	if errors.As(err, &execErr) && execErr != nil {
		return execErr.Retryable
	}

	// For non-ExecutionError types, use existing classification
	return IsTimeoutError(err) || IsDockerError(err) || IsResourceError(err)
}

// GetErrorType extracts the error type from an error.
//
// When err is, or wraps anywhere in its chain, a non-nil *ExecutionError,
// that error's Type is returned. errors.As is used rather than a type
// assertion so a wrapped classified error keeps its category. Otherwise the
// legacy predicates are consulted in order (timeout, Docker, resource,
// security, config), and anything unmatched, including a nil err, is
// reported as ErrorTypeInfrastructure.
func GetErrorType(err error) ErrorType {
	var execErr *ExecutionError
	if errors.As(err, &execErr) && execErr != nil {
		return execErr.Type
	}

	// Legacy error type classification
	switch {
	case IsTimeoutError(err):
		return ErrorTypeTimeout
	case IsDockerError(err):
		return ErrorTypeInfrastructure
	case IsResourceError(err):
		return ErrorTypeResource
	case IsSecurityError(err):
		return ErrorTypeSecurity
	case IsConfigError(err):
		return ErrorTypeValidation
	default:
		return ErrorTypeInfrastructure
	}
}
Loading