| layout | default |
|---|---|
| title | Chapter 6: Safety & Reliability |
| parent | AgentGPT Tutorial |
| nav_order | 6 |
Welcome to Chapter 6: Safety & Reliability. In this part of AgentGPT Tutorial: Building Autonomous AI Agents, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs.
This chapter covers implementing safety mechanisms, reliability features, and ethical considerations for autonomous AI agents.
// Core safety principles for agents
const safetyPrinciples = {
harmPrevention: "Prevent harm to users and systems",
transparency: "Be transparent about agent capabilities and limitations",
controllability: "Maintain human control over agent actions",
robustness: "Handle failures gracefully and recover automatically",
privacy: "Protect user data and maintain confidentiality",
fairness: "Ensure equitable treatment and avoid bias"
}
class SafetyManager {
private violations: SafetyViolation[] = []
async checkAction(action: AgentAction): Promise<SafetyCheckResult> {
const checks = await Promise.all([
this.checkHarmPrevention(action),
this.checkTransparency(action),
this.checkControllability(action),
this.checkRobustness(action),
this.checkPrivacy(action),
this.checkFairness(action)
])
const violations = checks.filter(check => !check.passed)
if (violations.length > 0) {
await this.recordViolations(violations)
return {
approved: false,
violations,
recommendations: this.generateRecommendations(violations)
}
}
return { approved: true, violations: [] }
}
private async checkHarmPrevention(action: AgentAction): Promise<SafetyCheck> {
// Check for potentially harmful actions
const harmfulPatterns = [
/delete.*all/i,
/format.*drive/i,
/drop.*table/i,
/shutdown.*system/i
]
const isHarmful = harmfulPatterns.some(pattern =>
pattern.test(action.description)
)
return {
principle: 'harmPrevention',
passed: !isHarmful,
severity: isHarmful ? 'high' : 'low'
}
}
private async recordViolations(violations: SafetyCheck[]) {
const violationRecords = violations.map(violation => ({
id: this.generateId(),
timestamp: new Date(),
principle: violation.principle,
severity: violation.severity,
action: violation.action,
details: violation.details
}))
this.violations.push(...violationRecords)
await this.persistViolations(violationRecords)
}
}// Validate agent actions before execution
class ActionValidator {
private rules: ValidationRule[] = []
addRule(rule: ValidationRule) {
this.rules.push(rule)
}
async validate(action: AgentAction): Promise<ValidationResult> {
const results = await Promise.all(
this.rules.map(rule => this.applyRule(rule, action))
)
const failures = results.filter(result => !result.passed)
return {
valid: failures.length === 0,
failures,
score: this.calculateSafetyScore(results)
}
}
private async applyRule(rule: ValidationRule, action: AgentAction): Promise<RuleResult> {
try {
const passed = await rule.check(action)
return {
rule: rule.name,
passed,
severity: rule.severity,
message: passed ? null : rule.failureMessage
}
} catch (error) {
return {
rule: rule.name,
passed: false,
severity: 'high',
message: `Rule execution failed: ${error.message}`
}
}
}
private calculateSafetyScore(results: RuleResult[]): number {
const weights = { low: 1, medium: 2, high: 3 }
const totalWeight = results.reduce((sum, result) =>
sum + weights[result.severity], 0
)
const passedWeight = results
.filter(result => result.passed)
.reduce((sum, result) => sum + weights[result.severity], 0)
return passedWeight / totalWeight
}
}// Secure agent operations
class SecurityManager {
private sessions = new Map<string, Session>()
async authenticate(token: string): Promise<User | null> {
try {
const decoded = await this.verifyToken(token)
return await this.getUser(decoded.userId)
} catch (error) {
return null
}
}
async authorize(user: User, action: string, resource: string): Promise<boolean> {
const permissions = await this.getUserPermissions(user.id)
return permissions.some(permission =>
permission.action === action &&
permission.resource === resource
)
}
async createSession(user: User): Promise<string> {
const sessionId = this.generateSecureId()
const session: Session = {
id: sessionId,
userId: user.id,
createdAt: new Date(),
expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000), // 24 hours
permissions: await this.getUserPermissions(user.id)
}
this.sessions.set(sessionId, session)
return sessionId
}
async validateSession(sessionId: string): Promise<Session | null> {
const session = this.sessions.get(sessionId)
if (!session) return null
if (session.expiresAt < new Date()) {
this.sessions.delete(sessionId)
return null
}
return session
}
}// Sanitize user inputs and agent outputs
class InputSanitizer {
private patterns = {
sqlInjection: /(\b(SELECT|INSERT|UPDATE|DELETE|DROP|CREATE|ALTER)\b)/gi,
scriptInjection: /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi,
pathTraversal: /\.\.[\/\\]/g,
commandInjection: /[;&|`$()]/g
}
sanitize(input: string, context: 'user' | 'agent' | 'system' = 'user'): string {
let sanitized = input
// Apply context-specific sanitization
switch (context) {
case 'user':
sanitized = this.sanitizeUserInput(sanitized)
break
case 'agent':
sanitized = this.sanitizeAgentOutput(sanitized)
break
case 'system':
sanitized = this.sanitizeSystemInput(sanitized)
break
}
// Apply general sanitization
sanitized = this.applyGeneralSanitization(sanitized)
return sanitized
}
private sanitizeUserInput(input: string): string {
// Remove potentially dangerous patterns
for (const [name, pattern] of Object.entries(this.patterns)) {
input = input.replace(pattern, '[FILTERED]')
}
return input
}
private sanitizeAgentOutput(output: string): string {
// Ensure agent outputs don't contain sensitive information
return this.removeSensitiveData(output)
}
private applyGeneralSanitization(input: string): string {
// HTML encode special characters
return input
.replace(/&/g, '&')
.replace(/</g, '<')
.replace(/>/g, '>')
.replace(/"/g, '"')
.replace(/'/g, ''')
}
}// Comprehensive error handling
class ErrorHandler {
private errorHistory: ErrorRecord[] = []
private recoveryStrategies = new Map<string, RecoveryStrategy>()
async handleError(error: Error, context: ErrorContext): Promise<RecoveryResult> {
// Record error
await this.recordError(error, context)
// Determine recovery strategy
const strategy = this.determineRecoveryStrategy(error, context)
// Execute recovery
try {
const result = await this.executeRecovery(strategy, context)
await this.recordRecovery(result)
return result
} catch (recoveryError) {
await this.recordRecoveryFailure(recoveryError, strategy)
throw recoveryError
}
}
private determineRecoveryStrategy(error: Error, context: ErrorContext): RecoveryStrategy {
// Analyze error type and context to choose strategy
if (error.name === 'NetworkError') {
return { type: 'retry', attempts: 3, delay: 1000 }
}
if (error.name === 'ValidationError') {
return { type: 'fallback', alternative: 'safe_default' }
}
if (error.name === 'ResourceError') {
return { type: 'degrade', reducedFunctionality: true }
}
return { type: 'fail', message: 'Unable to recover from error' }
}
private async executeRecovery(strategy: RecoveryStrategy, context: ErrorContext): Promise<RecoveryResult> {
switch (strategy.type) {
case 'retry':
return await this.executeRetry(strategy, context)
case 'fallback':
return await this.executeFallback(strategy, context)
case 'degrade':
return await this.executeDegradation(strategy, context)
default:
throw new Error(strategy.message || 'Recovery failed')
}
}
private async executeRetry(strategy: RecoveryStrategy, context: ErrorContext): Promise<RecoveryResult> {
for (let attempt = 1; attempt <= strategy.attempts; attempt++) {
try {
const result = await context.retryFunction()
return { success: true, attempt, result }
} catch (error) {
if (attempt === strategy.attempts) {
throw error
}
await this.delay(strategy.delay * attempt) // Exponential backoff
}
}
}
}// Prevent cascading failures
class CircuitBreaker {
private state: 'closed' | 'open' | 'half-open' = 'closed'
private failures = 0
private lastFailureTime = 0
private nextAttemptTime = 0
constructor(
private failureThreshold = 5,
private recoveryTimeout = 60000,
private monitoringPeriod = 60000
) {}
async execute(operation: () => Promise<any>): Promise<any> {
if (this.state === 'open') {
if (Date.now() < this.nextAttemptTime) {
throw new Error('Circuit breaker is OPEN')
}
this.state = 'half-open'
}
try {
const result = await operation()
this.onSuccess()
return result
} catch (error) {
this.onFailure()
throw error
}
}
private onSuccess() {
this.failures = 0
this.state = 'closed'
}
private onFailure() {
this.failures++
this.lastFailureTime = Date.now()
if (this.failures >= this.failureThreshold) {
this.state = 'open'
this.nextAttemptTime = Date.now() + this.recoveryTimeout
}
}
getState() {
return {
state: this.state,
failures: this.failures,
lastFailureTime: this.lastFailureTime,
nextAttemptTime: this.nextAttemptTime
}
}
}// Track safety and reliability metrics
class SafetyMonitor {
private metrics = {
violations: 0,
blocks: 0,
recoveries: 0,
failures: 0,
responseTime: 0,
uptime: 0
}
recordViolation(violation: SafetyViolation) {
this.metrics.violations++
// Alert if violation rate is too high
if (this.getViolationRate() > 0.1) { // 10% violation rate
this.alert('High violation rate detected')
}
}
recordBlock(action: string, reason: string) {
this.metrics.blocks++
console.warn(`Action blocked: ${action}, Reason: ${reason}`)
}
recordRecovery(recovery: RecoveryResult) {
this.metrics.recoveries++
}
getViolationRate(): number {
const totalActions = this.metrics.violations + this.metrics.blocks + this.metrics.recoveries
return totalActions > 0 ? this.metrics.violations / totalActions : 0
}
getReliabilityScore(): number {
const total = this.metrics.recoveries + this.metrics.failures
return total > 0 ? this.metrics.recoveries / total : 1
}
private alert(message: string) {
// Send alert to monitoring system
console.error(`SAFETY ALERT: ${message}`)
// In production, send to alerting service
}
}// Ensure regulatory compliance
class ComplianceMonitor {
private regulations = {
gdpr: {
dataRetention: 2555, // days
consentRequired: true,
dataPortability: true
},
ccpa: {
dataDeletion: true,
dataSaleOptOut: true,
privacyNotice: true
}
}
async checkCompliance(action: AgentAction, user: User): Promise<ComplianceResult> {
const issues = []
// GDPR compliance
if (this.regulations.gdpr.consentRequired && !user.consentGiven) {
issues.push({
regulation: 'GDPR',
issue: 'User consent not obtained',
severity: 'high'
})
}
// Data retention compliance
if (this.isDataRetentionViolation(user)) {
issues.push({
regulation: 'GDPR',
issue: 'Data retention policy violation',
severity: 'medium'
})
}
return {
compliant: issues.length === 0,
issues,
recommendations: this.generateRecommendations(issues)
}
}
private isDataRetentionViolation(user: User): boolean {
const accountAge = Date.now() - user.createdAt.getTime()
const maxAge = this.regulations.gdpr.dataRetention * 24 * 60 * 60 * 1000
return accountAge > maxAge
}
private generateRecommendations(issues: ComplianceIssue[]): string[] {
return issues.map(issue => {
switch (issue.regulation) {
case 'GDPR':
return `Review GDPR compliance for ${issue.issue.toLowerCase()}`
case 'CCPA':
return `Ensure CCPA compliance regarding ${issue.issue.toLowerCase()}`
default:
return `Address compliance issue: ${issue.issue}`
}
})
}
}// Detect and mitigate bias in agent decisions
class BiasDetector {
private biasPatterns: BiasPattern[] = []
async analyzeDecision(decision: AgentDecision, context: DecisionContext): Promise<BiasAnalysis> {
const biases = []
for (const pattern of this.biasPatterns) {
const detected = await this.detectPattern(pattern, decision, context)
if (detected) {
biases.push({
type: pattern.type,
severity: detected.severity,
evidence: detected.evidence,
mitigation: pattern.mitigation
})
}
}
return {
hasBias: biases.length > 0,
biases,
confidence: this.calculateConfidence(biases),
recommendations: this.generateRecommendations(biases)
}
}
private async detectPattern(pattern: BiasPattern, decision: AgentDecision, context: DecisionContext): Promise<BiasDetection | null> {
// Implement specific bias detection logic
// This is a simplified example
if (pattern.type === 'confirmation_bias') {
const confirmationEvidence = this.findConfirmationBias(decision, context)
if (confirmationEvidence) {
return {
severity: 'medium',
evidence: confirmationEvidence
}
}
}
return null
}
}// Ensure agent transparency
class TransparencyManager {
async generateExplanation(action: AgentAction, context: ActionContext): Promise<Explanation> {
return {
action: action.description,
reasoning: await this.extractReasoning(action),
evidence: await this.gatherEvidence(action, context),
confidence: this.calculateConfidence(action),
alternatives: await this.identifyAlternatives(action, context),
limitations: this.identifyLimitations(action)
}
}
private async extractReasoning(action: AgentAction): Promise<string[]> {
// Extract the reasoning chain that led to this action
const reasoning = []
if (action.context?.goal) {
reasoning.push(`Working towards goal: ${action.context.goal}`)
}
if (action.context?.constraints) {
reasoning.push(`Considering constraints: ${action.context.constraints.join(', ')}`)
}
return reasoning
}
private async gatherEvidence(action: AgentAction, context: ActionContext): Promise<Evidence[]> {
// Gather evidence supporting the action
return [
{
type: 'data',
source: 'user_input',
content: context.userInput,
relevance: 0.9
},
{
type: 'analysis',
source: 'agent_reasoning',
content: action.reasoning,
relevance: 1.0
}
]
}
}- ✅ Implemented comprehensive safety framework
- ✅ Built security measures and input validation
- ✅ Created reliability features with error recovery
- ✅ Set up monitoring and alerting systems
- ✅ Developed compliance monitoring
- ✅ Addressed ethical considerations and bias detection
Key Takeaways:
- Safety must be built into every layer of the system
- Multiple validation checks prevent harmful actions
- Reliability requires comprehensive error handling
- Monitoring enables proactive issue detection
- Compliance ensures legal and regulatory adherence
- Ethics and transparency build user trust
Most teams struggle here because the hard part is not writing more code, but deciding clear boundaries for action, context, Promise so behavior stays predictable as complexity grows.
In practical terms, this chapter helps you avoid three common failures:
- coupling core logic too tightly to one implementation path
- missing the handoff boundaries between setup, execution, and validation
- shipping changes without clear rollback or observability strategy
After working through this chapter, you should be able to reason about Chapter 6: Safety & Reliability as an operating subsystem inside AgentGPT Tutorial: Building Autonomous AI Agents, with explicit contracts for inputs, state transitions, and outputs.
Use the implementation notes around error, user, result as your checklist when adapting these patterns to your own repository.
Under the hood, Chapter 6: Safety & Reliability usually follows a repeatable control path:
- Context bootstrap: initialize runtime config and prerequisites for
action. - Input normalization: shape incoming data so
contextreceives stable contracts. - Core execution: run the main logic branch and propagate intermediate state through
Promise. - Policy and safety checks: enforce limits, auth scopes, and failure boundaries.
- Output composition: return canonical result payloads for downstream consumers.
- Operational telemetry: emit logs/metrics needed for debugging and performance tuning.
When debugging, walk this sequence in order and confirm each stage has explicit success/failure conditions.
Use the following upstream sources to verify implementation details while reading this chapter:
- View Repo
Why it matters: authoritative reference on
View Repo(github.com).
Suggested trace strategy:
- search upstream code for
actionandcontextto map concrete implementation paths - compare docs claims against actual runtime/config code before reusing patterns in production