|
| 1 | +--- |
| 2 | +title: "A2A Communication Best Practices" |
| 3 | +--- |
| 4 | + |
| 5 | +# A2A Communication Best Practices |
| 6 | + |
| 7 | +Guidelines for reliable, secure, and efficient Agent-to-Agent communication in the OpenSIN ecosystem. |
| 8 | + |
| 9 | +## Protocol Fundamentals |
| 10 | + |
| 11 | +### Message Structure |
| 12 | + |
| 13 | +Every A2A message must follow this structure: |
| 14 | + |
| 15 | +```typescript |
| 16 | +interface A2AMessage { /* Envelope that every A2A message must follow */ |
| 17 | + id: string // UUID for tracking |
| 18 | + from: string // Sender agent ID |
| 19 | + to: string // Target agent ID |
| 20 | + type: 'request' | 'response' | 'notification' | 'error' /* Message kind */ |
| 21 | + content: unknown // Task payload or response data |
| 22 | + metadata?: { /* Optional block; may be omitted entirely */ |
| 23 | + priority: 'low' | 'normal' | 'high' | 'critical' /* Required whenever metadata is present */ |
| 24 | + timeout?: number // ms until message expires |
| 25 | + correlationId?: string // Link related request/response |
| 26 | + retryCount?: number // Current retry attempt |
| 27 | + } |
| 28 | +} |
| 29 | +``` |
| 30 | + |
| 31 | +### Communication Patterns |
| 32 | + |
| 33 | +| Pattern | Use Case | Example | |
| 34 | +|---------|----------|---------| |
| 35 | +| Request/Response | Task delegation | Research agent → Data agent | |
| 36 | +| Fire-and-Forget | Notifications | Status updates, logging | |
| 37 | +| Pub/Sub | Broadcast | Fleet-wide announcements | |
| 38 | +| Pipeline | Sequential processing | Data → Analysis → Report | |
| 39 | + |
| 40 | +## Reliability |
| 41 | + |
| 42 | +### Retry Strategy |
| 43 | + |
| 44 | +Implement exponential backoff for failed deliveries: |
| 45 | + |
| 46 | +```typescript |
| 47 | +async function sendWithRetry(message: A2AMessage, maxRetries = 3) { // Sends once, then retries up to maxRetries times with exponential backoff |
| 48 | +  for (let attempt = 0; ; attempt++) { |
| 49 | +    try { |
| 50 | +      return await a2aClient.send(message) |
| 51 | +    } catch (error) { |
| 52 | +      if (!(attempt < maxRetries)) throw error // retries exhausted (also stops on NaN): rethrow the last failure |
| 53 | +      const delay = 2 ** attempt * 1000 // 1s, 2s, 4s with the default maxRetries |
| 54 | +      await sleep(delay) |
| 55 | +    } |
| 56 | +  } |
| 57 | +} |
| 58 | +``` |
| 59 | + |
| 60 | +### Dead Letter Queue |
| 61 | + |
| 62 | +Messages that fail after all retries go to a dead letter queue: |
| 63 | + |
| 64 | +```typescript |
| 65 | +interface DeadLetterEntry { /* Record for a message that failed after all retries */ |
| 66 | + originalMessage: A2AMessage /* The full message that could not be delivered */ |
| 67 | + failureReason: string |
| 68 | + failedAt: Date |
| 69 | + retryCount: number /* presumably the number of retries attempted before giving up — confirm */ |
| 70 | +} |
| 71 | +``` |
| 72 | + |
| 73 | +### Idempotency |
| 74 | + |
| 75 | +All A2A operations must be idempotent — retries can deliver the same message more than once, so processing it twice must not cause duplicate side effects: |
| 76 | + |
| 77 | +```typescript |
| 78 | +// Use correlationId (falling back to the message id) to detect duplicates |
| 79 | +const dedupeKey = message.metadata?.correlationId ?? message.id |
| 80 | +if (await checkProcessed(dedupeKey)) return { status: 'duplicate', id: dedupeKey } |
| 81 | + |
| 82 | +// Process the message |
| 83 | +const result = await handleMessage(message) |
| 84 | + |
| 85 | +// Mark as processed only after handling succeeds, so a failed attempt can be retried |
| 86 | +await markProcessed(dedupeKey) |
| 87 | +``` |
| 88 | + |
| 89 | +## Security |
| 90 | + |
| 91 | +### Authentication |
| 92 | + |
| 93 | +All inter-agent communication requires JWT authentication: |
| 94 | + |
| 95 | +```typescript |
| 96 | +const response = await fetch('https://agent-b.opensin.ai/a2a/v1', { |
| 97 | + method: 'POST', |
| 98 | + headers: { |
| 99 | + 'Authorization': `Bearer ${jwt}`, /* JWT sent as a bearer token */ |
| 100 | + 'X-Agent-ID': 'sin-hermes', |
| 101 | + 'X-Fleet-ID': 'production', |
| 102 | + 'Content-Type': 'application/json', |
| 103 | + }, |
| 104 | + body: JSON.stringify(taskPayload), /* payload serialized as JSON, matching the Content-Type header */ |
| 105 | +}) /* NOTE(review): response.ok / status is not checked in this excerpt */ |
| 106 | +``` |
| 107 | + |
| 108 | +### Transport Encryption |
| 109 | + |
| 110 | +- **Production**: HTTPS (TLS 1.3) required for all external communication |
| 111 | +- **Internal**: Cloudflare Tunnels for zero-trust networking |
| 112 | +- **Never** transmit credentials over plaintext HTTP |
| 113 | + |
| 114 | +### Payload Validation |
| 115 | + |
| 116 | +Validate all incoming messages before processing: |
| 117 | + |
| 118 | +```typescript |
| 119 | +function validateMessage(msg: unknown): A2AMessage { // Parses untrusted input; parse() throws when msg violates the schema |
| 120 | +  return z.object({ |
| 121 | +    id: z.string().uuid(), |
| 122 | +    from: z.string().min(1), |
| 123 | +    to: z.string().min(1), |
| 124 | +    type: z.enum(['request', 'response', 'notification', 'error']), |
| 125 | +    content: z.unknown(), |
| 126 | +    metadata: z.object({ |
| 127 | +      priority: z.enum(['low', 'normal', 'high', 'critical']), // required once metadata is present, as in A2AMessage |
| 128 | +      timeout: z.number().positive().optional(), |
| 129 | +      correlationId: z.string().min(1).optional(), // listed explicitly: Zod drops keys missing from the schema |
| 130 | +      retryCount: z.number().int().nonnegative().optional(), |
| 131 | +    }).optional(), |
| 132 | +  }).parse(msg) |
| 133 | +} |
| 134 | +``` |
| 135 | + |
| 136 | +## Performance |
| 137 | + |
| 138 | +### Connection Pooling |
| 139 | + |
| 140 | +Reuse connections to avoid handshake overhead: |
| 141 | + |
| 142 | +```typescript |
| 143 | +const pool = new A2AConnectionPool({ |
| 144 | + size: 10, |
| 145 | + idleTimeout: 60_000, /* presumably milliseconds (60 s) — confirm against A2AConnectionPool */ |
| 146 | + healthCheckInterval: 30_000, /* presumably milliseconds (30 s) — confirm against A2AConnectionPool */ |
| 147 | +}) |
| 148 | + |
| 149 | +// Get connection from pool; acquire() stays outside the try so release() only runs for a connection that was obtained |
| 150 | +const conn = await pool.acquire() |
| 151 | +try { |
| 152 | + await conn.send(message) |
| 153 | +} finally { /* hand the connection back even if send() throws */ |
| 154 | + pool.release(conn) |
| 155 | +} |
| 156 | +``` |
| 157 | + |
| 158 | +### Message Batching |
| 159 | + |
| 160 | +Batch independent messages to reduce network overhead: |
| 161 | + |
| 162 | +```typescript |
| 163 | +// Instead of sending individually (forEach discards each send() result, so nothing is awaited) |
| 164 | +messages.forEach(msg => a2aClient.send(msg)) |
| 165 | + |
| 166 | +// Batch them into a single awaited call |
| 167 | +await a2aClient.sendBatch(messages) |
| 168 | +``` |
| 169 | + |
| 170 | +### Timeout Configuration |
| 171 | + |
| 172 | +Set appropriate timeouts based on message priority: |
| 173 | + |
| 174 | +| Priority | Timeout | Retry Count | |
| 175 | +|----------|---------|-------------| |
| 176 | +| Critical | 30s | 5 | |
| 177 | +| High | 60s | 3 | |
| 178 | +| Normal | 120s | 2 | |
| 179 | +| Low | 300s | 1 | |
| 180 | + |
| 181 | +## Error Handling |
| 182 | + |
| 183 | +### Error Response Format |
| 184 | + |
| 185 | +Standardize error responses: |
| 186 | + |
| 187 | +```typescript |
| 188 | +interface A2AError { /* Standard shape for error responses */ |
| 189 | + type: 'error' /* literal tag; same value as the 'error' member of A2AMessage.type */ |
| 190 | + code: 'AGENT_UNAVAILABLE' | 'TIMEOUT' | 'INVALID_PAYLOAD' | 'RATE_LIMITED' | 'INTERNAL_ERROR' |
| 191 | + message: string |
| 192 | + details?: unknown |
| 193 | + retryable: boolean /* presumably whether the sender may retry the request — confirm */ |
| 194 | +} |
| 195 | +``` |
| 196 | + |
| 197 | +### Circuit Breaker |
| 198 | + |
| 199 | +Prevent cascading failures with circuit breakers: |
| 200 | + |
| 201 | +```typescript |
| 202 | +const circuit = new CircuitBreaker({ |
| 203 | +  failureThreshold: 5, |
| 204 | +  resetTimeout: 30_000, |
| 205 | +  halfOpenMaxCalls: 1, |
| 206 | +}) |
| 207 | + |
| 208 | +await circuit.execute(() => a2aClient.send(message)) // await so failures surface to the caller instead of becoming unhandled rejections |
| 209 | +``` |
| 210 | + |
| 211 | +## Monitoring |
| 212 | + |
| 213 | +### Message Tracking |
| 214 | + |
| 215 | +Track all messages for debugging and auditing: |
| 216 | + |
| 217 | +```typescript |
| 218 | +interface MessageLog { /* Tracking record for a message, kept for debugging and auditing */ |
| 219 | + messageId: string /* presumably A2AMessage.id — confirm */ |
| 220 | + correlationId?: string |
| 221 | + from: string |
| 222 | + to: string |
| 223 | + type: string /* NOTE(review): plain string here, whereas A2AMessage.type is a literal union */ |
| 224 | + status: 'sent' | 'delivered' | 'processed' | 'failed' | 'expired' |
| 225 | + timestamp: Date |
| 226 | + latency?: number /* NOTE(review): unit not stated; A2AMessage timeout is in ms — confirm */ |
| 227 | +} |
| 228 | +``` |
| 229 | + |
| 230 | +### Key Metrics |
| 231 | + |
| 232 | +| Metric | Alert Threshold | |
| 233 | +|--------|----------------| |
| 234 | +| Message delivery rate | < 99% | |
| 235 | +| Average latency | > 5s | |
| 236 | +| Dead letter queue size | > 100 | |
| 237 | +| Circuit breaker trips | > 3/hour | |
| 238 | + |
| 239 | +## Checklist |
| 240 | + |
| 241 | +Before deploying A2A communication: |
| 242 | + |
| 243 | +- [ ] Message schema validated with Zod/JSON Schema |
| 244 | +- [ ] JWT authentication configured |
| 245 | +- [ ] Retry logic with exponential backoff implemented |
| 246 | +- [ ] Idempotency handling via correlation IDs |
| 247 | +- [ ] Circuit breaker for external agent calls |
| 248 | +- [ ] Dead letter queue configured |
| 249 | +- [ ] Timeout values set per priority level |
| 250 | +- [ ] Error responses follow standard format |
| 251 | +- [ ] Message logging enabled for debugging |
| 252 | +- [ ] Transport uses HTTPS/TLS 1.3 |
0 commit comments