mcp-context-forge/docker-compose-debug.yml at main · IBM/mcp-context-forge · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#version: "3.9"          # Supported by both podman-compose and Docker Compose v2+

###############################################################################
#  HOST SYSTEM TUNING FOR LOAD TESTING (run before docker compose up)
#  See docs/docs/testing/performance.md for full details
#
#  One-liner (TCP + VM + I/O tuning):
#  sudo sysctl -w net.core.somaxconn=65535 net.core.netdev_max_backlog=65535 net.ipv4.tcp_max_syn_backlog=65535 net.ipv4.tcp_tw_reuse=1 net.ipv4.tcp_fin_timeout=15 net.ipv4.ip_local_port_range="1024 65535" vm.swappiness=10 fs.aio-max-nr=1048576
#
#  Make persistent: sudo tee /etc/sysctl.d/99-mcp-loadtest.conf (see docs)
###############################################################################

###############################################################################
#  NETWORKS + VOLUMES - declared first so they can be referenced later
###############################################################################
networks:
  mcpnet:               # Single user-defined bridge network keeps traffic private
    driver: bridge

volumes:                # Named volumes survive podman-compose down/up
  pgdata:
  # pgdata18:  # Enable for postgres 18+
  pgadmindata:
  redisinsight_data:
  nginx_cache:
  grafanadata:
  prometheusdata:
  lokidata:
  prometheus_token:

###############################################################################
#  CORE SERVICE - ContextForge
###############################################################################
services:

  # ──────────────────────────────────────────────────────────────────────
  # Nginx Caching Proxy - High-performance reverse proxy with CDN-like caching
  # ──────────────────────────────────────────────────────────────────────
  nginx:
    build:
      context: ./infra/nginx
      dockerfile: Dockerfile
    image: mcpgateway/nginx-cache:latest
    restart: unless-stopped
    ports:
      - "8080:80"                   # HTTP caching proxy (public-facing)
    networks: [mcpnet]
    depends_on:
      gateway:
        condition: service_healthy
    volumes:
      - nginx_cache:/var/cache/nginx    # Persistent cache storage
      - ./infra/nginx/nginx.conf:/etc/nginx/nginx.conf:ro  # Mount config as read-only
    # TCP kernel tuning for 3000 concurrent connections
    # Note: net.core.* sysctls are host-level and cannot be set per-container
    # Only net.ipv4.* sysctls that are network-namespace aware work here
    sysctls:
      - net.ipv4.tcp_fin_timeout=15          # Faster cleanup of FIN_WAIT2 sockets
      - net.ipv4.ip_local_port_range=1024 65535  # More ephemeral ports for upstream
    ulimits:
      nofile:
        soft: 65535
        hard: 65535
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost/health"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 1G
        reservations:
          cpus: '2'
          memory: 512M

  # ──────────────────────────────────────────────────────────────────────
  # ContextForge - the main API server for the MCP stack
  # ──────────────────────────────────────────────────────────────────────
  gateway:
    image: ${IMAGE_LOCAL:-mcpgateway/mcpgateway:latest} # Use the local latest image. Run `make docker-prod` to build it.
    #image: ghcr.io/ibm/mcp-context-forge:1.0.0-BETA-2 # Use the release ContextForge image
    #image: ghcr.io/ibm/mcp-context-forge:0.7.0 # Testing migration from 0.7.0
    build:
      context: .
      dockerfile: Containerfile.lite     # Same one the Makefile builds
    restart: unless-stopped
    # NOTE: When using replicas > 1, access via nginx:8080 instead of direct port 4444
    # ports:
    #   - "4444:4444"               # Disabled for multi-replica mode
    networks: [mcpnet]

    # ──────────────────────────────────────────────────────────────────────
    # Environment - pick ONE database URL line, comment the rest
    # ──────────────────────────────────────────────────────────────────────
    environment:
      # ═══════════════════════════════════════════════════════════════════════════
      # HTTP Server Selection: gunicorn vs granian
      # ═══════════════════════════════════════════════════════════════════════════
      # Performance comparison (2500 concurrent users, PostgreSQL backend):
      #   Gunicorn: ~2.7GB RAM, ~740% CPU, no backpressure (queues unbounded)
      #   Granian:  ~4.0GB RAM, ~680% CPU, native backpressure (rejects excess with 503)
      #
      # Choose Gunicorn for: memory-constrained environments (32% less RAM)
      # Choose Granian for:  load spike protection, bursty traffic (graceful degradation)
      # Both achieve same RPS when database is the bottleneck.
      # ═══════════════════════════════════════════════════════════════════════════
      - HTTP_SERVER=granian    # Rust-based, native backpressure, +47% memory, -8% CPU
      # - HTTP_SERVER=gunicorn # Python-based, battle-tested, lower memory usage
      - HOST=0.0.0.0
      - PORT=4444
      # Transport: sse, streamablehttp, http, or all (default: all)
      - TRANSPORT_TYPE=streamablehttp
      # Prometheus metrics endpoint (disabled by default; requires JWT auth when enabled)
      - ENABLE_METRICS=true
      # Database connection: Via PgBouncer (default) or direct PostgreSQL
      # PgBouncer provides connection pooling for better performance under high concurrency
      - DATABASE_URL=postgresql+psycopg://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@pgbouncer:6432/mcp
      # Direct PostgreSQL connection (bypass PgBouncer - increase DB_POOL_SIZE if using):
      # - DATABASE_URL=postgresql+psycopg://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp
      # SQLAlchemy query logging (useful for N+1 detection; noisy under load)
      # NOTE: SQLALCHEMY_ECHO logs at INFO; set LOG_LEVEL=INFO/DEBUG to see output.
      - SQLALCHEMY_ECHO=true
      - CACHE_TYPE=redis # backend for caching (memory, redis, database, or none)
      - REDIS_URL=redis://redis:6379/0
      # Redis parser: hiredis (C extension ~83x faster for large responses)
      - REDIS_PARSER=hiredis
      # Redis connection pool tuning for load testing (32 workers × 150 = 4800 < 5000 maxclients)
      - REDIS_MAX_CONNECTIONS=150
      - REDIS_SOCKET_TIMEOUT=5.0
      - REDIS_SOCKET_CONNECT_TIMEOUT=5.0
      - REDIS_HEALTH_CHECK_INTERVAL=30
      # ═══════════════════════════════════════════════════════════════════════════
      # Redis Startup Resilience (prevents crash-loop on Redis outage)
      # ═══════════════════════════════════════════════════════════════════════════
      # With exponential backoff: 2s, 4s, 8s, 16s, 30s (capped), 30s...
      # 30 retries = ~5 minutes total wait before worker gives up
      - REDIS_MAX_RETRIES=30             # Max attempts before worker exits (default: 30)
      - REDIS_RETRY_INTERVAL_MS=2000     # Base interval, grows exponentially with jitter
      - REDIS_MAX_BACKOFF_SECONDS=30     # Max backoff cap (jitter ±25% applied after)
      # Auth Cache Configuration (reduces DB queries per auth request from 3-4 to 0-1)
      - AUTH_CACHE_ENABLED=true
      - AUTH_CACHE_USER_TTL=300
      - AUTH_CACHE_REVOCATION_TTL=120
      - AUTH_CACHE_TEAM_TTL=300
      - AUTH_CACHE_ROLE_TTL=300
      - AUTH_CACHE_BATCH_QUERIES=true
      - AUTH_CACHE_TEAMS_TTL=300
      # Registry Cache Configuration (reduces DB queries for list endpoints)
      - REGISTRY_CACHE_ENABLED=true
      - REGISTRY_CACHE_TOOLS_TTL=300
      - REGISTRY_CACHE_PROMPTS_TTL=300
      - REGISTRY_CACHE_RESOURCES_TTL=300
      - REGISTRY_CACHE_AGENTS_TTL=300
      - REGISTRY_CACHE_SERVERS_TTL=300
      - REGISTRY_CACHE_GATEWAYS_TTL=300
      - REGISTRY_CACHE_CATALOG_TTL=300
      # Admin Stats Cache Configuration (reduces aggregate queries for dashboard)
      - ADMIN_STATS_CACHE_ENABLED=true
      - ADMIN_STATS_CACHE_SYSTEM_TTL=60
      - ADMIN_STATS_CACHE_OBSERVABILITY_TTL=30
      - ADMIN_STATS_CACHE_TAGS_TTL=120
      - ADMIN_STATS_CACHE_PLUGINS_TTL=120
      - ADMIN_STATS_CACHE_PERFORMANCE_TTL=60
      # Team member count cache (reduces N+1 queries)
      - TEAM_MEMBER_COUNT_CACHE_ENABLED=true
      - TEAM_MEMBER_COUNT_CACHE_TTL=300
      # Metrics aggregation cache (reduces full table scans, see #1906)
      - METRICS_CACHE_ENABLED=true
      - METRICS_CACHE_TTL_SECONDS=120
      # MCP Server Health Check
      # Interval in seconds between health checks (default: 300)
      - HEALTH_CHECK_INTERVAL=300
      # Timeout in seconds for each health check request (default: 5)
      - HEALTH_CHECK_TIMEOUT=5
      # Consecutive failures before marking gateway offline (default: 3)
      - UNHEALTHY_THRESHOLD=3
      # Gateway URL validation timeout in seconds (default: 5)
      - GATEWAY_VALIDATION_TIMEOUT=5
      # Max concurrent health checks per worker (default: 10)
      - MAX_CONCURRENT_HEALTH_CHECKS=10
      # JWT Configuration - Choose ONE approach:
      # Option 1: HMAC (Default - Simple deployments)
      - JWT_ALGORITHM=HS256
      - JWT_SECRET_KEY=my-test-key-but-now-longer-than-32-bytes
      # Option 2: RSA (Production - Asymmetric, uncomment and generate certs)
      # - JWT_ALGORITHM=RS256
      # - JWT_PUBLIC_KEY_PATH=/app/certs/jwt/public.pem
      # - JWT_PRIVATE_KEY_PATH=/app/certs/jwt/private.pem
      - JWT_AUDIENCE=mcpgateway-api
      - JWT_ISSUER=mcpgateway
      - EMAIL_AUTH_ENABLED=true
      - PROTECT_ALL_ADMINS=${PROTECT_ALL_ADMINS:-true}
      - PLATFORM_ADMIN_EMAIL=admin@example.com
      - PLATFORM_ADMIN_PASSWORD=changeme
      - REQUIRE_TOKEN_EXPIRATION=false
      - MCPGATEWAY_UI_ENABLED=true
      - MCPGATEWAY_ADMIN_API_ENABLED=true
      # Security configuration (using defaults)
      - ENVIRONMENT=development
      - SECURITY_HEADERS_ENABLED=true
      - CORS_ALLOW_CREDENTIALS=true
      - SECURE_COOKIES=false
      ## Uncomment to enable HTTPS
      # - SSL=true
      # - CERT_FILE=/app/certs/cert.pem
      # - KEY_FILE=/app/certs/key.pem
      # - KEY_FILE_PASSWORD=${KEY_FILE_PASSWORD}  # Optional: Set in .env for passphrase-protected keys
      # Uncomment to enable plugins
      - PLUGINS_ENABLED=false
      # Uncomment to enable catalog
      - MCPGATEWAY_CATALOG_ENABLED=true
      - MCPGATEWAY_CATALOG_FILE=/app/mcp-catalog.yml
      # Authentication configuration
      - AUTH_REQUIRED=true
      - MCP_CLIENT_AUTH_ENABLED=true
      - TRUST_PROXY_AUTH=false
      # Logging configuration
      # NOTE: LOG_LEVEL=INFO/DEBUG is required for SQLALCHEMY_ECHO output.
      - LOG_LEVEL=INFO  # Required for SQLALCHEMY_ECHO output during load testing
      - DISABLE_ACCESS_LOG=true  # Disable uvicorn access logs for performance (massive I/O overhead)
      # Template auto-reload disabled for performance (prevents re-parsing templates on each request)
      - TEMPLATES_AUTO_RELOAD=false
      - STRUCTURED_LOGGING_DATABASE_ENABLED=false  # Disable DB logging for performance (use true only for debugging)
      # Audit trail logging - disabled by default for performance
      # WARNING: Causes a DB write on EVERY API request - can generate millions of rows during load testing!
      - AUDIT_TRAIL_ENABLED=false  # Set to true for compliance requirements (SOC2, HIPAA, etc.)
      # Security event logging - disabled by default for performance
      # WARNING: "all" level logs every request and causes massive DB write load
      - SECURITY_LOGGING_ENABLED=false  # Set to true to enable security event logging
      - SECURITY_LOGGING_LEVEL=failures_only  # Options: all, failures_only, high_severity
      # Performance optimizations - disable CPU-intensive middlewares
      # NOTE: Keep compression enabled when running without nginx that already has compression
      # Disabling causes throughput drop due to larger payloads
      - COMPRESSION_ENABLED=false
      # Disable optional middlewares for maximum throughput
      - VALIDATION_MIDDLEWARE_ENABLED=true
      - JSON_SCHEMA_VALIDATION_STRICT=true
      - CORRELATION_ID_ENABLED=false
      - OBSERVABILITY_ENABLED=false
      # ═══════════════════════════════════════════════════════════════════════════
      # Database Connection Pool Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Pool class options:
      #   - "null": NullPool - no application pooling, PgBouncer handles all pooling (recommended)
      #   - "queue": QueuePool - application-side pooling (use with direct PostgreSQL)
      #   - "auto": Automatic - NullPool if PgBouncer detected in URL, else QueuePool
      #
      # WITH PgBouncer (default in docker-compose):
      # Option A: NullPool - safest, eliminates stale connection errors, ~10% slower
      # - DB_POOL_CLASS=null
      # Option B: QueuePool + pre_ping - better performance, validates before use
      - DB_POOL_CLASS=queue
      - DB_POOL_PRE_PING=true          # Validate connections before use (SELECT 1)
      - DB_POOL_SIZE=20                # Pool size per worker
      - DB_MAX_OVERFLOW=10             # Extra connections under load
      - DB_POOL_TIMEOUT=60             # Time to wait for connection before failing
      - DB_POOL_RECYCLE=60             # Recycle before PgBouncer CLIENT_IDLE_TIMEOUT (half of 120s)
      # ═══════════════════════════════════════════════════════════════════════════
      # Database Startup Resilience (prevents crash-loop on DB outage)
      # ═══════════════════════════════════════════════════════════════════════════
      # With exponential backoff: 2s, 4s, 8s, 16s, 30s (capped), 30s...
      # 30 retries = ~5 minutes total wait before worker gives up
      - DB_MAX_RETRIES=30              # Max attempts before worker exits (default: 30)
      - DB_RETRY_INTERVAL_MS=2000      # Base interval, grows exponentially with jitter
      - DB_MAX_BACKOFF_SECONDS=30      # Max backoff cap (jitter ±25% applied after)
      # Tool configuration for high-concurrency load testing
      - TOOL_TIMEOUT=60               # Seconds before tool invocation times out
      - MAX_TOOL_RETRIES=3            # Retry attempts for failed tool invocations
      - TOOL_RATE_LIMIT=60000         # Max tool invocations per minute
      - TOOL_CONCURRENT_LIMIT=1000    # Max concurrent tool invocations
      - FEDERATION_TIMEOUT=30
      # ═══════════════════════════════════════════════════════════════════════════
      # HTTPX Client Connection Pool Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Shared HTTP client for all outbound requests (federation, health checks,
      # A2A, SSO, catalog). Provides ~20x better performance than per-request clients.
      - HTTPX_MAX_CONNECTIONS=200           # Total connections in pool (default: 200)
      - HTTPX_MAX_KEEPALIVE_CONNECTIONS=100 # Keepalive connections (default: 100)
      - HTTPX_KEEPALIVE_EXPIRY=30.0         # Idle connection expiry (seconds)
      - HTTPX_CONNECT_TIMEOUT=5.0           # TCP connection timeout (seconds)
      - HTTPX_READ_TIMEOUT=120.0            # Response read timeout (seconds, high for slow tools)
      - HTTPX_WRITE_TIMEOUT=30.0            # Request write timeout (seconds)
      - HTTPX_POOL_TIMEOUT=10.0             # Wait for available connection (seconds)
      - HTTPX_HTTP2_ENABLED=false           # HTTP/2 support (requires server support)
      - HTTPX_ADMIN_READ_TIMEOUT=30.0       # Admin UI/health check timeout (seconds)
      # Worker and server tuning for high-concurrency load testing
      - GUNICORN_WORKERS=16
      # ═══════════════════════════════════════════════════════════════════════════
      # Granian Backpressure Configuration (used when HTTP_SERVER=granian)
      # ═══════════════════════════════════════════════════════════════════════════
      # Backpressure provides overload protection by rejecting excess requests with
      # immediate 503 responses instead of queuing them (which can cause OOM/timeouts).
      # Total capacity = GRANIAN_WORKERS × GRANIAN_BACKPRESSURE = 16 × 128 = 2048 concurrent
      # Requests beyond this limit receive immediate 503 (no queuing, no OOM)
      - GRANIAN_WORKERS=16
      - GRANIAN_BACKLOG=4096
      - GRANIAN_BACKPRESSURE=128
      - GRANIAN_HTTP1_BUFFER_SIZE=524288
      - GRANIAN_RESPAWN_FAILED=true
      # HTTP/2: Granian supports native HTTP/2 multiplexing, but not useful here because:
      # - nginx sits in front and downgrades to HTTP/1.1 for upstream connections
      # - nginx open-source doesn't support HTTP/2 to backends (only nginx Plus does)
      # - Internal Docker network is fast enough that HTTP/2 gains are negligible
      # To use HTTP/2, either bypass nginx or use Granian with TLS directly.
      # - GRANIAN_HTTP=2

      # ═══════════════════════════════════════════════════════════════════════════
      # MCP Session Pool Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Session pooling for MCP ClientSessions reduces per-request overhead from
      # ~20ms to ~1-2ms (10-20x improvement). Sessions are isolated per user/tenant
      # via identity hashing to prevent cross-user session sharing.
      - MCP_SESSION_POOL_ENABLED=true              # Enable session pooling (default: false, enabled for docker-compose)
      - MCP_SESSION_POOL_MAX_PER_KEY=50            # Max sessions per (URL, identity, transport)
      - MCP_SESSION_POOL_TTL=300.0                 # Session TTL in seconds (default: 300)
      - MCP_SESSION_POOL_HEALTH_CHECK_INTERVAL=60.0  # Idle time before health check (default: 60)
      - MCP_SESSION_POOL_ACQUIRE_TIMEOUT=60.0     # Timeout waiting for session slot (default: 30)
      - MCP_SESSION_POOL_CREATE_TIMEOUT=30.0       # Timeout creating new session (default: 30)
      - MCP_SESSION_POOL_CIRCUIT_BREAKER_THRESHOLD=5  # Failures before circuit opens
      - MCP_SESSION_POOL_CIRCUIT_BREAKER_RESET=60.0   # Seconds before circuit resets
      - MCP_SESSION_POOL_IDLE_EVICTION=600.0      # Evict idle pool keys after (default: 600)
      - MCP_SESSION_POOL_TRANSPORT_TIMEOUT=30.0   # Timeout for all HTTP operations (default: 30)
      - MCP_SESSION_POOL_EXPLICIT_HEALTH_RPC=false  # Force RPC on health checks (default: false)
      # Configurable health check chain - ordered list of methods to try (JSON array)
      # Options: ping, list_tools, list_prompts, list_resources, skip
      - MCP_SESSION_POOL_HEALTH_CHECK_METHODS=["ping", "skip"]  # Try ping, skip if unsupported
      - MCP_SESSION_POOL_HEALTH_CHECK_TIMEOUT=5.0               # Timeout per health check attempt

      # ═══════════════════════════════════════════════════════════════════════════
      # Execution Metrics Recording
      # ═══════════════════════════════════════════════════════════════════════════
      # Controls tool/resource/prompt/server/A2A execution metrics (one DB row per operation).
      # Disable when using external observability to improve performance.
      # Set to true if you need per-operation metrics in the database.
      # Note: Does NOT affect log aggregation (METRICS_AGGREGATION_ENABLED) or Prometheus.
      - DB_METRICS_RECORDING_ENABLED=false

      # ═══════════════════════════════════════════════════════════════════════════
      # Metrics Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Raw metrics are deleted after hourly rollups exist (default: 1 hour retention).
      # Rollups preserve all analytics (counts, p50/p95/p99) for 365 days.
      #
      # If using external observability (ELK, Datadog, Splunk), raw metrics are
      # redundant - your external platform handles debugging and audit trails.
      #
      # Configurable settings (uncomment to override defaults):
      # - METRICS_DELETE_RAW_AFTER_ROLLUP=true      # Delete raw after rollup (default)
      # - METRICS_DELETE_RAW_AFTER_ROLLUP_HOURS=1    # Raw retention when rollup exists
      # - METRICS_CLEANUP_INTERVAL_HOURS=1          # Cleanup frequency (default: hourly)
      # - METRICS_RETENTION_DAYS=7                  # Fallback retention (rollup disabled)
      #
      # For debugging without external observability, increase raw retention:
      # - METRICS_DELETE_RAW_AFTER_ROLLUP_HOURS=168  # Keep raw data 7 days

      # Phoenix Observability Integration (uncomment when using Phoenix)
      # - PHOENIX_ENDPOINT=${PHOENIX_ENDPOINT:-http://phoenix:6006}
      # - OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:-http://phoenix:4317}
      # - OTEL_SERVICE_NAME=${OTEL_SERVICE_NAME:-mcp-gateway}
      # - OTEL_TRACES_EXPORTER=${OTEL_TRACES_EXPORTER:-otlp}
      # - OTEL_METRICS_EXPORTER=${OTEL_METRICS_EXPORTER:-otlp}
      # - OTEL_RESOURCE_ATTRIBUTES=${OTEL_RESOURCE_ATTRIBUTES:-deployment.environment=docker,service.namespace=mcp}

    # TCP kernel tuning for high-concurrency MCP tool invocations
    # Each tool call creates a new connection → many TIME_WAIT sockets
    sysctls:
      - net.ipv4.tcp_fin_timeout=15          # Faster cleanup of FIN_WAIT2 sockets (default: 60)
      - net.ipv4.ip_local_port_range=1024 65535  # More ephemeral ports (default: 32768-60999)
    ulimits:
      nofile:
        soft: 65535
        hard: 65535

    depends_on:          # Default stack: PgBouncer + Redis (PgBouncer depends on Postgres)
      pgbouncer:
        condition: service_healthy   # ▶ wait for connection pooler
      redis:
        condition: service_started
      # Direct PostgreSQL (uncomment if bypassing PgBouncer):
      # postgres:
      #   condition: service_healthy
      # migration:
      #   condition: service_completed_successfully

    healthcheck:
      ## Uncomment for HTTP healthcheck
      test: ["CMD", "python3", "-c", "import urllib.request; import json; resp = urllib.request.urlopen('http://localhost:4444/health', timeout=5); data = json.loads(resp.read()); exit(0 if data.get('status') == 'healthy' else 1)"]
      ## Uncomment for HTTPS healthcheck
      # test: ["CMD", "curl", "-f", "https://localhost:4444/health"]
      ## Uncomment to skip SSL validation (self-signed certs)
      # test: ["CMD", "curl", "-fk", "https://localhost:4444/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s

    # Scaling options:
    # - Single instance: use port 4444 directly, replicas: 1
    # - Multi-instance: comment out ports, set replicas: 2+, access via nginx:8080

    # ──────────────────────────────────────────────────────────────────────
    # Server Engine Selection (Default: Granian - Rust-based HTTP server)
    # ──────────────────────────────────────────────────────────────────────
    # Default is Granian. For Gunicorn with Uvicorn workers:
    # command: ["./run-gunicorn.sh"]

    deploy:
      mode: replicated
      replicas: 3
      resources:
        limits:
          cpus: '8'
          memory: 8G
        reservations:
          cpus: '4'
          memory: 4G

    # ──────────────────────────────────────────────────────────────────────
    # Volume Mounts
    # ──────────────────────────────────────────────────────────────────────
    # Uncomment to mount catalog configuration and SSL certificates
    # volumes:
    #   - ./mcp-catalog.yml:/app/mcp-catalog.yml:ro # mount catalog configuration
    #   - ./certs:/app/mcpgateway/certs:ro   # mount certs folder read-only (includes both SSL and JWT keys)
    #
    # SSL/TLS Certificate Setup:
    # 1. Generate certificates:
    #    - Without passphrase: make certs
    #    - With passphrase: make certs-passphrase
    # 2. Uncomment the volumes mount above
    # 3. Set SSL environment variables
    # 4. If using passphrase-protected key, set KEY_FILE_PASSWORD in .env file
    #
    # For JWT asymmetric keys:
    # 1. Generate keys: make certs-jwt
    # 2. Uncomment volumes mount above
    # 3. Switch JWT_ALGORITHM to RS256 and uncomment JWT_*_KEY_PATH variables

###############################################################################
#  DATABASES - enable ONE of these blocks and adjust DATABASE_URL
###############################################################################

  postgres:
    image: postgres:18
    ulimits:
      nofile:
        soft: 8192
        hard: 8192
    ports:
      - "5433:5432"      # Expose for baseline load testing (5433 to avoid conflict with local postgres)
    # Performance tuning for high-load testing (3000 sustained users)
    # WITH PgBouncer (default): 800 connections provides headroom for 700 pool + system overhead
    # DIRECT connection mode: increase to 4000 for (3 replicas × 16 workers × 80 pool)
    command:
      - "postgres"
      - "-c"
      - "max_connections=800"     # Must exceed PgBouncer MAX_DB_CONNECTIONS (700) + overhead
      - "-c"
      - "shared_buffers=512MB"
      - "-c"
      - "work_mem=16MB"
      - "-c"
      - "effective_cache_size=1536MB"
      - "-c"
      - "maintenance_work_mem=128MB"
      - "-c"
      - "checkpoint_completion_target=0.9"
      - "-c"
      - "wal_buffers=16MB"
      - "-c"
      - "random_page_cost=1.1"
      - "-c"
      - "effective_io_concurrency=200"
      - "-c"
      - "max_worker_processes=8"           # Total background workers (must be >= max_parallel_workers)
      - "-c"
      - "max_parallel_workers_per_gather=4" # Max workers per query's parallel operation
      - "-c"
      - "max_parallel_workers=8"            # Total parallel workers available system-wide
      # === HIGH-CONCURRENCY TUNING (3000 users) ===
      # CRITICAL: idle_in_transaction_session_timeout prevents connection starvation
      # Application code now properly closes transactions via get_db() commit-on-success pattern
      # This timeout is a safety net for any edge cases
      - "-c"
      - "idle_in_transaction_session_timeout=300s"  # Kill stuck transactions after 300s (aligned with PgBouncer)
      - "-c"
      - "statement_timeout=120s"           # Kill runaway queries after 120s
      - "-c"
      - "synchronous_commit=off"           # Async WAL writes (2-10x faster commits)
      - "-c"
      - "commit_delay=100"                 # Batch commits within 100μs window
      # ═══════════════════════════════════════════════════════════════════════════
      # AUTOVACUUM TUNING - High-insert workloads (metrics tables)
      # ═══════════════════════════════════════════════════════════════════════════
      # High insert rates cause dead tuple accumulation. These settings help
      # PostgreSQL keep up with table bloat from metrics writes.
      # Uncomment if experiencing performance degradation under sustained load:
      # - "-c"
      # - "autovacuum_naptime=30s"              # Check more frequently (default: 60s)
      # - "-c"
      # - "autovacuum_vacuum_scale_factor=0.05" # Vacuum at 5% dead tuples (default: 0.2)
      # - "-c"
      # - "autovacuum_vacuum_cost_limit=1000"   # More vacuum work per cycle (default: 200)
      # === PG_STAT_STATEMENTS + AUTO_EXPLAIN ===
      # Query performance tracking and slow query plan logging
      # NOTE: Both extensions must be in the SAME shared_preload_libraries line!
      # After enabling, run in psql:
      #   CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
      #   SELECT * FROM pg_stat_statements ORDER BY total_exec_time DESC LIMIT 10;
      - "-c"
      - "shared_preload_libraries=pg_stat_statements,auto_explain"
      - "-c"
      - "pg_stat_statements.track=all"
      - "-c"
      - "pg_stat_statements.max=10000"
      - "-c"
      - "auto_explain.log_min_duration=1000"
      - "-c"
      - "auto_explain.log_analyze=on"
      # === ROLLBACK DEBUGGING (disabled for performance) ===
      - "-c"
      - "log_min_error_statement=error"
      - "-c"
      - "log_min_messages=warning"
      - "-c"
      - "log_error_verbosity=verbose"
      - "-c"
      - "log_line_prefix=%t [%p]: user=%u,db=%d,app=%a,client=%h "
      - "-c"
      - "log_lock_waits=on"
      - "-c"
      - "deadlock_timeout=1s"
      - "-c"
      - "log_temp_files=0"
      - "-c"
      - "log_checkpoints=on"
      - "-c"
      - "log_connections=on"
      - "-c"
      - "log_disconnections=on"
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=mysecretpassword
      - POSTGRES_DB=mcp
    volumes:
      # - pgdata:/var/lib/postgresql/data
      - pgdata:/var/lib/postgresql  # Enable for postgres 18+
    networks: [mcpnet]
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER"]
      interval: 30s
      timeout: 5s
      retries: 5
      start_period: 20s
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G
        reservations:
          cpus: '2'
          memory: 2G

  # ──────────────────────────────────────────────────────────────────────
  # PgBouncer - Connection Pooler for PostgreSQL
  # Reduces connection overhead, improves throughput under high concurrency.
  # Enable by switching gateway DATABASE_URL to use pgbouncer:6432 instead of postgres:5432
  # ──────────────────────────────────────────────────────────────────────
  pgbouncer:
    image: edoburu/pgbouncer:latest
    restart: unless-stopped
    networks: [mcpnet]
    ulimits:
      nofile:
        soft: 65536
        hard: 65536
    ports:
      - "6432:6432"    # PgBouncer port (optional external access)
    environment:
      # Connection to upstream PostgreSQL
      - DATABASE_URL=postgres://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp
      # PgBouncer listen port (default would be 5432, using 6432 to distinguish from PostgreSQL)
      - LISTEN_PORT=6432
      # Pool mode: transaction (recommended), session, or statement
      # transaction: connection returned after each transaction (best for web apps)
      - POOL_MODE=transaction
      # ═══════════════════════════════════════════════════════════════════════════
      # Connection Pool Tuning for 3000 Sustained Users
      # PgBouncer handles connection multiplexing - many app connections share fewer DB connections
      # ═══════════════════════════════════════════════════════════════════════════
      # Client-side limits (from gateway workers via SQLAlchemy)
      - MAX_CLIENT_CONN=5000           # Max app connections; must exceed (replicas × workers × pool)
      - DEFAULT_POOL_SIZE=600          # Shared DB connections; sized for ~70 concurrent tx × 8x headroom
      - MIN_POOL_SIZE=100              # Pre-warmed connections for instant response to load spikes
      - RESERVE_POOL_SIZE=150          # Emergency pool for burst traffic beyond DEFAULT_POOL_SIZE
      - RESERVE_POOL_TIMEOUT=2         # Seconds before tapping reserve pool
      # Server-side limits (to PostgreSQL)
      - MAX_DB_CONNECTIONS=700         # Max connections to PostgreSQL; must be < PG max_connections
      - MAX_USER_CONNECTIONS=700       # Per-user limit; typically equals MAX_DB_CONNECTIONS
      # Connection lifecycle
      - SERVER_LIFETIME=3600           # Recycle server connections after 1 hour (prevents stale state)
      - SERVER_IDLE_TIMEOUT=600        # Close unused server connections after 10 min
      # Timeout settings
      - QUERY_WAIT_TIMEOUT=60          # Max wait for available connection before failing request
      - CLIENT_IDLE_TIMEOUT=300        # Close idle client connections (aligned with pool_recycle)
      - SERVER_CONNECT_TIMEOUT=5       # Timeout for new connections to PostgreSQL
      # Transaction cleanup - critical for avoiding idle-in-transaction buildup
      # NOTE: In transaction pooling, session-level advisory locks (used by migrations)
      # can stick unless the reset query clears them; DISCARD ALL is safest.
      - SERVER_RESET_QUERY=DISCARD ALL # Reset connection state when returned to pool
      - SERVER_RESET_QUERY_ALWAYS=1    # Always run reset query even after clean transactions
      - IDLE_TRANSACTION_TIMEOUT=300   # Kill transactions idle > 300s (allows slow admin page rendering)
      # Authentication
      - AUTH_TYPE=scram-sha-256        # Match PostgreSQL auth method
    depends_on:
      postgres:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "pg_isready", "-h", "localhost", "-p", "6432"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 10s
    deploy:
      resources:
        limits:
          cpus: '1'
          memory: 256M
        reservations:
          cpus: '0.5'
          memory: 128M

  # migration:
  #   #image: ghcr.io/ibm/mcp-context-forge:0.7.0 # Testing migration from 0.7.0
  #   image: mcpgateway/mcpgateway:latest # Use the local latest image. Run `make docker-prod` to build it.
  #   build:
  #     context: .
  #     dockerfile: Containerfile
  #   environment:
  #     - DATABASE_URL=postgresql+psycopg://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp
  #   command: alembic -c mcpgateway/alembic.ini upgrade head
  #   depends_on:
  #     postgres:
  #       condition: service_healthy
  #   networks: [mcpnet]

###############################################################################
#  CACHE
###############################################################################
  redis:
    image: redis:latest
    ulimits:
      nofile:
        soft: 65536
        hard: 65536
    # Performance tuning for 1000+ RPS high-concurrency load testing
    command:
      - "redis-server"
      - "--maxmemory"
      - "1gb"
      - "--maxmemory-policy"
      - "allkeys-lru"
      - "--tcp-backlog"
      - "2048"
      - "--timeout"
      - "0"
      - "--tcp-keepalive"
      - "300"
      - "--maxclients"
      - "10000"
    ports:
      - "6379:6379"      # expose only if you want host access
    networks: [mcpnet]
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 2G
        reservations:
          cpus: '1'
          memory: 1G

###############################################################################
#  MONITORING STACK (enabled with --profile monitoring)
#  Usage: docker compose --profile monitoring up -d
#  Access: Grafana http://localhost:3000 (admin/changeme)
#          Prometheus http://localhost:9090
###############################################################################

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus PostgreSQL Exporter - Database metrics
  # Metrics: connections, query duration, locks, cache hit ratio
  # ──────────────────────────────────────────────────────────────────────
  postgres_exporter:
    image: quay.io/prometheuscommunity/postgres-exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9187:9187"    # http://localhost:9187/metrics
    environment:
      - DATA_SOURCE_NAME=postgresql://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp?sslmode=disable
      - PG_EXPORTER_AUTO_DISCOVER_DATABASES=true
    depends_on:
      postgres:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus Redis Exporter - Cache metrics
  # Metrics: memory, clients, commands/sec, keyspace stats
  # ──────────────────────────────────────────────────────────────────────
  redis_exporter:
    image: oliver006/redis_exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9121:9121"    # http://localhost:9121/metrics
    environment:
      - REDIS_ADDR=redis://redis:6379
    depends_on:
      redis:
        condition: service_started
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus PgBouncer Exporter - Connection pool metrics
  # Metrics: active/waiting clients, server connections, pool stats
  # ──────────────────────────────────────────────────────────────────────
  pgbouncer_exporter:
    image: prometheuscommunity/pgbouncer-exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9127:9127"    # http://localhost:9127/metrics
    environment:
      - PGBOUNCER_EXPORTER_CONNECTION_STRING=postgres://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@pgbouncer:6432/pgbouncer?sslmode=disable
    depends_on:
      pgbouncer:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus Nginx Exporter - Proxy metrics
  # Metrics: active connections, requests/sec, response codes
  # Requires stub_status enabled in nginx.conf (location /nginx_status)
  # ──────────────────────────────────────────────────────────────────────
  nginx_exporter:
    image: nginx/nginx-prometheus-exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9113:9113"    # http://localhost:9113/metrics
    command:
      - '-nginx.scrape-uri=http://nginx:80/nginx_status'
    depends_on:
      nginx:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # cAdvisor - Container metrics (CPU, memory, network, disk I/O)
  # Metrics: container_cpu_usage_seconds_total, container_memory_usage_bytes
  # Dashboard: Grafana ID 14282 (Docker and cAdvisor)
  # ──────────────────────────────────────────────────────────────────────
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "8085:8080"    # http://localhost:8085/metrics
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus - Metrics collection and storage
  # Scrapes: gateway, postgres, redis, nginx, cadvisor
  # Retention: 7 days (configurable via --storage.tsdb.retention.time)
  # ──────────────────────────────────────────────────────────────────────
  prometheus:
    image: prom/prometheus:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9090:9090"    # http://localhost:9090
    volumes:
      - ./infra/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheusdata:/prometheus
      - prometheus_token:/tokens:ro
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=7d'
      - '--web.enable-lifecycle'
    depends_on:
      postgres_exporter:
        condition: service_started
      redis_exporter:
        condition: service_started
      nginx_exporter:
        condition: service_started
      cadvisor:
        condition: service_started
      prometheus_token:
        condition: service_completed_successfully
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus scrape token - generates a JWT for Prometheus to
  # authenticate against the gateway's /metrics/prometheus endpoint
  # ──────────────────────────────────────────────────────────────────────
  prometheus_token:
    image: ${IMAGE_LOCAL:-mcpgateway/mcpgateway:latest}
    networks: [mcpnet]
    restart: "no"
    # Gateway image runs as non-root (uid 1001); named volumes are root-owned
    # by default, so writing /tokens/gateway.jwt requires root.
    user: "0"
    volumes:
      - prometheus_token:/tokens
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -eu
        TOKEN=$$(python3 -m mcpgateway.utils.create_jwt_token \
          --username prometheus@monitoring --exp 0 \
          --secret my-test-key-but-now-longer-than-32-bytes --algo HS256 2>/dev/null)
        printf "%s" "$$TOKEN" > /tokens/gateway.jwt
        echo "✅ Prometheus scrape token written to /tokens/gateway.jwt"
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Loki - Log aggregation system (like Prometheus, but for logs)
  # Query logs with LogQL in Grafana
  # ──────────────────────────────────────────────────────────────────────
  loki:
    image: grafana/loki:latest
    restart: unless-stopped
    networks: [mcpnet]
    user: "0"  # Run as root to avoid permission issues
    ports:
      - "3100:3100"    # http://localhost:3100/ready
    volumes:
      - ./infra/monitoring/loki/loki-config.yaml:/etc/loki/local-config.yaml:ro
      - lokidata:/loki
    command: -config.file=/etc/loki/local-config.yaml
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Promtail - Log collector for Loki
  # Collects logs from all containers via Docker socket
  # ──────────────────────────────────────────────────────────────────────
  promtail:
    image: grafana/promtail:latest
    restart: unless-stopped
    networks: [mcpnet]
    volumes:
      - ./infra/monitoring/loki/promtail-config.yaml:/etc/promtail/config.yaml:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    command: -config.file=/etc/promtail/config.yaml
    depends_on:
      - loki
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Grafana - Dashboard visualization
  # Default login: admin / changeme
  # Recommended dashboards:
  #   - Docker/cAdvisor: 14282
  #   - PostgreSQL: 9628
  #   - Redis: 763
  #   - Nginx: 12708
  # ──────────────────────────────────────────────────────────────────────
  grafana:
    image: grafana/grafana:latest
    restart: unless-stopped
    networks: [mcpnet]
    user: "0"  # Run as root to avoid permission issues with provisioning
    ports:
      - "3000:3000"    # http://localhost:3000
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=changeme
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafanadata:/var/lib/grafana
      - ./infra/monitoring/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
      - ./infra/monitoring/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
    depends_on:
      - prometheus
    profiles: ["monitoring"]

###############################################################################
#  OPTIONAL ADMIN TOOLS - handy web UIs for DB & cache (disabled by default)
###############################################################################
  pgadmin:              # 🔧 Postgres admin UI
    image: dpage/pgadmin4:9.11.0
    environment:
      - PGADMIN_DEFAULT_EMAIL=admin@example.com
      - PGADMIN_DEFAULT_PASSWORD=changeme
    ports:
      - "5050:80"      # http://localhost:5050
    volumes:
      - pgadmindata:/var/lib/pgadmin
    networks: [mcpnet]
    depends_on:
      postgres:
        condition: service_healthy

  # ──────────────────────────────────────────────────────────────────────
  # Redis Commander - a web-based Redis GUI
  # ──────────────────────────────────────────────────────────────────────
  redis_commander:       # 🔧 Redis key browser
    image: rediscommander/redis-commander:latest
    restart: unless-stopped
    networks: [mcpnet]
    depends_on:
      redis:
        condition: service_started
    ports:
      - "8081:8081"    # http://localhost:8081
    environment:
      - REDIS_HOSTS=local:redis:6379
      - HTTP_USER=admin
      - HTTP_PASSWORD=changeme

  # # ──────────────────────────────────────────────────────────────────────
  # # Redis Insight - a powerful Redis GUI (recently updated)
  # # ──────────────────────────────────────────────────────────────────────
  # redis_insight:                    # 🔧 Redis Insight GUI
  #   image: redis/redisinsight:latest
  #   container_name: redisinsight
  #   restart: unless-stopped
  #   networks: [mcpnet]
  #   ports:
  #     - "5540:5540"                 # Redis Insight UI (default 5540)
  #   depends_on:          # Default stack: Postgres + Redis
  #     redis:
  #       condition: service_started

  #   # ──────────────────────────────────────────────────────────────────────
  #   # Persist data (config, logs, history) between restarts
  #   # ──────────────────────────────────────────────────────────────────────
  #   # volumes:
  #   #   - ./redisinsight_data:/data
  #   volumes:
  #     - redisinsight_data:/data  # <- persist data in named volume

  #   # ──────────────────────────────────────────────────────────────────────
  #   # Preconfigure Redis connection(s) via env vars
  #   # ──────────────────────────────────────────────────────────────────────
  #   environment:
  #     # Single connection (omit "*" since only one):
  #     - RI_REDIS_HOST=redis         # <- your Redis hostname
  #     - RI_REDIS_PORT=6379          # <- your Redis port
  #     - RI_REDIS_USERNAME=default   # <- ACL/username (Redis 6+)
  #     #- RI_REDIS_PASSWORD=changeme  # <- Redis AUTH password
  #     #- RI_REDIS_TLS=true           # <- enable TLS

  #     # Optional: validate self-signed CA instead of trusting all:
  #     # - RI_REDIS_TLS_CA_PATH=/certs/selfsigned.crt
  #     # - RI_REDIS_TLS_CERT_PATH=/certs/client.crt
  #     # - RI_REDIS_TLS_KEY_PATH=/certs/client.key
  #     # - RI_REDIS_TLS=true           # (already set above)

  #     # ──────────────────────────────────────────────────────────────────
  #     # Core Redis Insight settings
  #     # ──────────────────────────────────────────────────────────────────
  #     - RI_APP_HOST=0.0.0.0          # <- listen on all interfaces
  #     - RI_APP_PORT=5540             # <- UI port (container-side)


###############################################################################
#  OPTIONAL MCP SERVERS - drop-in helpers the Gateway can call
###############################################################################

  ###############################################################################
  # Fast Time Server - High-performance time/timezone service for MCP
  # Note: This is an amd64-only image. On ARM platforms (Apple Silicon),
  # emulation may not work properly.
  ###############################################################################
  fast_time_server:
    image: ghcr.io/ibm/fast-time-server:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "8888:8080"    # Map host port 8888 to container port 8080
    # Use dual mode for both SSE (/sse) and Streamable HTTP (/http) endpoints
    command: ["-transport=dual", "-listen=0.0.0.0", "-port=8080", "-log-level=info"]

  ###############################################################################
  # Auto-registration service - registers fast_time_server with gateway
  ###############################################################################