Skip to content

Commit ada1d43

Browse files
authored
Merge pull request #724 from ashakirin/feature/observability
Feature/observability
2 parents ddd57c2 + c9d18f9 commit ada1d43

17 files changed

Lines changed: 11324 additions & 3 deletions

samples/spring-ai-agent/README.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,53 @@ docker run --name my-postgres \ ✘ 125
2222
## Run postgres and spring-ai-agent containers
2323
docker-compose up
2424

25+
## Observability Configuration
26+
27+
### Option A: Basic Monitoring (Default)
28+
**Includes:** Prometheus metrics + Grafana dashboards + AWS X-Ray tracing
29+
30+
1. Use the default `docker-compose.yaml` (the AWS OpenTelemetry agent is automatically included in the Docker image)
31+
32+
2. Build and start:
33+
```bash
34+
mvn compile jib:dockerBuild
35+
docker-compose up
36+
```
37+
38+
3. Access:
39+
- Grafana: http://localhost:3000 (admin/admin)
40+
- Prometheus: http://localhost:9090
41+
- AWS X-Ray Console: https://console.aws.amazon.com/xray/
42+
43+
### Option B: Open-Source Observability Stack (Optional)
44+
**Includes:** Prometheus + Grafana + Tempo traces + Loki logs
45+
46+
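1. Remove the AWS OpenTelemetry agent configuration from the Jib plugin in `pom.xml` (the `jvmFlags` section), then uncomment the tracing (Brave/Zipkin) and Loki appender dependencies in `pom.xml` and the `tracing`/`zipkin` settings in `application.yaml`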
47+
48+
2. Copy observability files:
49+
```bash
50+
cp docker-compose.yaml-zipkin docker-compose.yaml
51+
cp logback-spring.xml-loki src/main/resources/logback-spring.xml
52+
```
53+
54+
3. Rebuild application:
55+
```bash
56+
mvn compile jib:dockerBuild
57+
```
58+
59+
4. Start full stack:
60+
```bash
61+
docker-compose up
62+
```
63+
64+
5. Access:
65+
- Grafana: http://localhost:3000 (admin/admin)
66+
- Prometheus: http://localhost:9090
67+
- Tempo: http://localhost:3200
68+
- Loki: http://localhost:3100
69+
70+
6. In Grafana Explore:
71+
- **Metrics**: Select Prometheus data source
72+
- **Traces**: Select Tempo data source
73+
- **Logs**: Select Loki data source
74+

samples/spring-ai-agent/docker-compose.yaml

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,64 @@ services:
2020
- "8080:8080"
2121
environment:
2222
AWS_REGION: us-east-1
23-
AWS_ACCESS_KEY_ID: XXX
24-
AWS_SECRET_ACCESS_KEY: YYY
23+
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID}
24+
AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY}
2525
SPRING_DATASOURCE_URL: jdbc:postgresql://postgres:5432/ai-agent-db
2626
SPRING_DATASOURCE_USERNAME: chatuser
2727
SPRING_DATASOURCE_PASSWORD: chatpass
2828
networks:
2929
- spring-ai-network
3030

31+
prometheus:
32+
image: prom/prometheus:latest
33+
container_name: prometheus
34+
command:
35+
- --enable-feature=exemplar-storage
36+
- --web.enable-remote-write-receiver
37+
- --config.file=/etc/prometheus/prometheus.yml
38+
ports:
39+
- "9090:9090"
40+
volumes:
41+
- prometheus:/prometheus
42+
- ./thirdparty-configs/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
43+
networks:
44+
- spring-ai-network
45+
46+
grafana:
47+
image: grafana/grafana:latest
48+
container_name: grafana
49+
ports:
50+
- "3000:3000"
51+
environment:
52+
- GF_SECURITY_ADMIN_USER=admin
53+
- GF_SECURITY_ADMIN_PASSWORD=admin
54+
- GF_LOG_LEVEL=error
55+
volumes:
56+
- ./thirdparty-configs/grafana/grafana.ini:/etc/grafana/grafana.ini:ro
57+
- ./thirdparty-configs/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
58+
- ./thirdparty-configs/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
59+
- ./thirdparty-configs/grafana/provisioning/alerting:/etc/grafana/provisioning/alerting:ro
60+
depends_on:
61+
- prometheus
62+
networks:
63+
- spring-ai-network
64+
65+
aws-otel-collector:
66+
image: public.ecr.aws/aws-observability/aws-otel-collector:latest
67+
container_name: aws-otel-collector
68+
ports:
69+
- "2000:2000/udp"
70+
- "4318:4318"
71+
environment:
72+
AWS_REGION: us-east-1
73+
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID}
74+
AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY}
75+
networks:
76+
- spring-ai-network
77+
78+
volumes:
79+
prometheus:
80+
driver: local
81+
3182
networks:
3283
spring-ai-network:
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
services:
2+
postgres:
3+
image: pgvector/pgvector:pg16
4+
container_name: postgres-db
5+
environment:
6+
POSTGRES_DB: ai-agent-db
7+
POSTGRES_USER: chatuser
8+
POSTGRES_PASSWORD: chatpass
9+
ports:
10+
- "5432:5432"
11+
networks:
12+
- spring-ai-network
13+
14+
spring-ai-agent:
15+
image: spring-ai-agent:0.0.1-SNAPSHOT
16+
container_name: spring-ai-app
17+
depends_on:
18+
- postgres
19+
ports:
20+
- "8080:8080"
21+
environment:
22+
AWS_REGION: us-east-1
23+
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID}
24+
AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY}
25+
SPRING_DATASOURCE_URL: jdbc:postgresql://postgres:5432/ai-agent-db
26+
SPRING_DATASOURCE_USERNAME: chatuser
27+
SPRING_DATASOURCE_PASSWORD: chatpass
28+
networks:
29+
- spring-ai-network
30+
31+
prometheus:
32+
image: prom/prometheus:latest
33+
container_name: prometheus
34+
command:
35+
- --enable-feature=exemplar-storage
36+
- --web.enable-remote-write-receiver
37+
- --config.file=/etc/prometheus/prometheus.yml
38+
ports:
39+
- "9090:9090"
40+
volumes:
41+
- prometheus:/prometheus
42+
- ./thirdparty-configs/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
43+
networks:
44+
- spring-ai-network
45+
46+
grafana:
47+
image: grafana/grafana:latest
48+
container_name: grafana
49+
ports:
50+
- "3000:3000"
51+
environment:
52+
- GF_SECURITY_ADMIN_USER=admin
53+
- GF_SECURITY_ADMIN_PASSWORD=admin
54+
volumes:
55+
- ./thirdparty-configs/grafana/grafana.ini:/etc/grafana/grafana.ini:ro
56+
- ./thirdparty-configs/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
57+
- ./thirdparty-configs/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
58+
- ./thirdparty-configs/grafana/provisioning/alerting:/etc/grafana/provisioning/alerting:ro
59+
depends_on:
60+
- prometheus
61+
networks:
62+
- spring-ai-network
63+
64+
tempo-init:
65+
# Tempo runs as user 10001, and docker compose creates the volume as root.
66+
# As such, we need to chown the volume in order for Tempo to start correctly.
67+
# This should not be needed, but it is the official workaround recommended by the Tempo maintainers.
68+
# See: https://github.com/grafana/tempo/blob/a21001a72a5865bfcfc1b0d2dfa30160c5a26103/example/docker-compose/local/docker-compose.yaml
69+
# See: https://github.com/grafana/tempo/issues/1657
70+
image: &tempoImage grafana/tempo:2.5.0 # https://hub.docker.com/r/grafana/tempo/tags and https://github.com/grafana/tempo/releases
71+
user: root
72+
entrypoint:
73+
- "chown"
74+
- "10001:10001"
75+
- "/var/tempo"
76+
volumes:
77+
- tempo:/var/tempo
78+
79+
tempo:
80+
container_name: tempo
81+
image: *tempoImage
82+
command: ['-config.file=/etc/tempo.yml']
83+
depends_on: ['tempo-init']
84+
volumes:
85+
- tempo:/var/tempo
86+
- ./thirdparty-configs/tempo/tempo.yml:/etc/tempo.yml:ro
87+
ports:
88+
- "3200:3200" # tempo
89+
- "9411:9411" # zipkin
90+
networks:
91+
- spring-ai-network
92+
93+
loki:
94+
container_name: loki
95+
image: grafana/loki:3.0.1 # https://hub.docker.com/r/grafana/loki/tags and https://github.com/grafana/loki/releases
96+
command: ['-config.file=/etc/loki/local-config.yaml']
97+
ports:
98+
- "3100:3100"
99+
networks:
100+
- spring-ai-network
101+
102+
volumes:
103+
prometheus:
104+
driver: local
105+
tempo:
106+
driver: local
107+
108+
networks:
109+
spring-ai-network:

samples/spring-ai-agent/pom.xml

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
<postgresql.version>42.7.7</postgresql.version>
3333
<jib-maven-plugin.version>3.4.5</jib-maven-plugin.version>
3434
<testcontainers.version>1.21.1</testcontainers.version>
35+
<loki-logback-appender.version>1.5.2</loki-logback-appender.version>
3536
</properties>
3637
<dependencies>
3738
<dependency>
@@ -81,6 +82,36 @@
8182
<artifactId>liquibase-core</artifactId>
8283
</dependency>
8384

85+
<!-- Observability -->
86+
87+
<!-- Metrics-->
88+
<dependency>
89+
<groupId>org.springframework.boot</groupId>
90+
<artifactId>spring-boot-starter-actuator</artifactId>
91+
</dependency>
92+
<dependency>
93+
<groupId>io.micrometer</groupId>
94+
<artifactId>micrometer-registry-prometheus</artifactId>
95+
<scope>runtime</scope>
96+
</dependency>
97+
98+
<!-- Tracing -->
99+
<!-- <dependency>-->
100+
<!-- <groupId>io.micrometer</groupId>-->
101+
<!-- <artifactId>micrometer-tracing-bridge-brave</artifactId>-->
102+
<!-- </dependency>-->
103+
<!-- <dependency>-->
104+
<!-- <groupId>io.zipkin.reporter2</groupId>-->
105+
<!-- <artifactId>zipkin-reporter-brave</artifactId>-->
106+
<!-- </dependency>-->
107+
108+
<!-- Logging (Loki) -->
109+
<!-- <dependency>-->
110+
<!-- <groupId>com.github.loki4j</groupId>-->
111+
<!-- <artifactId>loki-logback-appender</artifactId>-->
112+
<!-- <version>${loki-logback-appender.version}</version>-->
113+
<!-- </dependency>-->
114+
84115
<!-- Test dependencies -->
85116
<dependency>
86117
<groupId>org.testcontainers</groupId>
@@ -124,10 +155,56 @@
124155
<groupId>org.springframework.boot</groupId>
125156
<artifactId>spring-boot-maven-plugin</artifactId>
126157
</plugin>
158+
<plugin>
159+
<groupId>org.apache.maven.plugins</groupId>
160+
<artifactId>maven-dependency-plugin</artifactId>
161+
<executions>
162+
<execution>
163+
<id>download-otel-agent</id>
164+
<phase>compile</phase>
165+
<goals>
166+
<goal>copy</goal>
167+
</goals>
168+
<configuration>
169+
<artifactItems>
170+
<artifactItem>
171+
<groupId>software.amazon.opentelemetry</groupId>
172+
<artifactId>aws-opentelemetry-agent</artifactId>
173+
<version>1.32.5</version>
174+
<type>jar</type>
175+
<outputDirectory>${project.build.directory}/agents</outputDirectory>
176+
<destFileName>aws-opentelemetry-agent.jar</destFileName>
177+
</artifactItem>
178+
</artifactItems>
179+
</configuration>
180+
</execution>
181+
</executions>
182+
</plugin>
127183
<plugin>
128184
<groupId>com.google.cloud.tools</groupId>
129185
<artifactId>jib-maven-plugin</artifactId>
130186
<version>${jib-maven-plugin.version}</version>
187+
<configuration>
188+
<container>
189+
<jvmFlags>
190+
<jvmFlag>-javaagent:/app/aws-opentelemetry-agent.jar</jvmFlag>
191+
<jvmFlag>-Dotel.traces.exporter=otlp</jvmFlag>
192+
<jvmFlag>-Dotel.exporter.otlp.endpoint=http://aws-otel-collector:4318</jvmFlag>
193+
<jvmFlag>-Dotel.exporter.otlp.protocol=http/protobuf</jvmFlag>
194+
<jvmFlag>-Dotel.service.name=springboot-ai-agent</jvmFlag>
195+
<jvmFlag>-Dotel.metrics.exporter=none</jvmFlag>
196+
<jvmFlag>-Dotel.logs.exporter=none</jvmFlag>
197+
</jvmFlags>
198+
</container>
199+
<extraDirectories>
200+
<paths>
201+
<path>
202+
<from>${project.build.directory}/agents</from>
203+
<into>/app</into>
204+
</path>
205+
</paths>
206+
</extraDirectories>
207+
</configuration>
131208
</plugin>
132209
</plugins>
133210
</build>

samples/spring-ai-agent/src/main/resources/application.yaml

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,36 @@ spring:
3131
dialect: org.hibernate.dialect.PostgreSQLDialect
3232

3333
liquibase:
34-
change-log: classpath:db/changelog/changelog-master.yaml
34+
change-log: classpath:db/changelog/changelog-master.yaml
35+
36+
management:
37+
endpoints:
38+
web:
39+
exposure:
40+
include: health, info, metrics, prometheus
41+
42+
# tracing:
43+
# sampling:
44+
# probability: 1.0
45+
#
46+
# zipkin:
47+
# tracing:
48+
# endpoint: http://tempo:9411/api/v2/spans
49+
50+
metrics:
51+
distribution:
52+
percentiles-histogram:
53+
spring:
54+
ai:
55+
chat:
56+
client: true
57+
gen_ai:
58+
client:
59+
operation: true
60+
db:
61+
vector:
62+
client:
63+
operation: true
64+
http:
65+
server:
66+
requests: true
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<configuration>
3+
<appender name="LOKI" class="com.github.loki4j.logback.Loki4jAppender">
4+
<http>
5+
<url>http://loki:3100/loki/api/v1/push</url>
6+
</http>
7+
<batchMaxItems>1000</batchMaxItems>
8+
<batchTimeoutMs>10000</batchTimeoutMs>
9+
<format>
10+
<label>
11+
<pattern>app=ai-agent,host=${HOSTNAME},level=%level</pattern>
12+
</label>
13+
<message>
14+
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
15+
</message>
16+
</format>
17+
</appender>
18+
19+
<root level="WARN">
20+
<appender-ref ref="LOKI"/>
21+
</root>
22+
</configuration>

0 commit comments

Comments
 (0)