Comprehensive monitoring strategy for NexusCad using:
- OpenTelemetry - Distributed tracing and metrics
- Seq - Structured logging (already implemented)
- Prometheus - Metrics storage (optional)
- Grafana - Metrics visualization (optional)
<!-- src/NexusCad.Api/NexusCad.Api.csproj -->
<!-- OpenTelemetry SDK, exporters and hosting integration -->
<PackageReference Include="OpenTelemetry" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Exporter.Console" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Exporter.OpenTelemetryProtocol" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Extensions.Hosting" Version="1.7.0" />
<!-- Instrumentation libraries -->
<PackageReference Include="OpenTelemetry.Instrumentation.AspNetCore" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Instrumentation.Http" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Instrumentation.SqlClient" Version="1.7.0-beta.1" />
<!-- Required by the AddRuntimeInstrumentation() and AddProcessInstrumentation() calls in Program.cs. -->
<!-- NOTE(review): confirm both versions against the package feed before pinning. -->
<PackageReference Include="OpenTelemetry.Instrumentation.Runtime" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Instrumentation.Process" Version="0.5.0-beta.4" />
// Program.cs
using OpenTelemetry;
using OpenTelemetry.Resources;
using OpenTelemetry.Trace;
using OpenTelemetry.Metrics;
var builder = WebApplication.CreateBuilder(args);

// Resolve the OTLP endpoint once so that traces AND metrics are sent to the same
// collector. Falls back to the local collector's gRPC port when not configured.
var otlpEndpoint = new Uri(
    builder.Configuration["OpenTelemetry:Endpoint"] ?? "http://localhost:4317");

// OpenTelemetry tracing and metrics
builder.Services.AddOpenTelemetry()
    .ConfigureResource(resource => resource
        .AddService("NexusCad.Api")
        .AddAttributes(new Dictionary<string, object>
        {
            ["deployment.environment"] = builder.Environment.EnvironmentName,
            ["service.version"] = "1.0.0"
        }))
    .WithTracing(tracing =>
    {
        tracing
            .AddAspNetCoreInstrumentation(options =>
            {
                options.RecordException = true;
                // Don't trace health checks
                options.Filter = httpContext =>
                    !httpContext.Request.Path.StartsWithSegments("/health");
            })
            .AddHttpClientInstrumentation()
            // NOTE(review): the health checks in this guide target PostgreSQL (AddNpgSql);
            // confirm that SqlClient instrumentation matches the data provider actually in use.
            .AddSqlClientInstrumentation(options =>
            {
                // NOTE(review): this opts in to capturing SQL text on spans; confirm that is
                // acceptable given the "redact sensitive data in traces" guidance.
                options.SetDbStatementForText = true;
                options.RecordException = true;
            })
            .AddOtlpExporter(options => options.Endpoint = otlpEndpoint);

        // The console exporter is a local-development aid only.
        if (builder.Environment.IsDevelopment())
        {
            tracing.AddConsoleExporter();
        }
    })
    .WithMetrics(metrics =>
    {
        metrics
            .AddAspNetCoreInstrumentation()
            .AddHttpClientInstrumentation()
            .AddRuntimeInstrumentation()
            .AddProcessInstrumentation()
            // Must match the meter name used by ApiMetrics.
            .AddMeter("NexusCad.Api")
            .AddOtlpExporter(options => options.Endpoint = otlpEndpoint);

        // The console exporter is a local-development aid only.
        if (builder.Environment.IsDevelopment())
        {
            metrics.AddConsoleExporter();
        }
    });
// src/NexusCad.Api/Telemetry/ApiMetrics.cs
using System.Diagnostics.Metrics;
/// <summary>
/// Custom business metrics for the NexusCad API, published through a
/// <see cref="Meter"/> named <c>NexusCad.Api</c>.
/// </summary>
/// <remarks>
/// The meter name must match the name registered with <c>AddMeter</c> in the
/// OpenTelemetry metrics configuration, otherwise these instruments are not exported.
/// </remarks>
public class ApiMetrics : System.IDisposable
{
    /// <summary>Name of the meter that owns every instrument in this class.</summary>
    public const string MeterName = "NexusCad.Api";

    private readonly Meter _meter;
    private readonly Counter<long> _specificationsCreated;
    private readonly Counter<long> _generationJobsStarted;
    private readonly Counter<long> _generationJobsCompleted;
    private readonly Counter<long> _generationJobsFailed;
    private readonly Histogram<double> _generationDuration;

    /// <summary>Creates the meter and all of its instruments.</summary>
    public ApiMetrics()
    {
        _meter = new Meter(MeterName, "1.0.0");

        _specificationsCreated = _meter.CreateCounter<long>(
            "nexuscad.specifications.created",
            "specifications",
            "Number of specifications created");

        _generationJobsStarted = _meter.CreateCounter<long>(
            "nexuscad.jobs.started",
            "jobs",
            "Number of generation jobs started");

        _generationJobsCompleted = _meter.CreateCounter<long>(
            "nexuscad.jobs.completed",
            "jobs",
            "Number of generation jobs completed");

        // Backs the job-failure alert rule, which queries a failed-jobs series.
        _generationJobsFailed = _meter.CreateCounter<long>(
            "nexuscad.jobs.failed",
            "jobs",
            "Number of generation jobs failed");

        _generationDuration = _meter.CreateHistogram<double>(
            "nexuscad.generation.duration",
            "seconds",
            "Duration of generation jobs");
    }

    /// <summary>Counts one created specification.</summary>
    public void RecordSpecificationCreated() => _specificationsCreated.Add(1);

    /// <summary>Counts one started generation job.</summary>
    public void RecordGenerationStarted() => _generationJobsStarted.Add(1);

    /// <summary>Counts one completed generation job and records how long it took.</summary>
    /// <param name="durationSeconds">Job duration, in seconds.</param>
    public void RecordGenerationCompleted(double durationSeconds)
    {
        _generationJobsCompleted.Add(1);
        _generationDuration.Record(durationSeconds);
    }

    /// <summary>Counts one failed generation job.</summary>
    public void RecordGenerationFailed() => _generationJobsFailed.Add(1);

    /// <summary>Disposes the underlying meter; its instruments stop publishing afterwards.</summary>
    public void Dispose() => _meter.Dispose();
}
// Program.cs
// Fail fast with a descriptive message when a required setting is missing, instead of
// letting a null flow into the health-check registrations below.
var databaseConnectionString = builder.Configuration.GetConnectionString("DefaultConnection")
    ?? throw new InvalidOperationException(
        "Connection string 'DefaultConnection' is required for the database health check.");
var seqServerUrl = builder.Configuration["Seq:ServerUrl"]
    ?? throw new InvalidOperationException(
        "Configuration value 'Seq:ServerUrl' is required for the Seq health check.");

// Register the checks; tags are used below to select which checks each endpoint runs.
builder.Services.AddHealthChecks()
    .AddNpgSql(
        databaseConnectionString,
        name: "database",
        tags: new[] { "db", "postgresql" })
    .AddUrlGroup(
        new Uri(seqServerUrl),
        name: "seq",
        tags: new[] { "logging" })
    .AddCheck<SolidWorksWorkerHealthCheck>(
        "solidworks-worker",
        tags: new[] { "worker" });

var app = builder.Build();

// Full report: runs every registered check.
app.MapHealthChecks("/health", new HealthCheckOptions
{
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
});

// Readiness: only the checks tagged "db".
app.MapHealthChecks("/health/ready", new HealthCheckOptions
{
    Predicate = check => check.Tags.Contains("db"),
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
});

// Liveness: the predicate rejects every check, so this only proves the process responds.
app.MapHealthChecks("/health/live", new HealthCheckOptions
{
    Predicate = _ => false, // No checks, just alive
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
});
// src/NexusCad.Api/HealthChecks/SolidWorksWorkerHealthCheck.cs
/// <summary>
/// Health check that infers the state of the SolidWorks worker from the number of
/// recently failed jobs reported by the job repository.
/// </summary>
public class SolidWorksWorkerHealthCheck : IHealthCheck
{
    // Look-back window, in minutes, over which failed jobs are counted.
    private const int FailureWindowMinutes = 15;

    // More failures than this within the window marks the worker as degraded.
    private const int MaxTolerableFailures = 10;

    private readonly IJobRepository _jobRepository;

    /// <summary>Creates the check on top of the given job repository.</summary>
    public SolidWorksWorkerHealthCheck(IJobRepository jobRepository) =>
        _jobRepository = jobRepository;

    /// <summary>
    /// Reports Degraded when more than 10 jobs failed in the last 15 minutes,
    /// otherwise Healthy.
    /// </summary>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        var failedJobCount = await _jobRepository.GetRecentFailedJobsCount(
            TimeSpan.FromMinutes(FailureWindowMinutes));

        return failedJobCount > MaxTolerableFailures
            ? HealthCheckResult.Degraded(
                $"Worker may be unhealthy: {failedJobCount} failed jobs in last {FailureWindowMinutes} minutes")
            : HealthCheckResult.Healthy("Worker is operational");
    }
}
# docker-compose.yml (append)
# The service entries below belong under the top-level `services:` key of docker-compose.yml.
  otel-collector:
    image: otel/opentelemetry-collector-contrib:latest
    container_name: nexuscad-otel
    command: ["--config=/etc/otel-collector-config.yaml"]
    volumes:
      - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro
    ports:
      - "4317:4317" # OTLP gRPC receiver
      - "4318:4318" # OTLP HTTP receiver
      - "8888:8888" # Prometheus metrics
      - "8889:8889" # Prometheus exporter metrics
    networks:
      - nexuscad
    restart: unless-stopped

  # Optional: Prometheus
  prometheus:
    image: prom/prometheus:latest
    container_name: nexuscad-prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - nexuscad
    restart: unless-stopped

  # Optional: Grafana
  grafana:
    image: grafana/grafana:latest
    container_name: nexuscad-grafana
    environment:
      # Falls back to "admin" when GRAFANA_PASSWORD is unset; set it outside local development.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
    ports:
      - "3001:3000"
    networks:
      - nexuscad
    depends_on:
      - prometheus
    restart: unless-stopped

# Top-level named volumes used by the services above.
volumes:
  prometheus_data:
  grafana_data:
# otel-collector-config.yaml
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  batch:
    timeout: 10s
    send_batch_size: 1024
  memory_limiter:
    check_interval: 1s
    limit_mib: 512

exporters:
  # `debug` replaces the deprecated `logging` exporter, which recent
  # collector-contrib images no longer ship.
  debug:
    verbosity: normal
  prometheus:
    endpoint: "0.0.0.0:8889"
  # Optional: Export to Seq (defined here but not wired into a pipeline below).
  # The exporter appends /v1/traces or /v1/logs to this base URL.
  otlphttp:
    endpoint: http://seq:80/ingest/otlp

service:
  pipelines:
    traces:
      receivers: [otlp]
      # memory_limiter runs first so it can refuse data before batching.
      processors: [memory_limiter, batch]
      exporters: [debug]
    metrics:
      receivers: [otlp]
      processors: [memory_limiter, batch]
      exporters: [debug, prometheus]
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # NOTE(review): the API configuration in this guide exports metrics over OTLP only;
  # confirm the API really serves a Prometheus endpoint at /metrics, otherwise this
  # target will be reported as down.
  - job_name: 'nexuscad-api'
    static_configs:
      - targets: ['api:5140']
    metrics_path: '/metrics'

  # 8888 and 8889 are the two collector ports published in docker-compose.yml.
  - job_name: 'otel-collector'
    static_configs:
      - targets: ['otel-collector:8888', 'otel-collector:8889']
- Request rate (req/s)
- Response time (p50, p95, p99)
- Error rate (4xx, 5xx)
- Active connections
- CPU/Memory usage
- Specifications created/hour
- Generation jobs started/completed
- Average generation duration
- Job failure rate
- Document downloads
- Database connections
- Database query duration
- SolidWorks worker availability
- Disk usage (generated files)
- Network I/O
# alerts.yml
# NOTE(review): confirm the metric and label names below against the series Prometheus
# actually receives before relying on these rules.
groups:
  - name: nexuscad
    interval: 30s
    rules:
      # Fires when 5xx responses exceed 0.05 per second for 5 minutes.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
        for: 5m
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} errors/s"

      # Fires when the 95th percentile request latency stays above 2 seconds for 5 minutes.
      - alert: SlowRequests
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        annotations:
          summary: "95th percentile latency is high"
          description: "P95 latency is {{ $value }}s"

      # Fires when generation jobs fail at more than 0.1 per second for 5 minutes.
      - alert: JobFailureRate
        expr: rate(nexuscad_jobs_failed[15m]) > 0.1
        for: 5m
        annotations:
          summary: "High job failure rate"
          description: "{{ $value }} jobs failing per second"
- Request rate by endpoint
- Response time distribution
- Error rate timeline
- Active users
- CPU/Memory usage
- Specifications created (daily/weekly)
- Generation jobs timeline
- Average generation time
- Success/failure ratio
- Most popular projects
- Database connections
- Query performance
- Worker health
- Disk usage trends
- Network throughput
- Add OpenTelemetry packages
- Configure basic tracing
- Enhanced health checks
- Custom API metrics
- OpenTelemetry Collector setup
- Prometheus setup
- Grafana setup
- Import dashboard templates
- Configure data sources
- Create custom dashboards
- Define alert rules
- Configure notification channels
- Test alert triggers
- Document runbooks
- Traces: 7 days
- Metrics: 30 days (5m resolution), 90 days (1h resolution)
- Logs (Seq): 30 days
- Tracing overhead: ~1-2% CPU
- Metrics overhead: ~0.5% CPU
- Sampling: 100% for errors, 10% for success in production
- Protect Grafana with authentication
- Restrict Prometheus to internal network
- Secure OTLP endpoints
- Redact sensitive data in traces
# Request rate
rate(http_requests_total[5m])
# Error rate (fraction of all requests answered with a 5xx status).
# Both sides are aggregated with sum(): without it, each 5xx series is divided
# by itself and the ratio is always 1.
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))
# P95 latency
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# Specifications created in the last hour
increase(nexuscad_specifications_created[1h])
Last Updated: 15 May 2026