Skip to content

Latest commit

 

History

History
483 lines (393 loc) · 12 KB

File metadata and controls

483 lines (393 loc) · 12 KB

📊 NexusCad Monitoring & Observability Plan

Overview

Comprehensive monitoring strategy for NexusCad using:

  • OpenTelemetry - Distributed tracing and metrics
  • Seq - Structured logging (already implemented)
  • Prometheus - Metrics storage (optional)
  • Grafana - Metrics visualization (optional)

1. OpenTelemetry Integration

1.1 NuGet Packages Required

<!-- src/NexusCad.Api/NexusCad.Api.csproj -->
<!-- Core SDK, exporters and generic-host integration -->
<PackageReference Include="OpenTelemetry" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Exporter.Console" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Exporter.OpenTelemetryProtocol" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Extensions.Hosting" Version="1.7.0" />
<!-- Instrumentation libraries used by the tracing pipeline -->
<PackageReference Include="OpenTelemetry.Instrumentation.AspNetCore" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Instrumentation.Http" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Instrumentation.SqlClient" Version="1.7.0-beta.1" />
<!-- Required by AddRuntimeInstrumentation() and AddProcessInstrumentation() in the metrics pipeline -->
<PackageReference Include="OpenTelemetry.Instrumentation.Runtime" Version="1.7.0" />
<PackageReference Include="OpenTelemetry.Instrumentation.Process" Version="0.5.0-beta.4" />

1.2 Program.cs Configuration

using OpenTelemetry;
using OpenTelemetry.Resources;
using OpenTelemetry.Trace;
using OpenTelemetry.Metrics;

var builder = WebApplication.CreateBuilder(args);

// Resolve the OTLP endpoint once so traces and metrics are always sent to the
// same collector. Falls back to the local collector's gRPC port when
// "OpenTelemetry:Endpoint" is not configured.
var otlpEndpoint = new Uri(builder.Configuration["OpenTelemetry:Endpoint"]
    ?? "http://localhost:4317");

// Console exporters are a local debugging aid; keep them out of other environments.
var useConsoleExporter = builder.Environment.IsDevelopment();

// OpenTelemetry tracing and metrics
builder.Services.AddOpenTelemetry()
    .ConfigureResource(resource => resource
        .AddService("NexusCad.Api")
        .AddAttributes(new Dictionary<string, object>
        {
            ["deployment.environment"] = builder.Environment.EnvironmentName,
            ["service.version"] = "1.0.0"
        }))
    .WithTracing(tracing =>
    {
        tracing
            .AddAspNetCoreInstrumentation(options =>
            {
                options.RecordException = true;
                options.Filter = (httpContext) =>
                {
                    // Don't trace health checks
                    return !httpContext.Request.Path.StartsWithSegments("/health");
                };
            })
            .AddHttpClientInstrumentation()
            // NOTE(review): SetDbStatementForText records SQL text on spans; confirm
            // this is acceptable given the "redact sensitive data in traces" goal.
            // NOTE(review): the health checks register PostgreSQL (AddNpgSql); SqlClient
            // instrumentation presumably does not cover Npgsql commands - confirm the driver.
            .AddSqlClientInstrumentation(options =>
            {
                options.SetDbStatementForText = true;
                options.RecordException = true;
            })
            .AddOtlpExporter(options => options.Endpoint = otlpEndpoint);

        if (useConsoleExporter)
        {
            tracing.AddConsoleExporter(); // For local development
        }
    })
    .WithMetrics(metrics =>
    {
        metrics
            .AddAspNetCoreInstrumentation()
            .AddHttpClientInstrumentation()
            .AddRuntimeInstrumentation()
            .AddProcessInstrumentation()
            // Must match the Meter name used by the custom API metrics.
            .AddMeter("NexusCad.Api")
            // Use the same configured endpoint as tracing instead of the exporter default.
            .AddOtlpExporter(options => options.Endpoint = otlpEndpoint);

        if (useConsoleExporter)
        {
            metrics.AddConsoleExporter(); // For local development
        }
    });

1.3 Custom Metrics

// src/NexusCad.Api/Telemetry/ApiMetrics.cs
using System.Diagnostics.Metrics;

/// <summary>
/// Custom business metrics for the API, published on the "NexusCad.Api" meter.
/// </summary>
/// <remarks>
/// The instance owns its <see cref="Meter"/>; dispose it (or register it with a
/// container that disposes it) so the meter and its instruments are released.
/// </remarks>
public class ApiMetrics : IDisposable
{
    private readonly Meter _meter;
    private readonly Counter<long> _specificationsCreated;
    private readonly Counter<long> _generationJobsStarted;
    private readonly Counter<long> _generationJobsCompleted;
    private readonly Counter<long> _generationJobsFailed;
    private readonly Histogram<double> _generationDuration;

    /// <summary>Creates the meter and all instruments.</summary>
    public ApiMetrics()
    {
        _meter = new Meter("NexusCad.Api", "1.0.0");

        _specificationsCreated = _meter.CreateCounter<long>(
            "nexuscad.specifications.created",
            "specifications",
            "Number of specifications created");

        _generationJobsStarted = _meter.CreateCounter<long>(
            "nexuscad.jobs.started",
            "jobs",
            "Number of generation jobs started");

        _generationJobsCompleted = _meter.CreateCounter<long>(
            "nexuscad.jobs.completed",
            "jobs",
            "Number of generation jobs completed");

        // Failure counter: without it a job failure rate cannot be derived from these metrics.
        _generationJobsFailed = _meter.CreateCounter<long>(
            "nexuscad.jobs.failed",
            "jobs",
            "Number of generation jobs failed");

        _generationDuration = _meter.CreateHistogram<double>(
            "nexuscad.generation.duration",
            "seconds",
            "Duration of generation jobs");
    }

    /// <summary>Counts one created specification.</summary>
    public void RecordSpecificationCreated() => _specificationsCreated.Add(1);

    /// <summary>Counts one started generation job.</summary>
    public void RecordGenerationStarted() => _generationJobsStarted.Add(1);

    /// <summary>Counts one completed generation job and records its duration.</summary>
    /// <param name="durationSeconds">Job duration, in seconds.</param>
    public void RecordGenerationCompleted(double durationSeconds)
    {
        _generationJobsCompleted.Add(1);
        _generationDuration.Record(durationSeconds);
    }

    /// <summary>Counts one failed generation job.</summary>
    public void RecordGenerationFailed() => _generationJobsFailed.Add(1);

    /// <summary>Disposes the underlying meter.</summary>
    public void Dispose()
    {
        _meter.Dispose();
        GC.SuppressFinalize(this);
    }
}

2. Health Checks Enhancement

2.1 Detailed Health Checks

// Program.cs
// Fail fast with a descriptive message when required configuration is missing,
// instead of surfacing an opaque ArgumentNullException during startup.
var databaseConnectionString = builder.Configuration.GetConnectionString("DefaultConnection")
    ?? throw new InvalidOperationException(
        "Connection string 'DefaultConnection' is not configured.");
var seqServerUrl = builder.Configuration["Seq:ServerUrl"]
    ?? throw new InvalidOperationException(
        "Configuration value 'Seq:ServerUrl' is not configured.");

// Register one check per dependency; tags drive the endpoint predicates below.
builder.Services.AddHealthChecks()
    .AddNpgSql(
        databaseConnectionString,
        name: "database",
        tags: new[] { "db", "postgresql" })
    .AddUrlGroup(
        new Uri(seqServerUrl),
        name: "seq",
        tags: new[] { "logging" })
    .AddCheck<SolidWorksWorkerHealthCheck>(
        "solidworks-worker",
        tags: new[] { "worker" });

var app = builder.Build();

// Full report: runs every registered check.
app.MapHealthChecks("/health", new HealthCheckOptions
{
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
});

// Readiness: only the checks tagged "db".
app.MapHealthChecks("/health/ready", new HealthCheckOptions
{
    Predicate = check => check.Tags.Contains("db"),
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
});

// Liveness: the predicate rejects every check, so this only proves the process responds.
app.MapHealthChecks("/health/live", new HealthCheckOptions
{
    Predicate = _ => false, // No checks, just alive
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
});

2.2 Custom Health Check

// src/NexusCad.Api/HealthChecks/SolidWorksWorkerHealthCheck.cs
/// <summary>
/// Health check that infers the SolidWorks worker's state from the number of
/// recently failed jobs reported by the job repository.
/// </summary>
public class SolidWorksWorkerHealthCheck : IHealthCheck
{
    private readonly IJobRepository _jobRepository;

    /// <summary>Creates the check on top of the given job repository.</summary>
    public SolidWorksWorkerHealthCheck(IJobRepository jobRepository) =>
        _jobRepository = jobRepository;

    /// <summary>
    /// Reports Degraded when more than 10 jobs failed within the last 15 minutes,
    /// otherwise Healthy.
    /// </summary>
    /// <param name="context">Health check context (not read by this check).</param>
    /// <param name="cancellationToken">Cancellation token (not forwarded to the repository call).</param>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        var lookbackWindow = TimeSpan.FromMinutes(15);
        var failedJobCount = await _jobRepository.GetRecentFailedJobsCount(lookbackWindow);

        // Ten failures or fewer in the window is treated as normal operation.
        if (!(failedJobCount > 10))
        {
            return HealthCheckResult.Healthy("Worker is operational");
        }

        return HealthCheckResult.Degraded(
            $"Worker may be unhealthy: {failedJobCount} failed jobs in last 15 minutes");
    }
}

3. Docker Compose for Observability Stack

3.1 Add OpenTelemetry Collector

# docker-compose.yml (append)
  otel-collector:
    # NOTE(review): ":latest" is unpinned; a collector upgrade can reject an
    # older config file. Consider pinning a specific version.
    image: otel/opentelemetry-collector-contrib:latest
    container_name: nexuscad-otel
    command: ["--config=/etc/otel-collector-config.yaml"]
    volumes:
      - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro
    ports:
      - "4317:4317"   # OTLP gRPC receiver
      - "4318:4318"   # OTLP HTTP receiver
      - "8888:8888"   # Collector's own telemetry metrics
      - "8889:8889"   # Prometheus exporter (application metrics)
    networks:
      - nexuscad
    restart: unless-stopped

  # Optional: Prometheus
  prometheus:
    image: prom/prometheus:latest
    container_name: nexuscad-prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - nexuscad
    restart: unless-stopped

  # Optional: Grafana
  grafana:
    image: grafana/grafana:latest
    container_name: nexuscad-grafana
    environment:
      # No insecure fallback: compose aborts if GRAFANA_PASSWORD is unset or empty,
      # because Grafana is published on a host port.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:?GRAFANA_PASSWORD must be set}
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
    ports:
      - "3001:3000"   # host 3001 -> Grafana 3000
    networks:
      - nexuscad
    depends_on:
      - prometheus
    restart: unless-stopped

volumes:
  prometheus_data:
  grafana_data:

3.2 OpenTelemetry Collector Config

# otel-collector-config.yaml
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  batch:
    timeout: 10s
    send_batch_size: 1024

  # Listed first in each pipeline so data is refused before it is batched.
  memory_limiter:
    check_interval: 1s
    limit_mib: 512

exporters:
  # "debug" replaces the "logging" exporter, which current collector releases
  # no longer ship; "verbosity: normal" corresponds to the old "loglevel: info".
  debug:
    verbosity: normal

  prometheus:
    endpoint: "0.0.0.0:8889"

  # Optional: Export to Seq. Not referenced by any pipeline below; add
  # "otlphttp" to a pipeline's exporters list to enable it.
  # Seq's OTLP ingestion lives under /ingest/otlp (the exporter appends /v1/...).
  otlphttp:
    endpoint: http://seq:80/ingest/otlp

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, batch]
      exporters: [debug]

    metrics:
      receivers: [otlp]
      processors: [memory_limiter, batch]
      exporters: [debug, prometheus]

3.3 Prometheus Config

# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Load the alerting rules; alerts.yml must be mounted at this path in the
# Prometheus container, otherwise no rules are evaluated.
rule_files:
  - /etc/prometheus/alerts.yml

scrape_configs:
  # NOTE(review): this job expects the API itself to serve /metrics on port 5140.
  # The API setup in this plan exports metrics over OTLP only - confirm a
  # Prometheus endpoint exists, or rely on the otel-collector job below.
  - job_name: 'nexuscad-api'
    static_configs:
      - targets: ['api:5140']
    metrics_path: '/metrics'

  # 8888 = collector's own telemetry, 8889 = application metrics re-exported by the collector
  - job_name: 'otel-collector'
    static_configs:
      - targets: ['otel-collector:8888', 'otel-collector:8889']

4. Key Metrics to Monitor

4.1 API Metrics

  • Request rate (req/s)
  • Response time (p50, p95, p99)
  • Error rate (4xx, 5xx)
  • Active connections
  • CPU/Memory usage

4.2 Business Metrics

  • Specifications created/hour
  • Generation jobs started/completed
  • Average generation duration
  • Job failure rate
  • Document downloads

4.3 Infrastructure Metrics

  • Database connections
  • Database query duration
  • SolidWorks worker availability
  • Disk usage (generated files)
  • Network I/O

5. Alerting Rules (Prometheus)

# alerts.yml
# Metric names follow the OpenTelemetry -> Prometheus translation:
# dots become underscores, the unit is appended, and counters get "_total".
groups:
  - name: nexuscad
    interval: 30s
    rules:
      # 5xx responses per second, summed across all series.
      - alert: HighErrorRate
        expr: sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[5m])) > 0.05
        for: 5m
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} errors/s"

      # Aggregate buckets by "le" before taking the quantile.
      - alert: SlowRequests
        expr: histogram_quantile(0.95, sum by (le) (rate(http_server_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        annotations:
          summary: "95th percentile latency is high"
          description: "P95 latency is {{ $value }}s"

      # Requires the API to emit a "nexuscad.jobs.failed" counter.
      - alert: JobFailureRate
        expr: sum(rate(nexuscad_jobs_failed_total[15m])) > 0.1
        for: 5m
        annotations:
          summary: "High job failure rate"
          description: "{{ $value }} jobs failing per second"

6. Grafana Dashboards

6.1 API Performance Dashboard

  • Request rate by endpoint
  • Response time distribution
  • Error rate timeline
  • Active users
  • CPU/Memory usage

6.2 Business Metrics Dashboard

  • Specifications created (daily/weekly)
  • Generation jobs timeline
  • Average generation time
  • Success/failure ratio
  • Most popular projects

6.3 Infrastructure Dashboard

  • Database connections
  • Query performance
  • Worker health
  • Disk usage trends
  • Network throughput

7. Implementation Checklist

Phase 1: Basic Observability (Week 2)

  • Add OpenTelemetry packages
  • Configure basic tracing
  • Enhanced health checks
  • Custom API metrics
  • OpenTelemetry Collector setup

Phase 2: Metrics & Visualization (Week 2-3)

  • Prometheus setup
  • Grafana setup
  • Import dashboard templates
  • Configure data sources
  • Create custom dashboards

Phase 3: Alerting (Week 3)

  • Define alert rules
  • Configure notification channels
  • Test alert triggers
  • Document runbooks

8. Production Considerations

8.1 Data Retention

  • Traces: 7 days
  • Metrics: 30 days (5m resolution), 90 days (1h resolution)
  • Logs (Seq): 30 days

8.2 Performance Impact

  • Tracing overhead: ~1-2% CPU
  • Metrics overhead: ~0.5% CPU
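  • Sampling: 100% for errors, 10% for success in production (keeping every error trace requires tail-based sampling, e.g. the Collector's tail_sampling processor, because head-based sampling decides before the outcome is known)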

8.3 Security

  • Protect Grafana with authentication
  • Restrict Prometheus to internal network
  • Secure OTLP endpoints
  • Redact sensitive data in traces

9. Useful Queries

Prometheus PromQL

# Request rate (requests per second, all endpoints)
sum(rate(http_server_request_duration_seconds_count[5m]))

# Error rate (fraction of requests answered with 5xx)
sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[5m])) / sum(rate(http_server_request_duration_seconds_count[5m]))

# P95 latency (seconds)
histogram_quantile(0.95, sum by (le) (rate(http_server_request_duration_seconds_bucket[5m])))

# Specifications created in the last hour
sum(increase(nexuscad_specifications_created_total[1h]))

Resources


Last Updated: 15 May 2026